fixedpoint/fixedpoint_neon.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331

// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// fixedpoint_neon.h: optimized NEON specializations of the templates
// in fixedpoint.h.

#ifndef GEMMLOWP_INTERNAL_FIXEDPOINT_NEON_H_
#define GEMMLOWP_INTERNAL_FIXEDPOINT_NEON_H_

#include <arm_neon.h>

namespace gemmlowp {

// Raw-type traits for a NEON register made of four signed 32-bit lanes.
template <>
struct FixedPointRawTypeTraits<int32x4_t> {
  // Scalar type held by each lane of the vector.
  using ScalarRawType = std::int32_t;
  // Number of lanes in the vector.
  static const int kLanes = 4;
};

// Raw-type traits for a NEON register made of eight signed 16-bit lanes.
template <>
struct FixedPointRawTypeTraits<int16x8_t> {
  // Scalar type held by each lane of the vector.
  using ScalarRawType = std::int16_t;
  // Number of lanes in the vector.
  static const int kLanes = 8;
};

// NEON specialization of BitAnd: lane-wise bitwise AND of four int32 lanes.
template <>
inline int32x4_t BitAnd(int32x4_t a, int32x4_t b) {
  const int32x4_t conjunction = vandq_s32(a, b);
  return conjunction;
}

// NEON specialization of BitAnd: lane-wise bitwise AND of eight int16 lanes.
template <>
inline int16x8_t BitAnd(int16x8_t a, int16x8_t b) {
  const int16x8_t conjunction = vandq_s16(a, b);
  return conjunction;
}

// NEON specialization of BitOr: lane-wise bitwise OR of four int32 lanes.
template <>
inline int32x4_t BitOr(int32x4_t a, int32x4_t b) {
  const int32x4_t disjunction = vorrq_s32(a, b);
  return disjunction;
}

// NEON specialization of BitOr: lane-wise bitwise OR of eight int16 lanes.
template <>
inline int16x8_t BitOr(int16x8_t a, int16x8_t b) {
  const int16x8_t disjunction = vorrq_s16(a, b);
  return disjunction;
}

// NEON specialization of BitXor: lane-wise bitwise exclusive-OR of four
// int32 lanes.
template <>
inline int32x4_t BitXor(int32x4_t a, int32x4_t b) {
  const int32x4_t difference_bits = veorq_s32(a, b);
  return difference_bits;
}

// NEON specialization of BitXor: lane-wise bitwise exclusive-OR of eight
// int16 lanes.
template <>
inline int16x8_t BitXor(int16x8_t a, int16x8_t b) {
  const int16x8_t difference_bits = veorq_s16(a, b);
  return difference_bits;
}

// NEON specialization of BitNot: inverts every bit of each int32 lane.
template <>
inline int32x4_t BitNot(int32x4_t a) {
  // XOR against an all-ones vector flips every bit.
  const int32x4_t all_ones = vdupq_n_s32(-1);
  return veorq_s32(a, all_ones);
}

// NEON specialization of BitNot: inverts every bit of each int16 lane.
template <>
inline int16x8_t BitNot(int16x8_t a) {
  // XOR against an all-ones vector flips every bit.
  const int16x8_t all_ones = vdupq_n_s16(-1);
  return veorq_s16(a, all_ones);
}

// NEON specialization of Add: lane-wise (non-saturating) sum of four int32
// lanes.
template <>
inline int32x4_t Add(int32x4_t a, int32x4_t b) {
  const int32x4_t sum = vaddq_s32(a, b);
  return sum;
}

// NEON specialization of Add: lane-wise (non-saturating) sum of eight int16
// lanes.
template <>
inline int16x8_t Add(int16x8_t a, int16x8_t b) {
  const int16x8_t sum = vaddq_s16(a, b);
  return sum;
}

// NEON specialization of Sub: lane-wise (non-saturating) a - b on four int32
// lanes.
template <>
inline int32x4_t Sub(int32x4_t a, int32x4_t b) {
  const int32x4_t difference = vsubq_s32(a, b);
  return difference;
}

// NEON specialization of Sub: lane-wise (non-saturating) a - b on eight int16
// lanes.
template <>
inline int16x8_t Sub(int16x8_t a, int16x8_t b) {
  const int16x8_t difference = vsubq_s16(a, b);
  return difference;
}

// NEON specialization of Neg: lane-wise negation of four int32 lanes.
template <>
inline int32x4_t Neg(int32x4_t a) {
  const int32x4_t negated = vnegq_s32(a);
  return negated;
}

// NEON specialization of Neg: lane-wise negation of eight int16 lanes.
template <>
inline int16x8_t Neg(int16x8_t a) {
  const int16x8_t negated = vnegq_s16(a);
  return negated;
}

// NEON specialization of ShiftLeft: shifts each int32 lane left by `offset`
// bits.
template <>
inline int32x4_t ShiftLeft(int32x4_t a, int offset) {
  // vshlq takes a per-lane shift count, so broadcast the scalar offset first.
  const int32x4_t shift_counts = vdupq_n_s32(offset);
  return vshlq_s32(a, shift_counts);
}

// NEON specialization of ShiftLeft: shifts each int16 lane left by `offset`
// bits.
template <>
inline int16x8_t ShiftLeft(int16x8_t a, int offset) {
  // vshlq takes a per-lane shift count, so broadcast the scalar offset first.
  const int16x8_t shift_counts = vdupq_n_s16(offset);
  return vshlq_s16(a, shift_counts);
}

// NEON specialization of ShiftRight: arithmetic right shift of each int32
// lane by `offset` bits.
template <>
inline int32x4_t ShiftRight(int32x4_t a, int offset) {
  // A negative per-lane count turns the variable left shift into a right
  // shift.
  const int32x4_t negated_counts = vdupq_n_s32(-offset);
  return vshlq_s32(a, negated_counts);
}

// NEON specialization of ShiftRight: arithmetic right shift of each int16
// lane by `offset` bits.
template <>
inline int16x8_t ShiftRight(int16x8_t a, int offset) {
  // A negative per-lane count turns the variable left shift into a right
  // shift.
  const int16x8_t negated_counts = vdupq_n_s16(-offset);
  return vshlq_s16(a, negated_counts);
}

// NEON specialization of SelectUsingMask: for every bit set in `if_mask` the
// result takes the bit from `then_val`, otherwise from `else_val`.
template <>
inline int32x4_t SelectUsingMask(int32x4_t if_mask, int32x4_t then_val,
                                 int32x4_t else_val) {
  // vbslq wants an unsigned selector (uint32x4_t); the reinterpret keeps the
  // bit pattern unchanged.
  const auto selector = vreinterpretq_u32_s32(if_mask);
  return vbslq_s32(selector, then_val, else_val);
}

// NEON specialization of SelectUsingMask: for every bit set in `if_mask` the
// result takes the bit from `then_val`, otherwise from `else_val`.
template <>
inline int16x8_t SelectUsingMask(int16x8_t if_mask, int16x8_t then_val,
                                 int16x8_t else_val) {
  // vbslq wants an unsigned selector (uint16x8_t); the reinterpret keeps the
  // bit pattern unchanged.
  const auto selector = vreinterpretq_u16_s16(if_mask);
  return vbslq_s16(selector, then_val, else_val);
}

// NEON specialization of MaskIfEqual: each int32 lane becomes all-ones where
// a == b and zero elsewhere.
template <>
inline int32x4_t MaskIfEqual(int32x4_t a, int32x4_t b) {
  // The compare produces an unsigned mask (uint32x4_t); reinterpret it as the
  // signed raw type without touching the bits.
  const auto equal_lanes = vceqq_s32(a, b);
  return vreinterpretq_s32_u32(equal_lanes);
}

// NEON specialization of MaskIfEqual: each int16 lane becomes all-ones where
// a == b and zero elsewhere.
template <>
inline int16x8_t MaskIfEqual(int16x8_t a, int16x8_t b) {
  // The compare produces an unsigned mask (uint16x8_t); reinterpret it as the
  // signed raw type without touching the bits.
  const auto equal_lanes = vceqq_s16(a, b);
  return vreinterpretq_s16_u16(equal_lanes);
}

// NEON specialization of MaskIfNotEqual: the bitwise complement of the
// equality mask for four int32 lanes.
template <>
inline int32x4_t MaskIfNotEqual(int32x4_t a, int32x4_t b) {
  const int32x4_t equal_mask = MaskIfEqual(a, b);
  return BitNot(equal_mask);
}

// NEON specialization of MaskIfNotEqual: the bitwise complement of the
// equality mask for eight int16 lanes.
template <>
inline int16x8_t MaskIfNotEqual(int16x8_t a, int16x8_t b) {
  const int16x8_t equal_mask = MaskIfEqual(a, b);
  return BitNot(equal_mask);
}

// NEON specialization of MaskIfZero: equality mask of each int32 lane against
// zero.
template <>
inline int32x4_t MaskIfZero(int32x4_t a) {
  const int32x4_t zeros = vdupq_n_s32(0);
  return MaskIfEqual(a, zeros);
}

// NEON specialization of MaskIfZero: equality mask of each int16 lane against
// zero.
template <>
inline int16x8_t MaskIfZero(int16x8_t a) {
  const int16x8_t zeros = vdupq_n_s16(0);
  return MaskIfEqual(a, zeros);
}

// NEON specialization of MaskIfNonZero: each int32 lane becomes all-ones when
// the input lane has any bit set, zero otherwise.
template <>
inline int32x4_t MaskIfNonZero(int32x4_t a) {
  // vtstq tests (a & a) != 0 per lane, i.e. whether the lane is non-zero.
  const auto nonzero_lanes = vtstq_s32(a, a);
  return vreinterpretq_s32_u32(nonzero_lanes);
}

// NEON specialization of MaskIfNonZero: each int16 lane becomes all-ones when
// the input lane has any bit set, zero otherwise.
template <>
inline int16x8_t MaskIfNonZero(int16x8_t a) {
  // vtstq tests (a & a) != 0 per lane, i.e. whether the lane is non-zero.
  const auto nonzero_lanes = vtstq_s16(a, a);
  return vreinterpretq_s16_u16(nonzero_lanes);
}

// NEON specialization of MaskIfGreaterThan: all-ones in each int32 lane where
// a > b (signed compare), zero elsewhere.
template <>
inline int32x4_t MaskIfGreaterThan(int32x4_t a, int32x4_t b) {
  const auto greater_lanes = vcgtq_s32(a, b);  // unsigned mask
  return vreinterpretq_s32_u32(greater_lanes);
}

// NEON specialization of MaskIfGreaterThan: all-ones in each int16 lane where
// a > b (signed compare), zero elsewhere.
template <>
inline int16x8_t MaskIfGreaterThan(int16x8_t a, int16x8_t b) {
  const auto greater_lanes = vcgtq_s16(a, b);  // unsigned mask
  return vreinterpretq_s16_u16(greater_lanes);
}

// NEON specialization of MaskIfGreaterThanOrEqual: all-ones in each int32
// lane where a >= b (signed compare), zero elsewhere.
template <>
inline int32x4_t MaskIfGreaterThanOrEqual(int32x4_t a, int32x4_t b) {
  const auto at_least_lanes = vcgeq_s32(a, b);  // unsigned mask
  return vreinterpretq_s32_u32(at_least_lanes);
}

// NEON specialization of MaskIfGreaterThanOrEqual: all-ones in each int16
// lane where a >= b (signed compare), zero elsewhere.
template <>
inline int16x8_t MaskIfGreaterThanOrEqual(int16x8_t a, int16x8_t b) {
  const auto at_least_lanes = vcgeq_s16(a, b);  // unsigned mask
  return vreinterpretq_s16_u16(at_least_lanes);
}

// NEON specialization of MaskIfLessThan: all-ones in each int32 lane where
// a < b (signed compare), zero elsewhere.
template <>
inline int32x4_t MaskIfLessThan(int32x4_t a, int32x4_t b) {
  const auto less_lanes = vcltq_s32(a, b);  // unsigned mask
  return vreinterpretq_s32_u32(less_lanes);
}

// NEON specialization of MaskIfLessThan: all-ones in each int16 lane where
// a < b (signed compare), zero elsewhere.
template <>
inline int16x8_t MaskIfLessThan(int16x8_t a, int16x8_t b) {
  const auto less_lanes = vcltq_s16(a, b);  // unsigned mask
  return vreinterpretq_s16_u16(less_lanes);
}

// NEON specialization of MaskIfLessThanOrEqual: all-ones in each int32 lane
// where a <= b (signed compare), zero elsewhere.
template <>
inline int32x4_t MaskIfLessThanOrEqual(int32x4_t a, int32x4_t b) {
  const auto at_most_lanes = vcleq_s32(a, b);  // unsigned mask
  return vreinterpretq_s32_u32(at_most_lanes);
}

// NEON specialization of MaskIfLessThanOrEqual: all-ones in each int16 lane
// where a <= b (signed compare), zero elsewhere.
template <>
inline int16x8_t MaskIfLessThanOrEqual(int16x8_t a, int16x8_t b) {
  const auto at_most_lanes = vcleq_s16(a, b);  // unsigned mask
  return vreinterpretq_s16_u16(at_most_lanes);
}

// NEON specialization of All: returns true when the bitwise AND of the four
// int32 lanes is non-zero.
template <>
inline bool All(int32x4_t a) {
  // Rotate-and-AND twice (by 1 lane, then by 2 lanes) so that lane 0 ends up
  // holding a0 & a1 & a2 & a3.
  const int32x4_t pairwise = vandq_s32(a, vextq_s32(a, a, 1));
  const int32x4_t folded =
      vandq_s32(pairwise, vextq_s32(pairwise, pairwise, 2));
  return vgetq_lane_s32(folded, 0) != 0;
}

// NEON specialization of All: returns true when the bitwise AND of the eight
// int16 lanes is non-zero.
template <>
inline bool All(int16x8_t a) {
  // Three rotate-and-AND steps (by 1, 2 and 4 lanes) leave the AND of all
  // eight lanes in lane 0.
  const int16x8_t step1 = vandq_s16(a, vextq_s16(a, a, 1));
  const int16x8_t step2 = vandq_s16(step1, vextq_s16(step1, step1, 2));
  const int16x8_t step4 = vandq_s16(step2, vextq_s16(step2, step2, 4));
  return vgetq_lane_s16(step4, 0) != 0;
}

// NEON specialization of Any: returns true when at least one of the four
// int32 lanes has a bit set.
template <>
inline bool Any(int32x4_t a) {
  // Rotate-and-OR twice (by 1 lane, then by 2 lanes) so that lane 0 ends up
  // holding a0 | a1 | a2 | a3.
  const int32x4_t pairwise = vorrq_s32(a, vextq_s32(a, a, 1));
  const int32x4_t folded =
      vorrq_s32(pairwise, vextq_s32(pairwise, pairwise, 2));
  return vgetq_lane_s32(folded, 0) != 0;
}

// NEON specialization of Any: returns true when at least one of the eight
// int16 lanes has a bit set.
template <>
inline bool Any(int16x8_t a) {
  // Three rotate-and-OR steps (by 1, 2 and 4 lanes) leave the OR of all eight
  // lanes in lane 0.
  const int16x8_t step1 = vorrq_s16(a, vextq_s16(a, a, 1));
  const int16x8_t step2 = vorrq_s16(step1, vextq_s16(step1, step1, 2));
  const int16x8_t step4 = vorrq_s16(step2, vextq_s16(step2, step2, 4));
  return vgetq_lane_s16(step4, 0) != 0;
}

// NEON specialization of RoundingHalfSum: lane-wise rounding halving add of
// four int32 lanes, done by a single vrhaddq instruction.
template <>
inline int32x4_t RoundingHalfSum(int32x4_t a, int32x4_t b) {
  const int32x4_t half_sum = vrhaddq_s32(a, b);
  return half_sum;
}

// NEON specialization of RoundingHalfSum: lane-wise rounding halving add of
// eight int16 lanes, done by a single vrhaddq instruction.
template <>
inline int16x8_t RoundingHalfSum(int16x8_t a, int16x8_t b) {
  const int16x8_t half_sum = vrhaddq_s16(a, b);
  return half_sum;
}

// NEON specialization of SaturatingRoundingDoublingHighMul for four int32
// lanes; maps directly onto the vqrdmulhq instruction (saturating, rounding,
// doubling multiply returning the high half).
template <>
inline int32x4_t SaturatingRoundingDoublingHighMul(int32x4_t a, int32x4_t b) {
  const int32x4_t high_product = vqrdmulhq_s32(a, b);
  return high_product;
}

// NEON specialization of SaturatingRoundingDoublingHighMul for eight int16
// lanes; maps directly onto the vqrdmulhq instruction (saturating, rounding,
// doubling multiply returning the high half).
template <>
inline int16x8_t SaturatingRoundingDoublingHighMul(int16x8_t a, int16x8_t b) {
  const int16x8_t high_product = vqrdmulhq_s16(a, b);
  return high_product;
}

// NEON specialization of RoundingDivideByPOT: divides each int32 lane by
// 2^exponent using a rounding right shift.
template <>
inline int32x4_t RoundingDivideByPOT(int32x4_t x, int exponent) {
  // A negative per-lane count makes vrshlq perform a rounding right shift.
  const int32x4_t negated_exponent = vdupq_n_s32(-exponent);
  // Sign bit of (x & -exponent) smeared over the whole lane: -1 when both x
  // and -exponent are negative, 0 otherwise (in particular when exponent is
  // zero).
  const int32x4_t negative_nudge =
      vshrq_n_s32(vandq_s32(x, negated_exponent), 31);
  // Lower negative inputs by one (saturating) so that the round-half-up
  // behaviour of vrshlq rounds ties away from zero instead.
  const int32x4_t nudged_x = vqaddq_s32(x, negative_nudge);
  return vrshlq_s32(nudged_x, negated_exponent);
}

// NEON specialization of RoundingDivideByPOT: divides each int16 lane by
// 2^exponent using a rounding right shift.
template <>
inline int16x8_t RoundingDivideByPOT(int16x8_t x, int exponent) {
  // A negative per-lane count makes vrshlq perform a rounding right shift.
  const int16x8_t negated_exponent = vdupq_n_s16(-exponent);
  // Sign bit of (x & -exponent) smeared over the whole lane: -1 when both x
  // and -exponent are negative, 0 otherwise (in particular when exponent is
  // zero).
  const int16x8_t negative_nudge =
      vshrq_n_s16(vandq_s16(x, negated_exponent), 15);
  // Lower negative inputs by one (saturating) so that the round-half-up
  // behaviour of vrshlq rounds ties away from zero instead.
  const int16x8_t nudged_x = vqaddq_s16(x, negative_nudge);
  return vrshlq_s16(nudged_x, negated_exponent);
}

// int32x4_t specialization selected by a third template argument of 1
// (presumably the sign of Exponent -- confirm against the primary template).
// Multiplies each lane by 2^Exponent via a saturating left shift.
template <int Exponent>
struct ImplSaturatingRoundingMultiplyByPOT<Exponent, int32x4_t, 1> {
  static int32x4_t eval(int32x4_t x) {
    const int32x4_t scaled = vqshlq_n_s32(x, Exponent);
    return scaled;
  }
};

// int32x4_t specialization selected by a third template argument of -1
// (presumably the sign of Exponent -- confirm against the primary template).
// Scales each lane down with a rounding right shift by -Exponent bits.
template <int Exponent>
struct ImplSaturatingRoundingMultiplyByPOT<Exponent, int32x4_t, -1> {
  static int32x4_t eval(int32x4_t x) {
    // -1 in lanes holding a negative value, 0 elsewhere.
    const int32x4_t negative_nudge = vshrq_n_s32(x, 31);
    // Lower negative inputs by one (saturating) before the rounding shift so
    // ties round away from zero.
    const int32x4_t nudged_x = vqaddq_s32(x, negative_nudge);
    return vrshrq_n_s32(nudged_x, -Exponent);
  }
};

// int16x8_t specialization selected by a third template argument of 1
// (presumably the sign of Exponent -- confirm against the primary template).
// Multiplies each lane by 2^Exponent via a saturating left shift.
template <int Exponent>
struct ImplSaturatingRoundingMultiplyByPOT<Exponent, int16x8_t, 1> {
  static int16x8_t eval(int16x8_t x) {
    const int16x8_t scaled = vqshlq_n_s16(x, Exponent);
    return scaled;
  }
};

// int16x8_t specialization selected by a third template argument of -1
// (presumably the sign of Exponent -- confirm against the primary template).
// Scales each lane down with a rounding right shift by -Exponent bits.
template <int Exponent>
struct ImplSaturatingRoundingMultiplyByPOT<Exponent, int16x8_t, -1> {
  static int16x8_t eval(int16x8_t x) {
    // -1 in lanes holding a negative value, 0 elsewhere.
    const int16x8_t negative_nudge = vshrq_n_s16(x, 15);
    // Lower negative inputs by one (saturating) before the rounding shift so
    // ties round away from zero.
    const int16x8_t nudged_x = vqaddq_s16(x, negative_nudge);
    return vrshrq_n_s16(nudged_x, -Exponent);
  }
};

// NEON specialization of Dup: broadcasts the scalar `x` into all four int32
// lanes.
template <>
inline int32x4_t Dup<int32x4_t>(std::int32_t x) {
  const int32x4_t broadcast = vdupq_n_s32(x);
  return broadcast;
}

// NEON specialization of Dup: broadcasts the scalar `x` into all eight int16
// lanes.
template <>
inline int16x8_t Dup<int16x8_t>(std::int16_t x) {
  const int16x8_t broadcast = vdupq_n_s16(x);
  return broadcast;
}

// NEON specialization of SaturatingAdd: lane-wise sum of eight int16 lanes,
// clamped to the int16 range instead of wrapping.
// So far this is only needed for int16.
template <>
inline int16x8_t SaturatingAdd(int16x8_t a, int16x8_t b) {
  const int16x8_t clamped_sum = vqaddq_s16(a, b);
  return clamped_sum;
}

}  // end namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_FIXEDPOINT_NEON_H_