1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
|
@
@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS. All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@
@ This file contains some minimum and maximum functions, optimized for
@ ARM Neon platform. The description header can be found in
@ signal_processing_library.h
@
@ The reference C code is in file min_max_operations.c. Code here is basically
@ a loop unrolling by 8 with Neon instructions. Bit-exact.
#include "webrtc/system_wrappers/interface/asm_defines.h"
@ Declare the six entry points implemented in this file. GLOBAL_FUNCTION is
@ a macro supplied by asm_defines.h (not visible here); presumably it makes
@ the symbol externally visible -- confirm against that header.
GLOBAL_FUNCTION WebRtcSpl_MaxAbsValueW16Neon
GLOBAL_FUNCTION WebRtcSpl_MaxAbsValueW32Neon
GLOBAL_FUNCTION WebRtcSpl_MaxValueW16Neon
GLOBAL_FUNCTION WebRtcSpl_MaxValueW32Neon
GLOBAL_FUNCTION WebRtcSpl_MinValueW16Neon
GLOBAL_FUNCTION WebRtcSpl_MinValueW32Neon
@ On ARM targets the .align operand is a power of two, so this requests
@ 4-byte alignment for the code that follows.
.align 2
@ int16_t WebRtcSpl_MaxAbsValueW16Neon(const int16_t* vector, int length);
@
@ Largest absolute value found in a vector of int16_t, clamped to 32767.
@   In:   r0 = vector, r1 = length (number of elements)
@   Out:  r0 = max(abs(vector[i])), or -1 when vector is NULL or length <= 0
@   Uses: r2 (running maximum), r3, r12, q12, q13, flags
DEFINE_FUNCTION WebRtcSpl_MaxAbsValueW16Neon
  mov r2, #-1                     @ Result register; -1 reports bad arguments.
  cmp r0, #0
  beq MAX_ABS_W16_DONE            @ NULL vector.
  cmp r1, #0
  ble MAX_ABS_W16_DONE            @ length <= 0.
  cmp r1, #8
  blt MAX_ABS_W16_TAIL            @ Under 8 elements: scalar loop only.

  vmov.i16 q12, #0                @ q12 = eight per-lane running maxima.
  sub r1, r1, #8                  @ Bias the count: the loop below repeats
                                  @ while at least 8 elements remain.

MAX_ABS_W16_BY_8:
  vld1.16 {q13}, [r0]!            @ Fetch 8 elements, advance the pointer.
  subs r1, r1, #8
  vabs.s16 q13, q13               @ abs(-32768) stays 0x8000 ...
  vmax.u16 q12, q12, q13          @ ... so compare unsigned to rank it as 32768.
  bge MAX_ABS_W16_BY_8

  @ Collapse the eight lanes of q12 into lane 0 and copy it to r2.
  vmax.u16 d24, d24, d25
  vpmax.u16 d24, d24, d24
  vpmax.u16 d24, d24, d24
  adds r1, r1, #8                 @ Undo the bias: r1 = leftover elements (0..7).
  vmov.u16 r2, d24[0]             @ Zero-extended, range 0..0x8000.
  beq MAX_ABS_W16_DONE            @ Nothing left over (flags from adds).

MAX_ABS_W16_TAIL:
  ldrsh r3, [r0], #2              @ r3 = next element, sign-extended.
  eor r12, r3, r3, asr #31        @ (x ^ sign) - sign gives abs(x).
  sub r12, r12, r3, asr #31
  cmp r2, r12
  movlt r2, r12                   @ Keep the larger of the two.
  subs r1, r1, #1
  bne MAX_ABS_W16_TAIL

MAX_ABS_W16_DONE:
  cmp r2, #0x8000                 @ abs(-32768) does not fit in int16_t:
  subeq r2, r2, #1                @ clamp it to 32767.
  mov r0, r2
  bx lr
@ int32_t WebRtcSpl_MaxAbsValueW32Neon(const int32_t* vector, int length);
@
@ Returns the largest absolute value in the vector, clamped to 0x7FFFFFFF,
@ or -1 when the vector pointer is NULL or length <= 0.
@   In:   r0 = vector, r1 = length (number of elements)
@   Out:  r0 = result
@   Uses: r2 (running maximum, treated as unsigned), r3, r12, q11-q14, flags
DEFINE_FUNCTION WebRtcSpl_MaxAbsValueW32Neon
  cmp r0, #0
  moveq r0, #-1
  beq EXIT                    @ Return -1 for a NULL pointer.
  cmp r1, #0                  @ length
  movle r0, #-1
  ble EXIT                    @ Return -1 if length <= 0.

  mov r2, #0                  @ Seed the running maximum. Without this the
                              @ scalar loop reads an unset r2 when length < 8.
  vmov.i32 q11, #0
  vmov.i32 q12, #0
  cmp r1, #8
  blt LOOP_MAX_ABS_VALUE_W32  @ Fewer than 8 elements: scalar loop only.

  sub r1, #8                  @ Bias the counter so the unrolled loop repeats
                              @ while at least 8 elements remain.

LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W32:
  vld1.32 {q13, q14}, [r0]!   @ Load 8 elements, advance the pointer.
  subs r1, #8                 @ Counter for loops
  vabs.s32 q13, q13           @ vabs leaves 0x80000000 unchanged.
  vabs.s32 q14, q14
  vmax.u32 q11, q13           @ Use u32 so 0x80000000 is not lost.
  vmax.u32 q12, q14
  bge LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W32

  @ Find the maximum value in the Neon registers and move it to r2.
  vmax.u32 q12, q11
  vmax.u32 d24, d25
  vpmax.u32 d24, d24, d24
  adds r1, #8                 @ Undo the bias: r1 = leftover elements (0..7).
  vmov.u32 r2, d24[0]
  beq END_MAX_ABS_VALUE_W32   @ No leftover elements (flags from adds).

LOOP_MAX_ABS_VALUE_W32:
  ldr r3, [r0], #4
  eor r12, r3, r3, asr #31    @ eor and then sub, to get absolute value.
  sub r12, r12, r3, asr #31
  cmp r2, r12
  movcc r2, r12               @ Unsigned compare so 0x80000000 ranks highest.
  subs r1, #1
  bne LOOP_MAX_ABS_VALUE_W32

END_MAX_ABS_VALUE_W32:
  mvn r0, #0x80000000         @ r0 = 0x7FFFFFFF, the clamp for 0x80000000.
  cmp r2, r0
  movcc r0, r2                @ r0 = min(r2, 0x7FFFFFFF), unsigned.

EXIT:
  bx lr
@ int16_t WebRtcSpl_MaxValueW16Neon(const int16_t* vector, int length);
@
@ Returns the largest (signed) element of the vector, or -32768 when the
@ vector pointer is NULL or length <= 0.
@   In:   r0 = vector, r1 = length (number of elements)
@   Out:  r0 = result, sign-extended to 32 bits
@   Uses: r2 (running maximum, signed), r3, q12, q13, flags
DEFINE_FUNCTION WebRtcSpl_MaxValueW16Neon
  mov r2, #0x8000
  sxth r2, r2                 @ r2 = -32768 (0xFFFF8000). The scalar loop
                              @ compares r2 as a signed 32-bit value, so the
                              @ seed must be sign-extended; a plain 0x8000
                              @ (+32768) would beat every element.
  cmp r0, #0
  beq END_MAX_VALUE_W16       @ NULL vector.
  cmp r1, #0
  ble END_MAX_VALUE_W16       @ length <= 0.

  vmov.i16 q12, #0x8000       @ Eight per-lane running maxima, all -32768.
  cmp r1, #8
  blt LOOP_MAX_VALUE_W16      @ Fewer than 8 elements: scalar loop only.

  sub r1, #8                  @ Bias the counter so the unrolled loop repeats
                              @ while at least 8 elements remain.

LOOP_UNROLLED_BY_8_MAX_VALUE_W16:
  vld1.16 {q13}, [r0]!        @ Load 8 elements, advance the pointer.
  subs r1, #8
  vmax.s16 q12, q13
  bge LOOP_UNROLLED_BY_8_MAX_VALUE_W16

  @ Find the maximum value in the Neon registers and move it to r2.
  vmax.s16 d24, d25
  vpmax.s16 d24, d24, d24
  vpmax.s16 d24, d24, d24
  adds r1, #8                 @ Undo the bias: r1 = leftover elements (0..7).
  vmov.s16 r2, d24[0]         @ Sign-extend: a negative vector maximum must
                              @ stay negative for the signed compare below
                              @ and for the int16_t return value.
  beq END_MAX_VALUE_W16       @ No leftover elements (flags from adds).

LOOP_MAX_VALUE_W16:
  ldrsh r3, [r0], #2
  cmp r2, r3
  movlt r2, r3                @ Keep the larger of the two (signed).
  subs r1, #1
  bne LOOP_MAX_VALUE_W16

END_MAX_VALUE_W16:
  mov r0, r2
  bx lr
@ int32_t WebRtcSpl_MaxValueW32Neon(const int32_t* vector, int length);
@
@ Largest (signed) element in a vector of int32_t.
@   In:   r0 = vector, r1 = length (number of elements)
@   Out:  r0 = maximum, or 0x80000000 when vector is NULL or length <= 0
@   Uses: r2 (running maximum), r3, q11-q14, flags
DEFINE_FUNCTION WebRtcSpl_MaxValueW32Neon
  mov r2, #0x80000000             @ Result starts at the smallest int32_t.
  cmp r0, #0
  beq MAX_W32_DONE                @ NULL vector.
  cmp r1, #0
  ble MAX_W32_DONE                @ length <= 0.

  vmov.i32 q11, #0x80000000       @ Two sets of four per-lane maxima, one
  vmov.i32 q12, #0x80000000       @ for each half of an 8-element group.
  cmp r1, #8
  blt MAX_W32_TAIL                @ Under 8 elements: scalar loop only.

  sub r1, r1, #8                  @ Bias the count: the loop below repeats
                                  @ while at least 8 elements remain.

MAX_W32_BY_8:
  vld1.32 {q13, q14}, [r0]!       @ Fetch 8 elements, advance the pointer.
  subs r1, r1, #8
  vmax.s32 q11, q11, q13
  vmax.s32 q12, q12, q14
  bge MAX_W32_BY_8

  @ Merge both accumulators, then collapse the four lanes into lane 0.
  vmax.s32 q12, q12, q11
  vpmax.s32 d24, d24, d25
  vpmax.s32 d24, d24, d24
  adds r1, r1, #8                 @ Undo the bias: r1 = leftover elements (0..7).
  vmov.s32 r2, d24[0]
  beq MAX_W32_DONE                @ Nothing left over (flags from adds).

MAX_W32_TAIL:
  ldr r3, [r0], #4                @ r3 = next element.
  cmp r2, r3
  movlt r2, r3                    @ Keep the larger of the two (signed).
  subs r1, r1, #1
  bne MAX_W32_TAIL

MAX_W32_DONE:
  mov r0, r2
  bx lr
@ int16_t WebRtcSpl_MinValueW16Neon(const int16_t* vector, int length);
@
@ Smallest (signed) element in a vector of int16_t.
@   In:   r0 = vector, r1 = length (number of elements)
@   Out:  r0 = minimum, or 32767 when vector is NULL or length <= 0
@   Uses: r2 (running minimum), r3, q12, q13, flags
DEFINE_FUNCTION WebRtcSpl_MinValueW16Neon
  movw r2, #0x7FFF                @ Result starts at the largest int16_t.
  cmp r0, #0
  beq MIN_W16_DONE                @ NULL vector.
  cmp r1, #0
  ble MIN_W16_DONE                @ length <= 0.

  vmov.i16 q12, #0x7FFF           @ q12 = eight per-lane running minima.
  cmp r1, #8
  blt MIN_W16_TAIL                @ Under 8 elements: scalar loop only.

  sub r1, r1, #8                  @ Bias the count: the loop below repeats
                                  @ while at least 8 elements remain.

MIN_W16_BY_8:
  vld1.16 {q13}, [r0]!            @ Fetch 8 elements, advance the pointer.
  subs r1, r1, #8
  vmin.s16 q12, q12, q13
  bge MIN_W16_BY_8

  @ Collapse the eight lanes of q12 into lane 0 (the minimum) and copy it
  @ to r2.
  vmin.s16 d24, d24, d25
  vpmin.s16 d24, d24, d24
  vpmin.s16 d24, d24, d24
  adds r1, r1, #8                 @ Undo the bias: r1 = leftover elements (0..7).
  vmov.s16 r2, d24[0]             @ Signed lane read into r2.
  sxth r2, r2                     @ Sign-extend the low halfword of r2.
  beq MIN_W16_DONE                @ Nothing left over (flags from adds).

MIN_W16_TAIL:
  ldrsh r3, [r0], #2              @ r3 = next element, sign-extended.
  cmp r2, r3
  movge r2, r3                    @ Keep the smaller of the two (signed).
  subs r1, r1, #1
  bne MIN_W16_TAIL

MIN_W16_DONE:
  mov r0, r2
  bx lr
@ int32_t WebRtcSpl_MinValueW32Neon(const int32_t* vector, int length);
@
@ Smallest (signed) element in a vector of int32_t.
@   In:   r0 = vector, r1 = length (number of elements)
@   Out:  r0 = minimum, or 0x7FFFFFFF when vector is NULL or length <= 0
@   Uses: r2 (running minimum), r3, q11-q14, flags
DEFINE_FUNCTION WebRtcSpl_MinValueW32Neon
  mov r2, #0x7FFFFFFF             @ Result starts at the largest int32_t.
  cmp r0, #0
  beq MIN_W32_DONE                @ NULL vector.
  cmp r1, #0
  ble MIN_W32_DONE                @ length <= 0.

  vdup.32 q11, r2                 @ Two sets of four per-lane minima, one
  vdup.32 q12, r2                 @ for each half of an 8-element group.
  cmp r1, #8
  blt MIN_W32_TAIL                @ Under 8 elements: scalar loop only.

  sub r1, r1, #8                  @ Bias the count: the loop below repeats
                                  @ while at least 8 elements remain.

MIN_W32_BY_8:
  vld1.32 {q13, q14}, [r0]!       @ Fetch 8 elements, advance the pointer.
  subs r1, r1, #8
  vmin.s32 q11, q11, q13
  vmin.s32 q12, q12, q14
  bge MIN_W32_BY_8

  @ Merge both accumulators, then collapse the four lanes into lane 0
  @ (the minimum) and copy it to r2.
  vmin.s32 q12, q12, q11
  vpmin.s32 d24, d24, d25
  vpmin.s32 d24, d24, d24
  adds r1, r1, #8                 @ Undo the bias: r1 = leftover elements (0..7).
  vmov.s32 r2, d24[0]
  beq MIN_W32_DONE                @ Nothing left over (flags from adds).

MIN_W32_TAIL:
  ldr r3, [r0], #4                @ r3 = next element.
  cmp r2, r3
  movge r2, r3                    @ Keep the smaller of the two (signed).
  subs r1, r1, #1
  bne MIN_W32_TAIL

MIN_W32_DONE:
  mov r0, r2
  bx lr
|