common_audio/signal_processing/min_max_operations_neon.S


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283

@
@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS.  All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@

@ This file contains some minimum and maximum functions, optimized for
@ ARM Neon platform. The description header can be found in
@ signal_processing_library.h
@
@ The reference C code is in file min_max_operations.c. Code here is basically
@ a loop unrolling by 8 with Neon instructions. Bit-exact.

#include "webrtc/system_wrappers/interface/asm_defines.h"

GLOBAL_FUNCTION WebRtcSpl_MaxAbsValueW16Neon
GLOBAL_FUNCTION WebRtcSpl_MaxAbsValueW32Neon
GLOBAL_FUNCTION WebRtcSpl_MaxValueW16Neon
GLOBAL_FUNCTION WebRtcSpl_MaxValueW32Neon
GLOBAL_FUNCTION WebRtcSpl_MinValueW16Neon
GLOBAL_FUNCTION WebRtcSpl_MinValueW32Neon

.align  2
@ int16_t WebRtcSpl_MaxAbsValueW16Neon(const int16_t* vector, int length);
DEFINE_FUNCTION WebRtcSpl_MaxAbsValueW16Neon
  mov r2, #-1                 @ Initialize the return value.
  cmp r0, #0
  beq END_MAX_ABS_VALUE_W16
  cmp r1, #0
  ble END_MAX_ABS_VALUE_W16

  cmp r1, #8
  blt LOOP_MAX_ABS_VALUE_W16

  vmov.i16 q12, #0
  sub r1, #8                  @ Counter for loops

LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W16:
  vld1.16 {q13}, [r0]!
  subs r1, #8
  vabs.s16 q13, q13           @ Note vabs doesn't change the value of -32768.
  vmax.u16 q12, q13           @ Use u16 so we don't lose the value -32768.
  bge LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W16

  @ Find the maximum value in the Neon registers and move it to r2.
  vmax.u16 d24, d25
  vpmax.u16 d24, d24, d24
  vpmax.u16 d24, d24, d24
  adds r1, #8
  vmov.u16 r2, d24[0]
  beq END_MAX_ABS_VALUE_W16

LOOP_MAX_ABS_VALUE_W16:
  ldrsh r3, [r0], #2
  eor r12, r3, r3, asr #31    @ eor and then sub, to get absolute value.
  sub r12, r12, r3, asr #31
  cmp r2, r12
  movlt r2, r12
  subs r1, #1
  bne LOOP_MAX_ABS_VALUE_W16

END_MAX_ABS_VALUE_W16:
  cmp r2, #0x8000             @ Guard against the case for -32768.
  subeq r2, #1
  mov r0, r2
  bx  lr


@ int32_t WebRtcSpl_MaxAbsValueW32Neon(const int32_t* vector, int length);
DEFINE_FUNCTION WebRtcSpl_MaxAbsValueW32Neon
  cmp r0, #0
  moveq r0, #-1
  beq EXIT                    @ Return -1 for a NULL pointer.
  cmp r1, #0                  @ length
  movle r0, #-1
  ble EXIT                    @ Return -1 if length <= 0.

  vmov.i32 q11, #0
  vmov.i32 q12, #0
  cmp r1, #8
  blt LOOP_MAX_ABS_VALUE_W32

  sub r1, #8                  @ Counter for loops

LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W32:
  vld1.32 {q13, q14}, [r0]!
  subs r1, #8                 @ Counter for loops
  vabs.s32 q13, q13           @ vabs doesn't change the value of 0x80000000.
  vabs.s32 q14, q14
  vmax.u32 q11, q13           @ Use u32 so we don't lose the value 0x80000000.
  vmax.u32 q12, q14
  bge LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W32

  @ Find the maximum value in the Neon registers and move it to r2.
  vmax.u32 q12, q11
  vmax.u32 d24, d25
  vpmax.u32 d24, d24, d24
  adds r1, #8
  vmov.u32 r2, d24[0]
  beq END_MAX_ABS_VALUE_W32

LOOP_MAX_ABS_VALUE_W32:
  ldr r3, [r0], #4
  eor r12, r3, r3, asr #31    @ eor and then sub, to get absolute value.
  sub r12, r12, r3, asr #31
  cmp r2, r12
  movcc r2, r12
  subs r1, #1
  bne LOOP_MAX_ABS_VALUE_W32

END_MAX_ABS_VALUE_W32:
  mvn r0, #0x80000000         @ Guard against the case for 0x80000000.
  cmp r2, r0
  movcc r0, r2

EXIT:
  bx  lr

@ int16_t WebRtcSpl_MaxValueW16Neon(const int16_t* vector, int length);
DEFINE_FUNCTION WebRtcSpl_MaxValueW16Neon
  mov r2, #0x8000             @ Initialize the return value.
  cmp r0, #0
  beq END_MAX_VALUE_W16
  cmp r1, #0
  ble END_MAX_VALUE_W16

  vmov.i16 q12, #0x8000
  cmp r1, #8
  blt LOOP_MAX_VALUE_W16

  sub r1, #8                  @ Counter for loops

LOOP_UNROLLED_BY_8_MAX_VALUE_W16:
  vld1.16 {q13}, [r0]!
  subs r1, #8
  vmax.s16 q12, q13
  bge LOOP_UNROLLED_BY_8_MAX_VALUE_W16

  @ Find the maximum value in the Neon registers and move it to r2.
  vmax.s16 d24, d25
  vpmax.s16 d24, d24, d24
  vpmax.s16 d24, d24, d24
  adds r1, #8
  vmov.u16 r2, d24[0]
  beq END_MAX_VALUE_W16

LOOP_MAX_VALUE_W16:
  ldrsh r3, [r0], #2
  cmp r2, r3
  movlt r2, r3
  subs r1, #1
  bne LOOP_MAX_VALUE_W16

END_MAX_VALUE_W16:
  mov r0, r2
  bx  lr

@ int32_t WebRtcSpl_MaxValueW32Neon(const int32_t* vector, int length);
DEFINE_FUNCTION WebRtcSpl_MaxValueW32Neon
  mov r2, #0x80000000         @ Initialize the return value.
  cmp r0, #0
  beq END_MAX_VALUE_W32
  cmp r1, #0
  ble END_MAX_VALUE_W32

  vmov.i32 q11, #0x80000000
  vmov.i32 q12, #0x80000000
  cmp r1, #8
  blt LOOP_MAX_VALUE_W32

  sub r1, #8                  @ Counter for loops

LOOP_UNROLLED_BY_8_MAX_VALUE_W32:
  vld1.32 {q13, q14}, [r0]!
  subs r1, #8
  vmax.s32 q11, q13
  vmax.s32 q12, q14
  bge LOOP_UNROLLED_BY_8_MAX_VALUE_W32

  @ Find the maximum value in the Neon registers and move it to r2.
  vmax.s32 q12, q11
  vpmax.s32 d24, d24, d25
  vpmax.s32 d24, d24, d24
  adds r1, #8
  vmov.s32 r2, d24[0]
  beq END_MAX_VALUE_W32

LOOP_MAX_VALUE_W32:
  ldr r3, [r0], #4
  cmp r2, r3
  movlt r2, r3
  subs r1, #1
  bne LOOP_MAX_VALUE_W32

END_MAX_VALUE_W32:
  mov r0, r2
  bx  lr

@ int16_t WebRtcSpl_MinValueW16Neon(const int16_t* vector, int length);
DEFINE_FUNCTION WebRtcSpl_MinValueW16Neon
  movw r2, #0x7FFF            @ Initialize the return value.
  cmp r0, #0
  beq END_MIN_VALUE_W16
  cmp r1, #0
  ble END_MIN_VALUE_W16

  vmov.i16 q12, #0x7FFF
  cmp r1, #8
  blt LOOP_MIN_VALUE_W16

  sub r1, #8                  @ Counter for loops

LOOP_UNROLLED_BY_8_MIN_VALUE_W16:
  vld1.16 {q13}, [r0]!
  subs r1, #8
  vmin.s16 q12, q13
  bge LOOP_UNROLLED_BY_8_MIN_VALUE_W16

  @ Find the maximum value in the Neon registers and move it to r2.
  vmin.s16 d24, d25
  vpmin.s16 d24, d24, d24
  vpmin.s16 d24, d24, d24
  adds r1, #8
  vmov.s16 r2, d24[0]
  sxth  r2, r2
  beq END_MIN_VALUE_W16

LOOP_MIN_VALUE_W16:
  ldrsh r3, [r0], #2
  cmp r2, r3
  movge r2, r3
  subs r1, #1
  bne LOOP_MIN_VALUE_W16

END_MIN_VALUE_W16:
  mov r0, r2
  bx  lr

@ int32_t WebRtcSpl_MinValueW32Neon(const int32_t* vector, int length);
DEFINE_FUNCTION WebRtcSpl_MinValueW32Neon
  mov r2, #0x7FFFFFFF         @ Initialize the return value.
  cmp r0, #0
  beq END_MIN_VALUE_W32
  cmp r1, #0
  ble END_MIN_VALUE_W32

  vdup.32 q11, r2
  vdup.32 q12, r2
  cmp r1, #8
  blt LOOP_MIN_VALUE_W32

  sub r1, #8                  @ Counter for loops

LOOP_UNROLLED_BY_8_MIN_VALUE_W32:
  vld1.32 {q13, q14}, [r0]!
  subs r1, #8
  vmin.s32 q11, q13
  vmin.s32 q12, q14
  bge LOOP_UNROLLED_BY_8_MIN_VALUE_W32

  @ Find the maximum value in the Neon registers and move it to r2.
  vmin.s32 q12, q11
  vpmin.s32 d24, d24, d25
  vpmin.s32 d24, d24, d24
  adds r1, #8
  vmov.s32 r2, d24[0]
  beq END_MIN_VALUE_W32

LOOP_MIN_VALUE_W32:
  ldr r3, [r0], #4
  cmp r2, r3
  movge r2, r3
  subs r1, #1
  bne LOOP_MIN_VALUE_W32

END_MIN_VALUE_W32:
  mov r0, r2
  bx  lr