1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
|
@
@ Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS. All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@
@ Some code in this file was originally from file
@ armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S which was licensed as
@ follows. It has been relicensed with permission from the copyright holders.
@
@
@ OpenMAX DL: v1.0.2
@ Last Modified Revision: 7485
@ Last Modified Date: Fri, 21 Sep 2007
@
@ (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
@
@
@ Description:
@ Compute the "preTwiddleRadix2" stage prior to the call to the complexFFT.
@ It does a Z(k) = Feven(k) + jW^(-k) FOdd(k); k=0,1,2,...N/2-1 computation.
@ It implements both "scaled"(by 1/2) and "unscaled" versions of the above
@ formula.
@
#include "dl/api/arm/armCOMM_s.h"
#include "dl/api/arm/omxtypes_s.h"
@ ---------------------------------------------------------------------
@ Register aliases
@
@ Comments are kept on their own lines on purpose: anything that follows
@ the value on an alias line would become part of the expansion.
@ Several aliases map to the same physical register; such names must
@ not be live at the same time.
@ ---------------------------------------------------------------------

@ Arguments on entry
#define pSrc            r0
#define pDst            r1
#define pFFTSpec        r2
#define scale           r3

@ Value returned to the caller
#define result          r0

@ Core-register scratch aliases
#define argTwiddle      r1
#define argDst          r2
#define argScale        r4
#define tmpOrder        r4
#define pTwiddle        r4
#define pOut            r5
#define subFFTSize      r7
#define subFFTNum       r6
#define N               r6
#define order           r14
#define diff            r9

@ Total number of radix stages needed to complete the FFT.
#define count           r8

#define x0r             r4
#define x0i             r5
#define diffMinusOne    r2
#define round           r3
#define pOut1           r2
#define size            r7
#define step            r8
#define step1           r9
#define step2           r10
#define twStep          r10
#define pTwiddleTmp     r11
#define argTwiddle1     r12
#define zero            r14

@ NEON aliases: input data and Z(0) temporaries (D0-D3)
#define dX0             D0.S16
#define dX0S32          D0.S32
#define dShift          D1.S16
#define dX1             D1.S16
#define dX1S32          D1.S32
#define dY0             D2.S16
#define dY1             D3.S16
#define dX0r            D0.S16
#define dX0rS32         D0.S32
#define dX0i            D1.S16
#define dX1r            D2.S16
#define dX1i            D3.S16
#define qX1             Q1.S16

@ NEON aliases: twiddle factors (D4-D7), 16-bit and 32-bit lane views
#define dW0r            D4.S16
#define dW0i            D5.S16
#define dW1r            D6.S16
#define dW1i            D7.S16
#define dW0rS32         D4.S32
#define dW0iS32         D5.S32
#define dW1rS32         D6.S32
#define dW1iS32         D7.S32

@ NEON aliases: sums/differences (D8-D11) and 32-bit products (Q6-Q9)
#define dT0             D8.S16
#define dT1             D9.S16
#define dT2             D10.S16
#define dT3             D11.S16
#define qT0             Q6.S32
#define qT1             Q7.S32
#define qT2             Q8.S32
#define qT3             Q9.S32

@ NEON aliases: results (these reuse the twiddle registers D4-D7)
#define dY0r            D4.S16
#define dY0i            D5.S16
#define dY1r            D6.S16
#define dY1i            D7.S16
#define qY1             Q3.S16
#define dY2             D4.S16
#define dY3             D5.S16
#define dW0             D6.S16
#define dW1             D7.S16
#define dW0Tmp          D10.S16
#define dW1Neg          D11.S16

@ ---------------------------------------------------------------------
@ Byte offsets of the fields of the FFT spec structure that pFFTSpec
@ points to (used as LDR immediates).
@ ---------------------------------------------------------------------
.set    ARMsFFTSpec_N,          0
.set    ARMsFFTSpec_pBitRev,    4
.set    ARMsFFTSpec_pTwiddle,   8
.set    ARMsFFTSpec_pBuf,       12
@ ---------------------------------------------------------------------
@ FFTSTAGE scaled, inverse, name
@
@ Emits the body of the preTwiddleRadix2 stage:
@   Z(k) = [F(k) + F'(N/2-k)] + j*W^(-k) [F(k) - F'(N/2-k)]
@
@ Macro parameters
@   scaled  : the string "TRUE" selects the halving combine (VHADD/VHSUB),
@             shifts the last element right by one bit and subtracts 1
@             from scale on exit. Any other string selects VADD/VSUB and
@             leaves scale untouched.
@   inverse : accepted but not referenced anywhere in the macro body.
@   name    : suffix appended to every label so the macro can be
@             expanded more than once in the same file.
@
@ Register inputs
@   pSrc     (r0) : input data; each complex element is read as a pair
@                   of 16-bit lanes (4 bytes per element).
@   pFFTSpec (r2) : spec structure; N, pTwiddle and pBuf are read from it.
@   scale    (r3) : only touched when scaled is "TRUE".
@   pDst     (r1) : not referenced by this macro.
@
@ Output
@   Results are stored starting at pBuf + 2*N bytes (pOut1 = pOut + step).
@
@ Registers written
@   r0, r2, r4-r12 (plus r3 when scaled), D0-D11, Q6-Q9 and the flags.
@   pOut1 is the same physical register as pFFTSpec (r2), so the spec
@   pointer is lost once pOut1 is set up.
@
@ Paths
@   size = N/2 > 4  : vector loop, 4 + 4 elements per iteration.
@   size = N/2 <= 4 : single pass that moves one element per lane access.
@   Both paths then fall into the last-element code and the scale update.
@ ---------------------------------------------------------------------
.macro FFTSTAGE scaled, inverse, name
@ Read the FFT size N from the spec structure (no log is computed here)
LDR N, [pFFTSpec, #ARMsFFTSpec_N]
@ Read other structure parameters
LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
MOV size,N,ASR #1 @ size = N/2; preserve the contents of N
MOV step,N,LSL #1 @ step = N/2 * 4 bytes
@ Process different FFT sizes with different loops.
@ size <= 4 (signed compare) takes the small-size path below.
CMP size,#4
BLE smallFFTSize\name
@ Z(k) = 1/2 {[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]}
@ Note: W^(k) is stored as negated value and also need to
@ conjugate the values from the table.
@ Z(0) : no need of twiddle multiply
@ Z(0) = 1/2 { [F(0) + F'(N/2)] +j [F(0) - F'(N/2)] }
@ With F(0) = a + jb and F(N/2) = c + jd:
@ load F(0) into lane pair 0 of D0, then advance pSrc by step to F(N/2)
VLD1 dX0S32[0],[pSrc],step
@ pOut1 reuses r2, so pFFTSpec is no longer available after this point
ADD pOut1,pOut,step @ pOut1 = pOut+ N/2*4 bytes
@ load F(N/2); pSrc is post-incremented by 4 bytes
VLD1 dX1S32[0],[pSrc]!
SUB twStep,step,size @ twStep = 3N/8 * 4 bytes pointing to W^1
MOV step1,size,LSL #1 @ step1 = N/4 * 4 = N/2*2 bytes
SUB step1,step1,#4 @ (N/4-1)*4 bytes
@ The first add/sub is a halving one in both the scaled and unscaled builds
VHADD dY0,dX0,dX1 @ [b+d | a+c]
VHSUB dY1,dX0,dX1 @ [b-d | a-c]
VTRN dY0,dY1 @ dY0= [a-c | a+c] ;dY1= [b-d | b+d]
@ The SUBS in either branch sets the flags that the BLT/BEQ pair further
@ down consumes; nothing in between writes the flags.
.ifeqs "\scaled", "TRUE"
VHSUB dX0,dY0,dY1
SUBS size,size,#2
VHADD dX1,dY0,dY1
.else
VSUB dX0,dY0,dY1
SUBS size,size,#2
VADD dX1,dY0,dY1
.endif
@ rewind pSrc: net effect of the two loads is pSrc = start + 4 bytes
SUB pSrc,pSrc,step
@ real part of Z(0): lane 0 of dX0 = (a+c) - (b+d)
VST1 dX0[0],[pOut1]!
ADD pTwiddleTmp,pTwiddle,#4 @ W^2
@ imaginary part of Z(0): lane 1 of dX1 = (a-c) + (b-d)
VST1 dX1[1],[pOut1]!
ADD argTwiddle1,pTwiddle,twStep @ W^1
@ size was > 4 on this path, so size-2 is positive and neither branch
@ below is taken here; the same pair is live on the small-size path.
BLT decrementScale\name
BEQ lastElement\name
@ step now spans from the first of 4 low elements to the first of the
@ matching 4 high elements (2N - 20 bytes, pSrc being at start + 4)
SUB step,step,#20
SUB step1,step1,#4 @ (N/4-2)*4 bytes
@ step2 shares r10 with twStep, which is not needed any more
SUB step2, step1, #4
@ Z(k) = 1/2[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]
@ Note: W^k is stored as negative values in the table and also need to
@ conjugate the values from the table.
@ Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
@ since both of them require F(1),F(2) and F(N/2-2),F(N/2-1).
evenOddButterflyLoop\name:
@ de-interleave 4 low elements into real/imag registers, jump to the
@ high group, load 4 more, then rewind: net pSrc advance is 16 bytes
VLD2 {dX0r,dX0i},[pSrc],step
VLD2 {dX1r,dX1i},[pSrc]!
SUB pSrc, pSrc, step
VLD1 dW0r,[argTwiddle1],step1
@ reverse the four 16-bit lanes of each half of qX1 (dX1r and dX1i)
VREV64 qX1,qX1
VLD1 dW1r,[argTwiddle1]!
VHSUB dT2,dX0r,dX1r @ a-c
@ net argTwiddle1 advance is 8 bytes; its stride shrinks by 16 per pass
SUB argTwiddle1, argTwiddle1, step1
SUB step1,step1,#16
VLD1 dW0i,[pTwiddleTmp],step2
VHADD dT3,dX0i,dX1i @ b+d
VLD1 dW1i,[pTwiddleTmp]!
VHADD dT0,dX0r,dX1r @ a+c
VHSUB dT1,dX0i,dX1i @ b-d
@ net pTwiddleTmp advance is 8 bytes; its stride shrinks by 16 per pass
SUB pTwiddleTmp, pTwiddleTmp, step2
SUB step2,step2,#16
@ loop counter: these flags are consumed by the BGT at the loop bottom
SUBS size,size,#8
@ NOTE(review): the three shuffles below rearrange the loaded table
@ lanes to pair with the data lanes; the table layout itself is defined
@ outside this file, so confirm against the twiddle table generator.
VZIP dW1r,dW1i
VTRN dW0r,dW0i
VZIP dW1iS32, dW1rS32
@ 16x16 -> 32-bit complex products of the twiddles with (a-c, b+d)
VMULL qT0,dW1i,dT2
VMLSL qT0,dW1r,dT3
VMULL qT1,dW1i,dT3
VMLAL qT1,dW1r,dT2
VMULL qT2,dW0r,dT2
VMLAL qT2,dW0i,dT3
VMULL qT3,dW0r,dT3
VMLSL qT3,dW0i,dT2
@ round, shift right by 15 and narrow the products back to 16 bits
VRSHRN dX1r,qT0,#15
VRSHRN dX1i,qT1,#15
VRSHRN dX0r,qT2,#15
VRSHRN dX0i,qT3,#15
.ifeqs "\scaled", "TRUE"
VHADD dY1r,dT0,dX1i @ F(N/2 -1)
VHSUB dY1i,dX1r,dT1
.else
VADD dY1r,dT0,dX1i @ F(N/2 -1)
VSUB dY1i,dX1r,dT1
.endif
.ifeqs "\scaled", "TRUE"
VHADD dY0r,dT0,dX0i @ F(1)
VHSUB dY0i,dT1,dX0r
.else
VADD dY0r,dT0,dX0i @ F(1)
VSUB dY0i,dT1,dX0r
.endif
@ undo the lane reversal before storing the high group
VREV64 qY1,qY1
@ store low group, jump to the high group, store it, then rewind:
@ net pOut1 advance is 16 bytes
VST2 {dY0r,dY0i},[pOut1],step
VST2 {dY1r,dY1i},[pOut1]
ADD pOut1,pOut1,#16
SUB pOut1, pOut1, step
@ low and high groups moved 16 bytes towards each other
SUB step,step,#32
BGT evenOddButterflyLoop\name
SUB pSrc,pSrc,#4 @ set both the ptrs to the last element
SUB pOut1,pOut1,#4
B lastElement\name
smallFFTSize\name:
@ Z(k) = 1/2 {[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]}
@ Note: W^(k) is stored as negated value and also need to
@ conjugate the values from the table.
@ Z(0) : no need of twiddle multiply
@ Z(0) = 1/2 { [F(0) + F'(N/2)] +j [F(0) - F'(N/2)] }
@ Same Z(0) sequence as on the large-size path.
VLD1 dX0S32[0],[pSrc],step
@ pOut1 reuses r2, so pFFTSpec is no longer available after this point
ADD pOut1,pOut,step @ pOut1 = pOut+ N/2*4 bytes
VLD1 dX1S32[0],[pSrc]!
SUB twStep,step,size @ twStep = 3N/8 * 4 bytes pointing to W^1
MOV step1,size,LSL #1 @ step1 = N/4 * 4 = N/2*2 bytes
SUB step1,step1,#4 @ (N/4-1)*4 bytes
VHADD dY0,dX0,dX1 @ [b+d | a+c]
VHSUB dY1,dX0,dX1 @ [b-d | a-c]
VTRN dY0,dY1 @ dY0= [a-c | a+c] ;dY1= [b-d | b+d]
@ The SUBS in either branch sets the flags for the BLT/BEQ pair below
.ifeqs "\scaled", "TRUE"
VHSUB dX0,dY0,dY1
SUBS size,size,#2
VHADD dX1,dY0,dY1
.else
VSUB dX0,dY0,dY1
SUBS size,size,#2
VADD dX1,dY0,dY1
.endif
SUB pSrc,pSrc,step
VST1 dX0[0],[pOut1]!
ADD pTwiddleTmp,pTwiddle,#4 @ W^2
VST1 dX1[1],[pOut1]!
ADD argTwiddle1,pTwiddle,twStep @ W^1
@ size-2 < 0: only Z(0) exists, skip straight to the scale update
BLT decrementScale\name
@ size-2 == 0: only the last element is left
BEQ lastElement\name
@ Z(k) = 1/2[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]
@ Note: W^k is stored as negative values in the table and also need to
@ conjugate the values from the table.
@ Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
@ since both of them require F(1),F(2) and F(N/2-2),F(N/2-1).
SUB step,step,#12
@ Despite its name, nothing branches back to this label: the code
@ below executes once.
evenOddButterflyLoopSize4\name:
VLD1 dW0rS32[0],[argTwiddle1],step1
VLD1 dW1rS32[0],[argTwiddle1]!
@ two low elements, one per lane, de-interleaved into real/imag
VLD2 {dX0r[0],dX0i[0]},[pSrc]!
VLD2 {dX0r[1],dX0i[1]},[pSrc],step
SUB pSrc,pSrc,#4
SUB argTwiddle1,argTwiddle1,step1
@ two high elements, one per lane
VLD2 {dX1r[0],dX1i[0]},[pSrc]!
VLD2 {dX1r[1],dX1i[1]},[pSrc]!
SUB step1,step1,#4 @ (N/4-2)*4 bytes
VLD1 dW0iS32[0],[pTwiddleTmp],step1
VLD1 dW1iS32[0],[pTwiddleTmp]!
SUB pSrc,pSrc,step
SUB pTwiddleTmp,pTwiddleTmp,step1
@ swap the two 16-bit lanes in each 32-bit word of the high elements
VREV32 dX1r,dX1r
VREV32 dX1i,dX1i
@ the flags written here are not consumed by any later instruction
SUBS size,size,#4
VHSUB dT2,dX0r,dX1r @ a-c
VHADD dT3,dX0i,dX1i @ b+d
SUB step1,step1,#4
VHADD dT0,dX0r,dX1r @ a+c
VHSUB dT1,dX0i,dX1i @ b-d
VTRN dW1r,dW1i
VTRN dW0r,dW0i
@ 16x16 -> 32-bit complex products of the twiddles with (a-c, b+d)
VMULL qT0,dW1r,dT2
VMLSL qT0,dW1i,dT3
VMULL qT1,dW1r,dT3
VMLAL qT1,dW1i,dT2
VMULL qT2,dW0r,dT2
VMLAL qT2,dW0i,dT3
VMULL qT3,dW0r,dT3
VMLSL qT3,dW0i,dT2
@ round, shift right by 15 and narrow the products back to 16 bits
VRSHRN dX1r,qT0,#15
VRSHRN dX1i,qT1,#15
.ifeqs "\scaled", "TRUE"
VHADD dY1r,dT0,dX1i @ F(N/2 -1)
VHSUB dY1i,dX1r,dT1
.else
VADD dY1r,dT0,dX1i @ F(N/2 -1)
VSUB dY1i,dX1r,dT1
.endif
@ undo the lane swap before storing the high elements
VREV32 dY1r,dY1r
VREV32 dY1i,dY1i
VRSHRN dX0r,qT2,#15
VRSHRN dX0i,qT3,#15
.ifeqs "\scaled", "TRUE"
VHADD dY0r,dT0,dX0i @ F(1)
VHSUB dY0i,dT1,dX0r
.else
VADD dY0r,dT0,dX0i @ F(1)
VSUB dY0i,dT1,dX0r
.endif
@ store the two low results, jump to the high pair, store it, rewind
VST2 {dY0r[0],dY0i[0]},[pOut1]!
VST2 {dY0r[1],dY0i[1]},[pOut1],step
SUB pOut1, #4
VST2 {dY1r[0],dY1i[0]},[pOut1]!
VST2 {dY1r[1],dY1i[1]},[pOut1]!
SUB pOut1,pOut1,step
SUB pSrc,pSrc,#4 @ set both the ptrs to the last element
SUB pOut1,pOut1,#4
@ Last element can be expanded as follows
@ 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (W^k is stored as -ve)
@ 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
@ 1/2[2a+j0] - j (c-jd) [0+j2b]
@ (a+bc, -bd)
@ Since (c,d) = (0,1) for the last element, result is just (a,-b)
lastElement\name:
@ load (a, b) into 16-bit lanes 0 and 1 of D0
VLD1 dX0rS32[0],[pSrc]
.ifeqs "\scaled", "TRUE"
@ signed shift right by one bit on every 16-bit lane
VSHR dX0r,dX0r,#1
.endif
@ store a, negate all lanes, then store -b from lane 1
VST1 dX0r[0],[pOut1]!
VNEG dX0r,dX0r
VST1 dX0r[1],[pOut1]
decrementScale\name:
@ The scaled build reports one bit of scaling by lowering scale by 1
.ifeqs "\scaled", "TRUE"
SUB scale,scale,#1
.endif
.endm
@ Unscaled variant: expands FFTSTAGE with scaled = "FALSE", so the
@ combine uses VADD/VSUB and scale (r3) is left unchanged. Labels
@ emitted by the macro carry the suffix Inv.
@ M_START and M_END are not defined in this file.
@ NOTE(review): FFTSTAGE writes r4-r12, D8-D11 and Q6-Q9; confirm that
@ M_START with the r4 argument, or the callers of this routine,
@ preserve whatever the calling convention in use requires.
M_START armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe,r4
FFTSTAGE "FALSE","TRUE",Inv
M_END
@ Scaled variant: expands FFTSTAGE with scaled = "TRUE", so the combine
@ uses the halving VHADD/VHSUB forms, the last element is shifted right
@ by one bit, and scale (r3) is decremented by 1 before M_END. Labels
@ emitted by the macro carry the suffix InvSfs.
@ M_START and M_END are not defined in this file.
@ NOTE(review): FFTSTAGE writes r4-r12, D8-D11 and Q6-Q9; confirm that
@ M_START with the r4 argument, or the callers of this routine,
@ preserve whatever the calling convention in use requires.
M_START armSP_FFTInv_CCSToR_S16_Sfs_preTwiddleRadix2_unsafe,r4
FFTSTAGE "TRUE","TRUE",InvSfs
M_END
.end
|