dl/sp/src/arm/neon/armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe_s.S


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409

@
@  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
@
@  Use of this source code is governed by a BSD-style license
@  that can be found in the LICENSE file in the root of the source
@  tree. An additional intellectual property rights grant can be found
@  in the file PATENTS.  All contributing project authors may
@  be found in the AUTHORS file in the root of the source tree.
@
@ Some code in this file was originally from file
@ armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S which was licensed as
@ follows. It has been relicensed with permission from the copyright holders.
@

@
@ OpenMAX DL: v1.0.2
@ Last Modified Revision:   7485
@ Last Modified Date:       Fri, 21 Sep 2007
@ 
@ (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
@

@
@ Description:
@ Compute the "preTwiddleRadix2" stage prior to the call to the complexFFT.
@ It does a Z(k) = Feven(k) + jW^(-k) FOdd(k); k=0,1,2,...N/2-1 computation.
@ It implements both "scaled"(by 1/2) and "unscaled" versions of the above
@ formula.
@ 
        
#include "dl/api/arm/armCOMM_s.h"
#include "dl/api/arm/omxtypes_s.h"
        
@//Input Registers
#define pSrc            r0
#define pDst            r1
#define pFFTSpec        r2
#define scale           r3

@ Output registers
#define result          r0

@//Local Scratch Registers
#define argTwiddle      r1
#define argDst          r2
#define argScale        r4
#define tmpOrder        r4
#define pTwiddle        r4
#define pOut            r5
#define subFFTSize      r7     
#define subFFTNum       r6
#define N               r6
#define order           r14
#define diff            r9
@ Total num of radix stages to comple the FFT.
#define count           r8
#define x0r             r4    
#define x0i             r5
#define diffMinusOne    r2
#define round           r3
#define pOut1           r2
#define size            r7
#define step            r8            
#define step1           r9
#define step2           r10
#define twStep          r10
#define pTwiddleTmp     r11
#define argTwiddle1     r12
#define zero            r14

@ Neon registers
#define dX0             D0.S16
#define dX0S32          D0.S32
#define dShift          D1.S16
#define dX1             D1.S16
#define dX1S32          D1.S32
#define dY0             D2.S16
#define dY1             D3.S16
#define dX0r            D0.S16            
#define dX0rS32         D0.S32
#define dX0i            D1.S16
#define dX1r            D2.S16
#define dX1i            D3.S16
#define qX1             Q1.S16
#define dW0r            D4.S16
#define dW0i            D5.S16
#define dW1r            D6.S16
#define dW1i            D7.S16
#define dW0rS32         D4.S32
#define dW0iS32         D5.S32
#define dW1rS32         D6.S32
#define dW1iS32         D7.S32
#define dT0             D8.S16
#define dT1             D9.S16
#define dT2             D10.S16
#define dT3             D11.S16
#define qT0             Q6.S32
#define qT1             Q7.S32
#define qT2             Q8.S32
#define qT3             Q9.S32
#define dY0r            D4.S16
#define dY0i            D5.S16
#define dY1r            D6.S16
#define dY1i            D7.S16
#define qY1             Q3.S16
#define dY2             D4.S16
#define dY3             D5.S16
#define dW0             D6.S16
#define dW1             D7.S16
#define dW0Tmp          D10.S16
#define dW1Neg          D11.S16

        @ Structure offsets for the FFTSpec             
        .set    ARMsFFTSpec_N, 0
        .set    ARMsFFTSpec_pBitRev, 4
        .set    ARMsFFTSpec_pTwiddle, 8
        .set    ARMsFFTSpec_pBuf, 12

        .macro FFTSTAGE scaled, inverse, name
        
        @ Read the size from structure and take log
        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
        
        @ Read other structure parameters
        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
        
        MOV     size,N,ASR #1        @ preserve the contents of N
        MOV     step,N,LSL #1        @ step = N/2 * 4 bytes
        
        @ Process different FFT sizes with different loops.
        CMP    size,#4
        BLE    smallFFTSize\name
        
        @ Z(k) = 1/2 {[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]}
        @ Note: W^(k) is stored as negated value and also need to
        @ conjugate the values from the table.
        
        @ Z(0) : no need of twiddle multiply
        @ Z(0) = 1/2 { [F(0) +  F'(N/2)] +j [F(0) -  F'(N/2)] }
        
        VLD1    dX0S32[0],[pSrc],step
        ADD     pOut1,pOut,step      @ pOut1 = pOut+ N/2*4 bytes 
                
        VLD1    dX1S32[0],[pSrc]!
        SUB     twStep,step,size     @ twStep = 3N/8 * 4 bytes pointing to W^1
        
        MOV     step1,size,LSL #1    @ step1 = N/4 * 4 = N/2*2 bytes
        SUB     step1,step1,#4       @ (N/4-1)*4 bytes
        
        VHADD    dY0,dX0,dX1         @ [b+d | a+c]
        VHSUB    dY1,dX0,dX1         @ [b-d | a-c] 
        VTRN    dY0,dY1              @ dY0= [a-c | a+c] ;dY1= [b-d | b+d] 
        
        .ifeqs  "\scaled", "TRUE"
            VHSUB   dX0,dY0,dY1
            SUBS    size,size,#2
            VHADD   dX1,dY0,dY1
        .else
            VSUB   dX0,dY0,dY1
            SUBS    size,size,#2
            VADD   dX1,dY0,dY1
        .endif
                    
        SUB     pSrc,pSrc,step
        VST1    dX0[0],[pOut1]!
        ADD     pTwiddleTmp,pTwiddle,#4                @ W^2
        VST1    dX1[1],[pOut1]!
        ADD     argTwiddle1,pTwiddle,twStep            @ W^1 
        
        BLT     decrementScale\name
        BEQ     lastElement\name
                        
        SUB     step,step,#20
        SUB     step1,step1,#4                         @ (N/4-1)*8 bytes
        SUB     step2, step1, #4
                        
        @ Z(k) = 1/2[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]
        @ Note: W^k is stored as negative values in the table and also need to
        @ conjugate the values from the table.
        @ Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
        @ since both of them require F(1),F(2) and F(N/2-2),F(N/2-1).

evenOddButterflyLoop\name:     
        VLD2    {dX0r,dX0i},[pSrc],step
        VLD2    {dX1r,dX1i},[pSrc]!
        SUB     pSrc, pSrc, step

        VLD1    dW0r,[argTwiddle1],step1
        VREV64  qX1,qX1
        VLD1    dW1r,[argTwiddle1]!
        VHSUB   dT2,dX0r,dX1r                          @ a-c
        SUB     argTwiddle1, argTwiddle1, step1
        SUB     step1,step1,#16

        VLD1    dW0i,[pTwiddleTmp],step2
        VHADD   dT3,dX0i,dX1i                          @ b+d
        VLD1    dW1i,[pTwiddleTmp]!
        VHADD   dT0,dX0r,dX1r                          @ a+c
        VHSUB   dT1,dX0i,dX1i                          @ b-d
        SUB     pTwiddleTmp, pTwiddleTmp, step2
        SUB     step2,step2,#16

        SUBS    size,size,#8
        
        VZIP    dW1r,dW1i
        VTRN    dW0r,dW0i
        VZIP    dW1iS32, dW1rS32
                                
        VMULL   qT0,dW1i,dT2
        VMLSL   qT0,dW1r,dT3
        VMULL   qT1,dW1i,dT3
        VMLAL   qT1,dW1r,dT2
        VMULL   qT2,dW0r,dT2
        VMLAL   qT2,dW0i,dT3
        VMULL   qT3,dW0r,dT3
        VMLSL   qT3,dW0i,dT2
        
        VRSHRN  dX1r,qT0,#15
        VRSHRN  dX1i,qT1,#15
        VRSHRN  dX0r,qT2,#15
        VRSHRN  dX0i,qT3,#15
        
        .ifeqs  "\scaled", "TRUE"
            VHADD    dY1r,dT0,dX1i                     @ F(N/2 -1)
            VHSUB    dY1i,dX1r,dT1
        .else
            VADD    dY1r,dT0,dX1i                      @ F(N/2 -1)
            VSUB    dY1i,dX1r,dT1
        .endif
        
        .ifeqs  "\scaled", "TRUE"
            VHADD    dY0r,dT0,dX0i                     @ F(1)
            VHSUB    dY0i,dT1,dX0r
        .else
            VADD    dY0r,dT0,dX0i                      @ F(1)
            VSUB    dY0i,dT1,dX0r
        .endif
        
        VREV64  qY1,qY1

        VST2    {dY0r,dY0i},[pOut1],step
        VST2    {dY1r,dY1i},[pOut1]
        ADD     pOut1,pOut1,#16
        SUB     pOut1, pOut1, step
        SUB     step,step,#32
       
        BGT     evenOddButterflyLoop\name

        SUB     pSrc,pSrc,#4           @ set both the ptrs to the last element
        SUB     pOut1,pOut1,#4
        B       lastElement\name
        
smallFFTSize\name:
        @ Z(k) = 1/2 {[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]}
        @ Note: W^(k) is stored as negated value and also need to
        @ conjugate the values from the table.
        
        @ Z(0) : no need of twiddle multiply
        @ Z(0) = 1/2 { [F(0) +  F'(N/2)] +j [F(0) -  F'(N/2)] }
        
        VLD1    dX0S32[0],[pSrc],step
        ADD     pOut1,pOut,step      @ pOut1 = pOut+ N/2*4 bytes 
                
        VLD1    dX1S32[0],[pSrc]!
        SUB     twStep,step,size     @ twStep = 3N/8 * 4 bytes pointing to W^1
        
        MOV     step1,size,LSL #1    @ step1 = N/4 * 4 = N/2*2 bytes
        SUB     step1,step1,#4       @ (N/4-1)*4 bytes
        
        VHADD    dY0,dX0,dX1         @ [b+d | a+c]
        VHSUB    dY1,dX0,dX1         @ [b-d | a-c] 
        VTRN    dY0,dY1              @ dY0= [a-c | a+c] ;dY1= [b-d | b+d] 
        
        .ifeqs  "\scaled", "TRUE"
            VHSUB   dX0,dY0,dY1
            SUBS    size,size,#2
            VHADD   dX1,dY0,dY1
        .else
            VSUB   dX0,dY0,dY1
            SUBS    size,size,#2
            VADD   dX1,dY0,dY1
        .endif
                    
        SUB     pSrc,pSrc,step
        VST1    dX0[0],[pOut1]!
        ADD     pTwiddleTmp,pTwiddle,#4                @ W^2
        VST1    dX1[1],[pOut1]!
        ADD     argTwiddle1,pTwiddle,twStep            @ W^1 
        
        BLT     decrementScale\name
        BEQ     lastElement\name
                        
        @ Z(k) = 1/2[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]
        @ Note: W^k is stored as negative values in the table and also need to
        @ conjugate the values from the table.
        @ Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
        @ since both of them require F(1),F(2) and F(N/2-2),F(N/2-1).

        SUB     step,step,#12

evenOddButterflyLoopSize4\name:     
        VLD1    dW0rS32[0],[argTwiddle1],step1
        VLD1    dW1rS32[0],[argTwiddle1]!
        
        VLD2    {dX0r[0],dX0i[0]},[pSrc]!
        VLD2    {dX0r[1],dX0i[1]},[pSrc],step
        SUB     pSrc,pSrc,#4
        SUB     argTwiddle1,argTwiddle1,step1
        VLD2    {dX1r[0],dX1i[0]},[pSrc]!
        VLD2    {dX1r[1],dX1i[1]},[pSrc]!
        
        SUB     step1,step1,#4                         @ (N/4-2)*4 bytes
        VLD1    dW0iS32[0],[pTwiddleTmp],step1
        VLD1    dW1iS32[0],[pTwiddleTmp]!
        SUB     pSrc,pSrc,step
        
        SUB     pTwiddleTmp,pTwiddleTmp,step1
        VREV32  dX1r,dX1r
        VREV32  dX1i,dX1i
        SUBS    size,size,#4
                        
        VHSUB   dT2,dX0r,dX1r                          @ a-c
        VHADD   dT3,dX0i,dX1i                          @ b+d
        SUB     step1,step1,#4
        VHADD   dT0,dX0r,dX1r                          @ a+c
        VHSUB   dT1,dX0i,dX1i                          @ b-d
        
        VTRN    dW1r,dW1i
        VTRN    dW0r,dW0i
                                
        VMULL   qT0,dW1r,dT2
        VMLSL   qT0,dW1i,dT3
        VMULL   qT1,dW1r,dT3
        VMLAL   qT1,dW1i,dT2
        VMULL   qT2,dW0r,dT2
        VMLAL   qT2,dW0i,dT3
        VMULL   qT3,dW0r,dT3
        VMLSL   qT3,dW0i,dT2
        
        VRSHRN  dX1r,qT0,#15
        VRSHRN  dX1i,qT1,#15
        
        .ifeqs  "\scaled", "TRUE"
            VHADD    dY1r,dT0,dX1i                     @ F(N/2 -1)
            VHSUB    dY1i,dX1r,dT1
        .else
            VADD    dY1r,dT0,dX1i                      @ F(N/2 -1)
            VSUB    dY1i,dX1r,dT1
        .endif
        
        VREV32  dY1r,dY1r
        VREV32  dY1i,dY1i
                            
        VRSHRN  dX0r,qT2,#15
        VRSHRN  dX0i,qT3,#15
        
        .ifeqs  "\scaled", "TRUE"
            VHADD    dY0r,dT0,dX0i                     @ F(1)
            VHSUB    dY0i,dT1,dX0r
        .else
            VADD    dY0r,dT0,dX0i                      @ F(1)
            VSUB    dY0i,dT1,dX0r
        .endif
        
        VST2    {dY0r[0],dY0i[0]},[pOut1]!
        VST2    {dY0r[1],dY0i[1]},[pOut1],step
        SUB     pOut1, #4
        VST2    {dY1r[0],dY1i[0]},[pOut1]!
        VST2    {dY1r[1],dY1i[1]},[pOut1]!
        SUB     pOut1,pOut1,step
        SUB     pSrc,pSrc,#4           @ set both the ptrs to the last element
        SUB     pOut1,pOut1,#4
        
        @ Last element can be expanded as follows
        @ 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (W^k is stored as -ve)
        @ 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
        @ 1/2[2a+j0] - j (c-jd) [0+j2b]
        @ (a+bc, -bd)
        @ Since (c,d) = (0,1) for the last element, result is just (a,-b)
        
lastElement\name:      
        VLD1    dX0rS32[0],[pSrc]
        
        .ifeqs  "\scaled", "TRUE"
            VSHR    dX0r,dX0r,#1
        .endif
        
        VST1    dX0r[0],[pOut1]!
        VNEG    dX0r,dX0r
        VST1    dX0r[1],[pOut1]

decrementScale\name:          
        .ifeqs  "\scaled", "TRUE"
            SUB scale,scale,#1
        .endif
        
        .endm
        
        M_START armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe,r4
        FFTSTAGE "FALSE","TRUE",Inv
        M_END
        
        M_START armSP_FFTInv_CCSToR_S16_Sfs_preTwiddleRadix2_unsafe,r4
        FFTSTAGE "TRUE","TRUE",InvSfs
        M_END

        
        .end