summaryrefslogtreecommitdiff
path: root/dl/sp/src/arm/arm64
diff options
context:
space:
mode:
Diffstat (limited to 'dl/sp/src/arm/arm64')
-rw-r--r--dl/sp/src/arm/arm64/ComplexToRealFixup.S261
-rw-r--r--dl/sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S280
-rw-r--r--dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_fs_s.S136
-rw-r--r--dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_ls_s.S149
-rw-r--r--dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_s.S185
-rw-r--r--dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_fs_s.S266
-rw-r--r--dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_ls_s.S371
-rw-r--r--dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_s.S339
-rw-r--r--dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S473
-rw-r--r--dl/sp/src/arm/arm64/omxSP_FFTFwd_CToC_FC32.c190
-rw-r--r--dl/sp/src/arm/arm64/omxSP_FFTFwd_RToCCS_F32.c213
-rw-r--r--dl/sp/src/arm/arm64/omxSP_FFTInv_CCSToR_F32.c259
-rw-r--r--dl/sp/src/arm/arm64/omxSP_FFTInv_CToC_FC32.c214
13 files changed, 3336 insertions, 0 deletions
diff --git a/dl/sp/src/arm/arm64/ComplexToRealFixup.S b/dl/sp/src/arm/arm64/ComplexToRealFixup.S
new file mode 100644
index 0000000..9b30093
--- /dev/null
+++ b/dl/sp/src/arm/arm64/ComplexToRealFixup.S
@@ -0,0 +1,261 @@
+//
+// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+//
+// This is a modification of omxSP_FFTFwd_RToCCS_S32_Sfs_s.s
+// to support float instead of SC32.
+//
+
+//
+// Description:
+// Compute FFT for a real signal
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+
+// Import symbols required from other files
+// (For example tables)
+
+// Set debugging level
+//DEBUG_ON SETL {TRUE}
+
+
+
+// Guarding implementation by the processor name
+
+
+
+ // Guarding implementation by the processor name
+
+// Import symbols required from other files
+
+
+//Input Registers
+
+#define pSrc x0
+#define pDst x1
+#define pTwiddle x2
+#define pOut x3
+#define subFFTNum x4
+
+// Output registers
+
+//Local Scratch Registers
+
+#define argTwiddle x5
+#define argDst x6
+#define subFFTSize x7
+#define N subFFTNum
+#define order x14
+#define step x8
+#define step1 pTwiddle
+#define twStep x9
+#define zero w10
+#define pTwiddleTmp pOut
+
+// Neon registers
+
+#define dX0 v0.2s
+#define dX0s v0.s
+#define dX0r v2.2s
+#define dX0rs v2.s
+#define dX0i v3.2s
+#define dX0is v3.s
+#define dX1r v4.2s
+#define dX1i v5.2s
+#define dT0 v6.2s
+#define dT1 v7.2s
+#define dT2 v8.2s
+#define dT3 v9.2s
+#define qT0 v10.2s
+#define qT1 v12.2s
+#define dW0r v14.2s
+#define dW0r8b v14.8b
+#define dW0i v15.2s
+#define dW1r v16.2s
+#define dW1r8b v16.8b
+#define dW1i v17.2s
+#define dY0r v14.2s
+#define dY0i v15.2s
+#define dY1r v16.2s
+#define dY1i v17.2s
+#define qT2 v18.2s
+#define qT3 v20.2s
+
+#define half v0.2s
+#define dZip v21.2s
+#define dZip8b v21.8b
+
+ // Allocate stack memory required by the function
+
+ // Write function header
+ M_START ComplexToRealFixup,,d15
+
+ asr N, N, #1
+
+ clz order, subFFTNum // N = 2^order
+
+ RSB order,order,#63
+ MOV subFFTSize,subFFTNum // subFFTSize = N/2
+ //MOV subFFTNum,N
+ mov argDst, pDst
+ mov argTwiddle, pTwiddle
+
+ // F(0) = 1/2[Z(0) + Z'(0)] - j [Z(0) - Z'(0)]
+ // 1/2[(a+jb) + (a-jb)] - j [(a+jb) - (a-jb)]
+ // 1/2[2a+j0] - j [0+j2b]
+ // (a+b, 0)
+
+ // F(N/2) = 1/2[Z(0) + Z'(0)] + j [Z(0) - Z'(0)]
+ // 1/2[(a+jb) + (a-jb)] + j [(a+jb) - (a-jb)]
+ // 1/2[2a+j0] + j [0+j2b]
+ // (a-b, 0)
+
+ // F(0) and F(N/2)
+ ld2 {dX0rs,dX0is}[0],[pSrc], #8
+ MOV zero,#0
+ mov dX0rs[1],zero
+ lsl step,subFFTSize, #3 // step = N/2 * 8 bytes
+ mov dX0i[1],zero
+ // twStep = 3N/8 * 8 bytes pointing to W^1
+ SUB twStep,step,subFFTSize,LSL #1
+
+ fadd dY0r,dX0r,dX0i // F(0) = ((Z0.r+Z0.i) , 0)
+ lsl step1,subFFTSize, #2 // step1 = N/2 * 4 bytes
+ fsub dY0i,dX0r,dX0i // F(N/2) = ((Z0.r-Z0.i) , 0)
+ SUBS subFFTSize,subFFTSize,#2
+
+ st1 {dY0r},[argDst],step
+ ADD pTwiddleTmp,argTwiddle,#8 // W^2
+ st1 {dY0i},[argDst], #8
+ ADD argTwiddle,argTwiddle,twStep // W^1
+
+// dup dzero,zero
+ SUB argDst,argDst,step
+
+ BLT End
+ BEQ lastElement
+ SUB step,step,#24
+ SUB step1,step1,#8 // (N/4-1)*8 bytes
+
+ // F(k) = 1/2[Z(k) + Z'(N/2-k)] -j*W^(k) [Z(k) - Z'(N/2-k)]
+ // Note: W^k is stored as negative values in the table
+ // Process 4 elements at a time. E.g: F(1),F(2) and F(N/2-2),F(N/2-1)
+ // since both of them require Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
+
+ fmov half, #0.5
+
+evenOddButterflyLoop:
+
+
+ ld1 {dW0r},[argTwiddle],step1
+ ld1 {dW1r},[argTwiddle], #8
+
+ ld2 {dX0r,dX0i},[pSrc],step
+ SUB argTwiddle,argTwiddle,step1
+ ld2 {dX1r,dX1i},[pSrc], #16
+
+
+
+ SUB step1,step1,#8 // (N/4-2)*8 bytes
+ ld1 {dW0i},[pTwiddleTmp],step1
+ ld1 {dW1i},[pTwiddleTmp], #8
+ SUB pSrc,pSrc,step
+
+ SUB pTwiddleTmp,pTwiddleTmp,step1
+ rev64 dX1r,dX1r
+ rev64 dX1i,dX1i
+ SUBS subFFTSize,subFFTSize,#4
+
+
+
+ fsub dT2,dX0r,dX1r // a-c
+ SUB step1,step1,#8
+ fadd dT0,dX0r,dX1r // a+c
+ fsub dT1,dX0i,dX1i // b-d
+ fadd dT3,dX0i,dX1i // b+d
+ fmul dT0,dT0,half[0]
+ fmul dT1,dT1,half[0]
+ // VZIP dW1r,dW1i
+ // VZIP dW0r,dW0i
+ zip1 dZip, dW1r, dW1i
+ zip2 dW1i, dW1r, dW1i
+ mov dW1r8b, dZip8b
+ zip1 dZip, dW0r, dW0i
+ zip2 dW0i, dW0r, dW0i
+ mov dW0r8b, dZip8b
+
+ fmul qT0,dW1r,dT2
+ fmul qT1,dW1r,dT3
+ fmul qT2,dW0r,dT2
+ fmul qT3,dW0r,dT3
+
+ fmla qT0,dW1i,dT3
+ fmls qT1,dW1i,dT2
+
+ fmls qT2,dW0i,dT3
+ fmla qT3,dW0i,dT2
+
+
+ fmul dX1r,qT0,half[0]
+ fmul dX1i,qT1,half[0]
+
+ fsub dY1r,dT0,dX1i // F(N/2 -1)
+ fadd dY1i,dT1,dX1r
+ fneg dY1i,dY1i
+
+ rev64 dY1r,dY1r
+ rev64 dY1i,dY1i
+
+
+ fmul dX0r,qT2,half[0]
+ fmul dX0i,qT3,half[0]
+
+ fsub dY0r,dT0,dX0i // F(1)
+ fadd dY0i,dT1,dX0r
+
+
+ st2 {dY0r,dY0i},[argDst],step
+ st2 {dY1r,dY1i},[argDst], #16
+ SUB argDst,argDst,step
+ SUB step,step,#32 // (N/2-4)*8 bytes
+
+
+ BGT evenOddButterflyLoop
+
+ // set both the ptrs to the last element
+ SUB pSrc,pSrc,#8
+ SUB argDst,argDst,#8
+
+
+
+ // Last element can be expanded as follows
+ // 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)]
+ // 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)]
+ // 1/2[2a+j0] + j (c+jd) [0+j2b]
+ // (a-bc, -bd)
+ // Since (c,d) = (0,1) for the last element, result is just (a,-b)
+
+lastElement:
+ ld1 {dX0r},[pSrc]
+
+ st1 {dX0rs}[0],[argDst], #4
+ fneg dX0r,dX0r
+ st1 {dX0rs}[1],[argDst], #4
+End:
+
+ // Write function tail
+ M_END
+
+ .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S b/dl/sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S
new file mode 100644
index 0000000..da68314
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S
@@ -0,0 +1,280 @@
+//
+// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+//
+// This is a modification of
+// armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.s to support float
+// instead of SC32.
+//
+
+//
+// Description:
+// Compute the "preTwiddleRadix2" stage prior to the call to the complexFFT
+// It does a Z(k) = Feven(k) + jW^(-k) FOdd(k); k=0,1,2,...N/2-1 computation
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+
+// Import symbols required from other files
+// (For example tables)
+
+
+// Set debugging level
+//DEBUG_ON SETL {TRUE}
+
+
+
+// Guarding implementation by the processor name
+
+
+
+ // Guarding implementation by the processor name
+
+
+
+//Input Registers
+
+#define pSrc x0
+#define pTwiddle x1
+#define pOut x2
+#define subFFTNum x3
+
+// Output registers
+
+//Local Scratch Registers
+
+#define argTwiddle x5
+#define argDst x6
+#define subFFTSize x7
+#define N subFFTNum
+
+#define pOut1 x13
+
+#define size x7
+#define step x8
+#define step1 x9
+#define twStep x10
+#define pTwiddleTmp x11
+#define argTwiddle1 x12
+
+// Neon registers
+
+#define dX0 v0.2s
+#define dX0s v0.s
+#define dShift v1.2s
+#define dX1 v1.2s
+#define dX1s v1.s
+#define dY0 v2.2s
+#define dY08b v2.8b
+#define dY1 v3.2s
+#define dX0r v0.2s
+#define dX0rs v0.s
+#define dX0i v1.2s
+#define dX1r v2.2s
+#define dX1i v3.2s
+#define dW0r v4.2s
+#define dW0r8b v4.8b
+#define dW0i v5.2s
+#define dW1r v6.2s
+#define dW1r8b v6.8b
+#define dW1i v7.2s
+#define dT0 v8.2s
+#define dT1 v9.2s
+#define dT2 v10.2s
+#define dT3 v11.2s
+#define qT0 v12.2s
+#define qT1 v14.2s
+#define qT2 v16.2s
+#define qT3 v18.2s
+#define dY0r v4.2s
+#define dY0i v5.2s
+#define dY1r v6.2s
+#define dY1i v7.2s
+
+#define dY2 v4.2s
+#define dY3 v5.2s
+#define dW0 v6.2s
+#define dW1 v7.2s
+#define dW0Tmp v10.2s
+#define dW1Neg v11.2s
+
+#define dZip v19.2s
+#define dZip8b v19.8b
+#define half v13.2s
+
+ .MACRO FFTSTAGE scaled, inverse, name
+
+ fmov half, 0.5
+
+ asr size, subFFTNum, #1 // preserve the contents of N = subFFTNum
+ lsl step, subFFTNum, #2 // step = N/2 * 8 bytes
+
+
+ // Z(k) = 1/2 {[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]}
+ // Note: W^(k) is stored as negated value and also need to
+ // conjugate the values from the table
+
+ // Z(0) : no need of twiddle multiply
+ // Z(0) = 1/2 { [F(0) + F'(N/2)] +j [F(0) - F'(N/2)] }
+
+ ld1 {dX0},[pSrc],step
+ ADD pOut1,pOut,step // pOut1 = pOut+ N/2*8 bytes
+
+ ld1 {dX1},[pSrc], #8
+ // twStep = 3N/8 * 8 bytes pointing to W^1
+ SUB twStep,step,size,LSL #1
+
+ lsl step1,size, #2 // step1 = N/4 * 8 = N/2*4 bytes
+ SUB step1,step1,#8 // (N/4-1)*8 bytes
+
+ fadd dY0,dX0,dX1 // [b+d | a+c]
+ fsub dY1,dX0,dX1 // [b-d | a-c]
+ fmul dY0, dY0, half[0]
+ fmul dY1, dY1, half[0]
+
+ // dY0= [a-c | a+c] ;dY1= [b-d | b+d]
+ // VZIP dY0,dY1
+ zip1 dZip,dY0,dY1
+ zip2 dY1,dY0,dY1
+ mov dY08b, dZip8b
+
+ fsub dX0,dY0,dY1
+ SUBS size,size,#2
+ fadd dX1,dY0,dY1
+
+ SUB pSrc,pSrc,step
+
+ st1 {dX0s}[0],[pOut1], #4
+ ADD pTwiddleTmp,pTwiddle,#8 // W^2
+ st1 {dX1s}[1],[pOut1], #4
+ ADD argTwiddle1,pTwiddle,twStep // W^1
+
+
+ BLT decrementScale\name
+ BEQ lastElement\name
+
+
+ // Z(k) = 1/2[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]
+ // Note: W^k is stored as negative values in the table and also
+ // need to conjugate the values from the table.
+ //
+ // Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
+ // since both of them require F(1),F(2) and F(N/2-2),F(N/2-1)
+
+
+ SUB step,step,#24
+evenOddButterflyLoop\name :
+
+
+ ld1 {dW0r},[argTwiddle1],step1
+ ld1 {dW1r},[argTwiddle1], #8
+
+ ld2 {dX0r,dX0i},[pSrc],step
+ SUB argTwiddle1,argTwiddle1,step1
+ ld2 {dX1r,dX1i},[pSrc], #16
+
+ SUB step1,step1,#8 // (N/4-2)*8 bytes
+ ld1 {dW0i},[pTwiddleTmp],step1
+ ld1 {dW1i},[pTwiddleTmp], #8
+ SUB pSrc,pSrc,step
+
+ SUB pTwiddleTmp,pTwiddleTmp,step1
+ rev64 dX1r,dX1r
+ rev64 dX1i,dX1i
+ SUBS size,size,#4
+
+
+ fsub dT2,dX0r,dX1r // a-c
+ fadd dT3,dX0i,dX1i // b+d
+ fadd dT0,dX0r,dX1r // a+c
+ fsub dT1,dX0i,dX1i // b-d
+ SUB step1,step1,#8
+
+ fmul dT2, dT2, half[0]
+ fmul dT3, dT3, half[0]
+
+ fmul dT0, dT0, half[0]
+ fmul dT1, dT1, half[0]
+
+ // VZIP dW1r,dW1i
+ // VZIP dW0r,dW0i
+ zip1 dZip, dW1r,dW1i
+ zip2 dW1i,dW1r,dW1i
+ mov dW1r8b, dZip8b
+ zip1 dZip,dW0r,dW0i
+ zip2 dW0i,dW0r,dW0i
+ mov dW0r8b, dZip8b
+
+ fmul dX1r,dW1r,dT2
+ fmul dX1i,dW1r,dT3
+ fmul dX0r,dW0r,dT2
+ fmul dX0i,dW0r,dT3
+
+ fmls dX1r,dW1i,dT3
+ fmla dX1i,dW1i,dT2
+
+ fmla dX0r,dW0i,dT3
+ fmls dX0i,dW0i,dT2
+
+
+ fadd dY1r,dT0,dX1i // F(N/2 -1)
+ fsub dY1i,dX1r,dT1
+
+ rev64 dY1r,dY1r
+ rev64 dY1i,dY1i
+
+
+ fadd dY0r,dT0,dX0i // F(1)
+ fsub dY0i,dT1,dX0r
+
+
+ st2 {dY0r,dY0i},[pOut1],step
+ st2 {dY1r,dY1i},[pOut1], #16
+ SUB pOut1,pOut1,step
+ SUB step,step,#32 // (N/2-4)*8 bytes
+
+
+ BGT evenOddButterflyLoop\name
+
+
+ // set both the ptrs to the last element
+ SUB pSrc,pSrc,#8
+ SUB pOut1,pOut1,#8
+
+ // Last element can be expanded as follows
+ // 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (since W^k is stored as
+ // -ve)
+ // 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
+ // 1/2[2a+j0] - j (c-jd) [0+j2b]
+ // (a+bc, -bd)
+ // Since (c,d) = (0,1) for the last element, result is just (a,-b)
+
+lastElement\name :
+ ld1 {dX0r},[pSrc]
+
+ st1 {dX0rs}[0],[pOut1], #4
+ fneg dX0r,dX0r
+ st1 {dX0rs}[1],[pOut1]
+
+
+
+decrementScale\name :
+
+ .endm
+
+ M_START armSP_FFTInv_CCSToR_F32_preTwiddleRadix2,,d15
+ FFTSTAGE "FALSE","TRUE",Inv
+ M_END
+
+ .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_fs_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_fs_s.S
new file mode 100644
index 0000000..b22912d
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_fs_s.S
@@ -0,0 +1,136 @@
+//
+// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+//
+// This is a modification of armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S
+// to support float instead of SC32.
+//
+
+//
+// Description:
+// Compute the first stage of a Radix 2 DIT in-order out-of-place FFT
+// stage for a N point complex signal.
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+
+// Import symbols required from other files
+// (For example tables)
+
+
+
+
+// Set debugging level
+//DEBUG_ON SETL {TRUE}
+
+
+
+// Guarding implementation by the processor name
+
+
+
+// Guarding implementation by the processor name
+
+
+//Input Registers
+
+#define pSrc x0
+#define pDst x1
+#define pTwiddle x2
+#define pSubFFTNum x3
+#define pSubFFTSize x4
+
+
+//Output Registers
+
+
+//Local Scratch Registers
+
+#define subFFTNum x5
+#define subFFTSize x6
+#define pointStep x7
+#define outPointStep x7
+#define grpSize x8
+#define setCount x8
+#define step x9
+#define dstStep x9
+
+// Neon Registers
+#define dX0 v0.2s
+#define dX1 v1.2s
+#define dY0 v2.2s
+#define dY1 v3.2s
+
+ .MACRO FFTSTAGE scaled, inverse, name
+
+ // Define stack arguments
+
+ // Move args values into our work registers
+ ldr subFFTNum, [pSubFFTNum]
+ ldr subFFTSize, [pSubFFTSize]
+
+ // update subFFTSize and subFFTNum into RN6 and RN7 for the next stage
+
+
+ MOV subFFTSize,#2
+ LSR grpSize,subFFTNum,#1
+ MOV subFFTNum,grpSize
+
+
+ // pT0+1 increments pT0 by 8 bytes
+ // pT0+pointStep = increment of 8*pointStep bytes = 4*grpSize bytes
+ // Note: outPointStep = pointStep for firststage
+ // Note: setCount = grpSize/2 (reuse the updated grpSize for setCount)
+
+ lsl pointStep, grpSize, #3
+ rsb step, pointStep, #8
+
+ // Loop on the sets for grp zero
+
+grpZeroSetLoop\name :
+
+ LD1 {dX0},[pSrc],pointStep
+ LD1 {dX1},[pSrc],step // step = -pointStep + 8
+
+ SUBS setCount,setCount,#1
+
+ fadd dY0,dX0,dX1
+ fsub dY1,dX0,dX1
+
+ ST1 {dY0},[pDst],outPointStep
+ // dstStep = step = -pointStep + 8
+ ST1 {dY1},[pDst],dstStep
+
+ BGT grpZeroSetLoop\name
+
+
+ // Save subFFTNum and subFFTSize for next stage
+ str subFFTNum, [pSubFFTNum]
+ str subFFTSize, [pSubFFTSize]
+
+ .endm
+
+
+
+ M_START armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace
+ FFTSTAGE "FALSE","FALSE",fwd
+ M_END
+
+
+
+ M_START armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace
+ FFTSTAGE "FALSE","TRUE",inv
+ M_END
+
+ .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_ls_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_ls_s.S
new file mode 100644
index 0000000..e7de11e
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_ls_s.S
@@ -0,0 +1,149 @@
+//
+// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+//
+// This is a modification of armSP_FFT_CToC_SC32_Radix2_ls_unsafe_s.S
+// to support float instead of SC32.
+//
+
+//
+// Description:
+// Compute the last stage of a Radix 2 DIT in-order out-of-place FFT
+// stage for a N point complex signal.
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+
+// Import symbols required from other files
+// (For example tables)
+
+
+
+
+// Set debugging level
+//DEBUG_ON SETL {TRUE}
+
+
+// Guarding implementation by the processor name
+
+
+//Input Registers
+
+#define pSrc x0
+#define pDst x1
+#define pTwiddle x2
+#define pSubFFTNum x3
+#define pSubFFTSize x4
+
+
+//Output Registers
+
+
+//Local Scratch Registers
+
+
+#define subFFTNum x5
+#define subFFTSize x6
+#define outPointStep x8
+#define grpCount x9
+#define dstStep x10
+
+// Neon Registers
+
+#define dWr v0.2s
+#define dWi v1.2s
+#define dXr0 v2.2s
+#define dXi0 v3.2s
+#define dXr1 v4.2s
+#define dXi1 v5.2s
+#define dYr0 v6.2s
+#define dYi0 v7.2s
+#define dYr1 v8.2s
+#define dYi1 v9.2s
+#define qT0 v10.2s
+#define qT1 v12.2s
+
+ .MACRO FFTSTAGE scaled, inverse, name
+
+ // Move parameters into our work registers
+ ldr subFFTSize, [pSubFFTSize]
+
+ lsl outPointStep, subFFTSize, #3
+
+ // Update grpCount and grpSize rightaway
+
+ MOV subFFTNum,#1 //after the last stage
+ LSL grpCount,subFFTSize,#1
+
+ // update subFFTSize for the next stage
+ MOV subFFTSize,grpCount
+
+ rsb dstStep,outPointStep,#16
+
+ // Loop on 2 grps at a time for the last stage
+
+radix2lsGrpLoop\name :
+ // dWr = [pTwiddle[0].Re, pTwiddle[1].Re]
+ // dWi = [pTwiddle[0].Im, pTwiddle[1].Im]
+ ld2 {dWr,dWi},[pTwiddle], #16
+
+ // dXr0 = [pSrc[0].Re, pSrc[2].Re]
+ // dXi0 = [pSrc[0].Im, pSrc[2].Im]
+ // dXr1 = [pSrc[1].Re, pSrc[3].Re]
+ // dXi1 = [pSrc[1].Im, pSrc[3].Im]
+ ld4 {dXr0,dXi0,dXr1,dXi1}, [pSrc], #32
+
+ SUBS grpCount,grpCount,#4 // grpCount is multiplied by 2
+
+ .ifeqs "\inverse", "TRUE"
+ fmul qT0,dWr,dXr1
+ fmla qT0,dWi,dXi1 // real part
+ fmul qT1,dWr,dXi1
+ fmls qT1,dWi,dXr1 // imag part
+
+ .else
+
+ fmul qT0,dWr,dXr1
+ fmls qT0,dWi,dXi1 // real part
+ fmul qT1,dWr,dXi1
+ fmla qT1,dWi,dXr1 // imag part
+
+ .endif
+
+ fsub dYr0,dXr0,qT0
+ fsub dYi0,dXi0,qT1
+ fadd dYr1,dXr0,qT0
+ fadd dYi1,dXi0,qT1
+
+ st2 {dYr0,dYi0},[pDst],outPointStep
+ st2 {dYr1,dYi1},[pDst],dstStep // dstStep = step = -outPointStep + 16
+
+ BGT radix2lsGrpLoop\name
+
+
+ .endm
+
+
+
+ M_START armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace,,d12
+ FFTSTAGE "FALSE","FALSE",fwd
+ M_END
+
+
+
+ M_START armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace,,d12
+ FFTSTAGE "FALSE","TRUE",inv
+ M_END
+
+ .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_s.S
new file mode 100644
index 0000000..530a815
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_s.S
@@ -0,0 +1,185 @@
+//
+// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+//
+// This is a modification of armSP_FFT_CToC_SC32_Radix2_unsafe_s.s
+// to support float instead of SC32.
+//
+
+// Description:
+// Compute a Radix 2 DIT in-order out-of-place FFT stage for an N point
+// complex signal. This handles the general stage, not the first or last
+// stage.
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+
+// Import symbols required from other files
+// (For example tables)
+
+
+
+// Set debugging level
+//DEBUG_ON SETL {TRUE}
+
+
+
+// Guarding implementation by the processor name
+
+
+
+
+// Guarding implementation by the processor name
+
+//Input Registers
+
+#define pSrc x0
+#define pDst x1
+#define pTwiddle x2
+#define pSubFFTNum x3
+#define pSubFFTSize x4
+
+
+//Output Registers
+
+
+//Local Scratch Registers
+
+#define subFFTNum x5
+#define subFFTSize x6
+#define outPointStep x8
+#define pointStep x9
+#define pointStep32 w9
+#define grpCount x10
+#define grpCount32 w10
+#define setCount x13
+#define step x15
+#define dstStep x11
+
+// Neon Registers
+
+#define dW v0.2s
+#define dX0 v2.2s
+#define dX1 v3.2s
+#define dX2 v4.2s
+#define dX3 v5.2s
+#define dY0 v6.2s
+#define dY1 v7.2s
+#define dY2 v8.2s
+#define dY3 v9.2s
+#define qT0 v10.2s
+#define qT1 v11.2s
+
+ .MACRO FFTSTAGE scaled, inverse, name
+
+ // Define stack arguments
+
+ // Move args values into our work registers
+ ldr subFFTNum, [pSubFFTNum]
+ ldr subFFTSize, [pSubFFTSize]
+
+ // Update grpCount and grpSize rightaway inorder to reuse pGrpCount
+ // and pGrpSize regs
+
+ LSR subFFTNum,subFFTNum,#1 //grpSize
+ LSL grpCount,subFFTSize,#1
+
+
+ // pT0+1 increments pT0 by 8 bytes
+ // pT0+pointStep = increment of 8*pointStep bytes = 4*grpSize bytes
+ lsl pointStep, subFFTNum, #2
+
+ // update subFFTSize for the next stage
+ MOV subFFTSize,grpCount
+
+ // pOut0+1 increments pOut0 by 8 bytes
+ // pOut0+outPointStep == increment of 8*outPointStep bytes =
+ // 4*size bytes
+ smull outPointStep, grpCount32, pointStep32
+
+ LSL pointStep,pointStep,#1
+
+
+ rsb step,pointStep,#16
+ rsb dstStep,outPointStep,#16
+
+ // Loop on the groups
+
+radix2GrpLoop\name :
+ lsr setCount, pointStep, #3
+ LD1 {dW},[pTwiddle],pointStep //[wi | wr]
+
+
+ // Loop on the sets
+
+
+radix2SetLoop\name :
+
+
+ // point0: dX0-real part dX1-img part
+ LD2 {dX0,dX1},[pSrc],pointStep
+ // point1: dX2-real part dX3-img part
+ LD2 {dX2,dX3},[pSrc],step
+
+ SUBS setCount,setCount,#2
+
+ .ifeqs "\inverse", "TRUE"
+ fmul qT0,dX2,dW[0]
+ fmla qT0,dX3,dW[1] // real part
+ fmul qT1,dX3,dW[0]
+ fmls qT1,dX2,dW[1] // imag part
+
+ .else
+
+ fmul qT0,dX2,dW[0]
+ fmls qT0,dX3,dW[1] // real part
+ fmul qT1,dX3,dW[0]
+ fmla qT1,dX2,dW[1] // imag part
+
+ .endif
+
+ fsub dY0,dX0,qT0
+ fsub dY1,dX1,qT1
+ fadd dY2,dX0,qT0
+ fadd dY3,dX1,qT1
+
+ st2 {dY0,dY1},[pDst],outPointStep
+ // dstStep = -outPointStep + 16
+ st2 {dY2,dY3},[pDst],dstStep
+
+ BGT radix2SetLoop\name
+
+ SUBS grpCount,grpCount,#2
+ ADD pSrc,pSrc,pointStep
+ BGT radix2GrpLoop\name
+
+
+ str subFFTNum, [pSubFFTNum]
+ str subFFTSize, [pSubFFTSize]
+ .endm
+
+
+
+ M_START armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace,,d11
+ FFTSTAGE "FALSE","FALSE",FWD
+ M_END
+
+
+
+ M_START armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace,,d11
+ FFTSTAGE "FALSE","TRUE",INV
+ M_END
+
+
+ .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_fs_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_fs_s.S
new file mode 100644
index 0000000..624ef3e
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_fs_s.S
@@ -0,0 +1,266 @@
+//
+// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+//
+//
+// This is a modification of armSP_FFT_CToC_SC32_Radix4_fs_unsafe_s.s
+// to support float instead of SC32.
+//
+
+//
+// Description:
+// Compute a first stage Radix 4 FFT stage for a N point complex signal
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+// Import symbols required from other files
+// (For example tables)
+
+
+
+
+// Set debugging level
+//DEBUG_ON SETL {TRUE}
+
+
+
+// Guarding implementation by the processor name
+
+
+
+// Guarding implementation by the processor name
+
+//Input Registers
+
+#define pSrc x0
+#define pDst x1
+#define pTwiddle x2
+#define pSubFFTNum x3
+#define pSubFFTSize x4
+
+
+//Output Registers
+
+
+//Local Scratch Registers
+
+#define subFFTNum x5
+#define subFFTSize x6
+#define grpSize x7
+// Reuse grpSize as setCount
+#define setCount x7
+#define pointStep x8
+#define outPointStep x8
+#define setStep x9
+#define step1 x10
+#define step3 x11
+
+// Neon Registers
+
+#define dXr0 v0.2s
+#define dXi0 v1.2s
+#define dXr1 v2.2s
+#define dXi1 v3.2s
+#define dXr2 v4.2s
+#define dXi2 v5.2s
+#define dXr3 v6.2s
+#define dXi3 v7.2s
+#define dYr0 v8.2s
+#define dYi0 v9.2s
+#define dYr1 v10.2s
+#define dYi1 v11.2s
+#define dYr2 v12.2s
+#define dYi2 v13.2s
+#define dYr3 v14.2s
+#define dYi3 v15.2s
+#define dZr0 v16.2s
+#define dZi0 v17.2s
+#define dZr1 v18.2s
+#define dZi1 v19.2s
+#define dZr2 v20.2s
+#define dZi2 v21.2s
+#define dZr3 v22.2s
+#define dZi3 v23.2s
+
+
+ .MACRO FFTSTAGE scaled, inverse, name
+
+ // Define stack arguments
+
+ // Move args values into our work registers
+ ldr subFFTNum, [pSubFFTNum]
+ ldr subFFTSize, [pSubFFTSize]
+
+ // pT0+1 increments pT0 by 8 bytes
+ // pT0+pointStep = increment of 8*pointStep bytes = 2*grpSize bytes
+ // Note: outPointStep = pointStep for firststage
+
+ lsl pointStep, subFFTNum, #1
+
+ // Update pSubFFTSize and pSubFFTNum regs
+ ld2 {dXr0,dXi0}, [pSrc], pointStep // data[0]
+
+ // subFFTSize = 1 for the first stage
+ MOV subFFTSize,#4
+
+ // Note: setCount = subFFTNum/4 (reuse the grpSize reg for setCount)
+ LSR grpSize,subFFTNum,#2
+ ld2 {dXr1,dXi1}, [pSrc], pointStep // data[1]
+ MOV subFFTNum,grpSize
+
+
+ // Calculate the step of input data for the next set
+ //MOV setStep,pointStep,LSL #1
+ lsl setStep, grpSize, #4
+ ld2 {dXr2,dXi2}, [pSrc], pointStep // data[2]
+
+ // setStep = 3*pointStep
+ ADD setStep,setStep,pointStep
+ // setStep = - 3*pointStep+16
+
+ rsb setStep,setStep,#16
+ // data[3] & update pSrc for the next set
+ ld2 {dXr3,dXi3}, [pSrc], setStep
+
+ // step1 = 2*pointStep
+ lsl step1, pointStep, #1
+
+ // fadd qY0, qX0, qX2
+ fadd dYr0, dXr0, dXr2
+ fadd dYi0, dXi0, dXi2
+ // step3 = -pointStep
+ neg step3, pointStep
+
+ // grp = 0 a special case since all the twiddle factors are 1
+ // Loop on the sets : 2 sets at a time
+
+radix4fsGrpZeroSetLoop\name :
+
+
+
+ // Decrement setcount
+ SUBS setCount,setCount,#2
+
+
+ // finish first stage of 4 point FFT
+
+
+ // fsub qy2,qx0,qx2
+ fsub dYr2, dXr0, dXr2
+ fsub dYi2, dXi0, dXi2
+
+ ld2 {dXr0,dXi0}, [pSrc], step1 // data[0]
+ // fadd qy1,qx1,qx3
+ fadd dYr1, dXr1, dXr3
+ fadd dYi1, dXi1, dXi3
+ ld2 {dXr2,dXi2}, [pSrc], step3 // data[2]
+ // fsub qy3,qx1,qx3
+ fsub dYr3, dXr1, dXr3
+ fsub dYi3, dXi1, dXi3
+
+
+ // finish second stage of 4 point FFT
+
+ .ifeqs "\inverse", "TRUE"
+
+ ld2 {dXr1,dXi1}, [pSrc], step1 // data[1]
+ // fadd qz0,qy0,qy1
+ fadd dZr0, dYr0, dYr1
+ fadd dZi0, dYi0, dYi1
+
+ // data[3] & update pSrc for the next set, but not if it's the
+ // last iteration so that we don't read past the end of the
+ // input array.
+ BEQ radix4SkipLastUpdateInv\name
+ ld2 {dXr3,dXi3}, [pSrc], setStep
+
+radix4SkipLastUpdateInv\name:
+ FSUB dZr3,dYr2,dYi3
+
+ st2 {dZr0,dZi0},[pDst],outPointStep
+ FADD dZi3,dYi2,dYr3
+
+ // fsub qZ1,qY0,qY1
+ FSUB dZr1, dYr0, dYr1
+ FSUB dZi1, dYi0, dYi1
+ st2 {dZr3,dZi3},[pDst],outPointStep
+
+ FADD dZr2,dYr2,dYi3
+ st2 {dZr1,dZi1},[pDst],outPointStep
+ FSUB dZi2,dYi2,dYr3
+
+ // fadd qY0, qX0, qX2
+ FADD dYr0, dXr0, dXr2 // u0 for next iteration
+ FADD dYi0, dXi0, dXi2
+ st2 {dZr2,dZi2},[pDst],setStep
+
+
+ .else
+
+ ld2 {dXr1,dXi1}, [pSrc], step1 // data[1]
+ // fadd qZ0,qY0,qY1
+ fadd dZr0, dYr0, dYr1
+ fadd dZi0, dYi0, dYi1
+
+ // data[3] & update pSrc for the next set, but not if it's the
+ // last iteration so that we don't read past the end of the
+ // input array.
+ BEQ radix4SkipLastUpdateFwd\name
+ ld2 {dXr3,dXi3}, [pSrc], setStep
+
+radix4SkipLastUpdateFwd\name:
+ FADD dZr2,dYr2,dYi3
+
+ st2 {dZr0,dZi0},[pDst],outPointStep
+ FSUB dZi2,dYi2,dYr3
+
+ // fsub qz1,qy0,qy1
+ fsub dZr1, dYr0, dYr1
+ fsub dZi1, dYi0, dYi1
+ st2 {dZr2,dZi2},[pDst],outPointStep
+
+ FSUB dZr3,dYr2,dYi3
+ st2 {dZr1,dZi1},[pDst],outPointStep
+ FADD dZi3,dYi2,dYr3
+
+ // fadd qy0,qx0,qx2
+ fadd dYr0, dXr0, dXr2 // u0 for next iteration
+ fadd dYi0, dXi0, dXi2
+
+ st2 {dZr3,dZi3},[pDst],setStep
+
+ .endif
+
+ BGT radix4fsGrpZeroSetLoop\name
+
+ // Save subFFTNum and subFFTSize for next stage
+ str subFFTNum, [pSubFFTNum]
+ str subFFTSize, [pSubFFTSize]
+
+ .endm
+
+
+
+ M_START armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace,,d15
+ FFTSTAGE "FALSE","FALSE",fwd
+ M_END
+
+
+
+ M_START armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace,,d15
+ FFTSTAGE "FALSE","TRUE",inv
+ M_END
+
+
+ .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_ls_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_ls_s.S
new file mode 100644
index 0000000..2fc2e60
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_ls_s.S
@@ -0,0 +1,371 @@
+//
+// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+//
+// This is a modification of armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.s
+// to support float instead of SC32.
+//
+
+//
+// Description:
+// Compute a Radix 4 FFT stage for a N point complex signal
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+// Import symbols required from other files
+// (For example tables)
+
+
+
+
+// Set debugging level
+//DEBUG_ON SETL {TRUE}
+
+
+// Guarding implementation by the processor name
+
+
+// Import symbols required from other files
+// (For example tables)
+ //IMPORT armAAC_constTable
+
+//Input Registers
+
+#define pSrc x0
+#define pDst x1
+#define pTwiddle x2
+#define pSubFFTNum x3
+#define pSubFFTSize x4
+
+
+
+//Output Registers
+
+
+//Local Scratch Registers
+
+#define subFFTNum x5
+#define subFFTSize x6
+#define outPointStep x8
+#define grpCount x9
+#define dstStep x10
+#define grpTwStep x13
+#define stepTwiddle x14
+#define twStep x15
+#define step16 x11
+#define step24 x12
+
+
+// Neon Registers
+
+#define dButterfly1Real02 v0.2s
+#define dButterfly1Real028b v0.8b
+#define dButterfly1Imag02 v1.2s
+#define dButterfly1Imag028b v1.8b
+#define dButterfly1Real13 v2.2s
+#define dButterfly1Real138b v2.8b
+#define dButterfly1Imag13 v3.2s
+#define dButterfly1Imag138b v3.8b
+#define dButterfly2Real02 v4.2s
+#define dButterfly2Imag02 v5.2s
+#define dButterfly2Real13 v6.2s
+#define dButterfly2Imag13 v7.2s
+#define dXr0 v0.2s
+#define dXi0 v1.2s
+#define dXr08b v0.8b
+#define dXi08b v1.8b
+#define dXr1 v2.2s
+#define dXi1 v3.2s
+#define dXr2 v4.2s
+#define dXi2 v5.2s
+#define dXr3 v6.2s
+#define dXi3 v7.2s
+
+#define dYr0 v16.2s
+#define dYi0 v17.2s
+#define dYr1 v18.2s
+#define dYi1 v19.2s
+#define dYr2 v20.2s
+#define dYi2 v21.2s
+#define dYr3 v22.2s
+#define dYi3 v23.2s
+
+#define dW1r v8.2s
+#define dW1i v9.2s
+#define dW2r v10.2s
+#define dW2r8b v10.8b
+#define dW2i v11.2s
+#define dW3r v12.2s
+#define dW3r8b v12.8b
+#define dW3i v13.2s
+
+#define dZr0 v14.2s
+#define dZi0 v15.2s
+#define dZr08b v14.8b
+#define dZi08b v15.8b
+#define dZr1 v26.2s
+#define dZi1 v27.2s
+#define dZr2 v28.2s
+#define dZi2 v29.2s
+#define dZr3 v30.2s
+#define dZi3 v31.2s
+
+#define dZip v24.2s
+#define dZip8b v24.8b
+
+ .MACRO FFTSTAGE scaled, inverse , name
+
+ // Define stack arguments
+
+ // Move args values into our work registers
+ ldr subFFTNum, [pSubFFTNum]
+ ldr subFFTSize, [pSubFFTSize]
+
+ // pOut0+1 increments pOut0 by 8 bytes
+ // pOut0+outPointStep == increment of 8*outPointStep bytes
+ lsl outPointStep,subFFTSize, #3
+
+ // Update grpCount and grpSize rightaway
+
+ ld2 {dW1r,dW1i},[pTwiddle] // [wi|wr]
+ MOV step16,#16
+ LSL grpCount,subFFTSize,#2
+
+ ld1 {dW2r},[pTwiddle] // [wi|wr]
+ MOV subFFTNum,#1 //after the last stage
+
+ ld1 {dW3r},[pTwiddle],step16 // [wi|wr]
+ MOV stepTwiddle,#0
+
+ ld1 {dW2i},[pTwiddle],#8 // [wi|wr]
+ SUB grpTwStep,stepTwiddle,#8 // grpTwStep = -8 to start with
+
+ // update subFFTSize for the next stage
+ MOV subFFTSize,grpCount
+ ld1 {dW3i},[pTwiddle],grpTwStep // [wi|wr]
+ lsl dstStep,outPointStep, #1
+
+ // AC.r AC.i BD.r BD.i
+ ld4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc], #32
+ ADD dstStep,dstStep,outPointStep // dstStep = 3*outPointStep
+
+ rsb dstStep,dstStep,#16 // dstStep = - 3*outPointStep+16
+ MOV step24,#24
+
+ // AC.r AC.i BD.r BD.i
+ ld4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc], #32
+
+
+ // Process two groups at a time
+
+radix4lsGrpLoop\name :
+
+ // VZIP dW2r,dW2i
+ zip1 dZip, dW2r, dW2i
+ zip2 dW2i, dW2r, dW2i
+ mov dW2r8b, dZip8b
+
+ ADD stepTwiddle,stepTwiddle,#16
+
+ // VZIP dW3r,dW3i
+ zip1 dZip, dW3r,dW3i
+ zip2 dW3i, dW3r, dW3i
+ mov dW3r8b, dZip8b
+ ADD grpTwStep,stepTwiddle,#4
+
+ // VUZP dButterfly1Real13, dButterfly2Real13 // B.r D.r
+ uzp1 dZip, dButterfly1Real13, dButterfly2Real13 // B.r D.r
+ uzp2 dButterfly2Real13, dButterfly1Real13, dButterfly2Real13 // B.r D.r
+ mov dButterfly1Real138b, dZip8b
+
+ SUB twStep,stepTwiddle,#16 // -16+stepTwiddle
+
+ // VUZP dButterfly1Imag13, dButterfly2Imag13 // B.i D.i
+ uzp1 dZip, dButterfly1Imag13, dButterfly2Imag13 // B.i D.i
+ uzp2 dButterfly2Imag13, dButterfly1Imag13, dButterfly2Imag13 // B.i D.i
+ mov dButterfly1Imag138b, dZip8b
+ lsl grpTwStep,grpTwStep,#1
+
+ // VUZP dButterfly1Real02, dButterfly2Real02 // A.r C.r
+ uzp1 dZip, dButterfly1Real02, dButterfly2Real02 // A.r C.r
+ uzp2 dButterfly2Real02, dButterfly1Real02, dButterfly2Real02 // A.r C.r
+ mov dButterfly1Real028b, dZip8b
+ rsb grpTwStep,grpTwStep,#0 // -8-2*stepTwiddle
+
+ // VUZP dButterfly1Imag02, dButterfly2Imag02 // A.i C.i
+ uzp1 dZip, dButterfly1Imag02, dButterfly2Imag02 // A.i C.i
+ uzp2 dButterfly2Imag02, dButterfly1Imag02, dButterfly2Imag02 // A.i C.i
+ mov dButterfly1Imag028b, dZip8b
+
+
+ // grpCount is multiplied by 4
+ SUBS grpCount,grpCount,#8
+
+ .ifeqs "\inverse", "TRUE"
+ fmul dZr1,dW1r,dXr1
+ fmla dZr1,dW1i,dXi1 // real part
+ fmul dZi1,dW1r,dXi1
+ fmls dZi1,dW1i,dXr1 // imag part
+
+ .else
+
+ fmul dZr1,dW1r,dXr1
+ fmls dZr1,dW1i,dXi1 // real part
+ fmul dZi1,dW1r,dXi1
+ fmla dZi1,dW1i,dXr1 // imag part
+
+ .endif
+
+ ld2 {dW1r,dW1i},[pTwiddle],stepTwiddle // [wi|wr]
+
+ .ifeqs "\inverse", "TRUE"
+ fmul dZr2,dW2r,dXr2
+ fmla dZr2,dW2i,dXi2 // real part
+ fmul dZi2,dW2r,dXi2
+ ld1 {dW2r},[pTwiddle],step16 // [wi|wr]
+ fmls dZi2,dW2i,dXr2 // imag part
+
+ .else
+
+ fmul dZr2,dW2r,dXr2
+ fmls dZr2,dW2i,dXi2 // real part
+ fmul dZi2,dW2r,dXi2
+ ld1 {dW2r},[pTwiddle],step16 // [wi|wr]
+ fmla dZi2,dW2i,dXr2 // imag part
+
+ .endif
+
+
+ ld1 {dW2i},[pTwiddle],twStep // [wi|wr]
+
+ // move qX0 so as to load for the next iteration
+ // MOV qZ0,qX0
+ mov dZr08b, dXr08b
+ mov dZi08b, dXi08b
+
+ .ifeqs "\inverse", "TRUE"
+ fmul dZr3,dW3r,dXr3
+ fmla dZr3,dW3i,dXi3 // real part
+ fmul dZi3,dW3r,dXi3
+ ld1 {dW3r},[pTwiddle],step24
+ fmls dZi3,dW3i,dXr3 // imag part
+
+ .else
+
+ fmul dZr3,dW3r,dXr3
+ fmls dZr3,dW3i,dXi3 // real part
+ fmul dZi3,dW3r,dXi3
+ ld1 {dW3r},[pTwiddle],step24
+ fmla dZi3,dW3i,dXr3 // imag part
+
+ .endif
+
+ ld1 {dW3i},[pTwiddle],grpTwStep // [wi|wr]
+
+ // Don't do the load on the last iteration so we don't read past the end
+ // of pSrc.
+ bne skipIncrement\name
+ add pSrc, pSrc, #64
+skipIncrement\name:
+ beq radix4lsSkipRead\name
+ // AC.r AC.i BD.r BD.i
+ ld4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc], #32
+
+ // AC.r AC.i BD.r BD.i
+ ld4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc], #32
+radix4lsSkipRead\name:
+
+ // finish first stage of 4 point FFT
+
+ // fadd qY0,qZ0,qZ2
+ fadd dYr0,dZr0,dZr2
+ fadd dYi0,dZi0,dZi2
+ // fsub qY2,qZ0,qZ2
+ fsub dYr2,dZr0,dZr2
+ fsub dYi2,dZi0,dZi2
+ // fadd qY1,qZ1,qZ3
+ fadd dYr1,dZr1,dZr3
+ fadd dYi1,dZi1,dZi3
+ // fsub qY3,qZ1,qZ3
+ fsub dYr3,dZr1,dZr3
+ fsub dYi3,dZi1,dZi3
+
+
+ // finish second stage of 4 point FFT
+
+ .ifeqs "\inverse", "TRUE"
+
+ // fsub qZ0,qY2,qY1
+ fsub dZr0,dYr2,dYr1
+ fsub dZi0,dYi2,dYi1
+ fadd dZr3,dYr0,dYi3
+ st2 {dZr0,dZi0},[pDst],outPointStep
+ fsub dZi3,dYi0,dYr3
+
+ // fadd qZ2,qY2,qY1
+ fadd dZr2,dYr2,dYr1
+ fadd dZi2,dYi2,dYi1
+
+ st2 {dZr3,dZi3},[pDst],outPointStep
+
+ fsub dZr1,dYr0,dYi3
+ st2 {dZr2,dZi2},[pDst],outPointStep
+ fadd dZi1,dYi0,dYr3
+
+ // dstStep = -outPointStep + 16
+ st2 {dZr1,dZi1},[pDst],dstStep
+
+
+ .else
+
+ // fsub qZ0,qY2,qY1
+ fsub dZr0,dYr2,dYr1
+ fsub dZi0,dYi2,dYi1
+
+ fsub dZr1,dYr0,dYi3
+ st2 {dZr0,dZi0},[pDst],outPointStep
+ fadd dZi1,dYi0,dYr3
+
+ // fadd qZ2,qY2,qY1
+ fadd dZr2,dYr2,dYr1
+ fadd dZi2,dYi2,dYi1
+
+ st2 {dZr1,dZi1},[pDst],outPointStep
+
+ fadd dZr3,dYr0,dYi3
+ st2 {dZr2,dZi2},[pDst],outPointStep
+ fsub dZi3,dYi0,dYr3
+
+ // dstStep = -outPointStep + 16
+ st2 {dZr3,dZi3},[pDst],dstStep
+
+
+ .endif
+
+ BGT radix4lsGrpLoop\name
+
+ .endm
+
+
+ M_START armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace,,d15
+ FFTSTAGE "FALSE","FALSE",fwd
+ M_END
+
+
+ M_START armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace,,d15
+ FFTSTAGE "FALSE","TRUE",inv
+ M_END
+
+
+ .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_s.S
new file mode 100644
index 0000000..830fd16
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_s.S
@@ -0,0 +1,339 @@
+//
+// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+//
+//
+// This is a modification of armSP_FFT_CToC_SC32_Radix4_unsafe_s.s
+// to support float instead of SC32.
+//
+
+//
+// Description:
+// Compute a Radix 4 FFT stage for a N point complex signal
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+
+// Import symbols required from other files
+// (For example tables)
+
+
+
+
+// Set debugging level
+//DEBUG_ON SETL {TRUE}
+
+
+
+// Guarding implementation by the processor name
+
+
+
+
+// Guarding implementation by the processor name
+
+
+// Import symbols required from other files
+// (For example tables)
+
+
+//Input Registers
+
+#define pSrc x0
+#define pDst x1
+#define pTwiddle x2
+#define pSubFFTNum x3
+#define pSubFFTSize x4
+
+
+
+//Output Registers
+
+
+//Local Scratch Registers
+
+#define subFFTNum x5
+#define subFFTSize x6
+#define grpCount x7
+#define grpCount32 w7
+#define pointStep x8
+#define pointStep32 w8
+#define outPointStep x9
+#define stepTwiddle x10
+#define setCount x11
+#define srcStep x12
+#define setStep x13
+#define dstStep x14
+#define twStep x15
+
+// Neon Registers
+
+#define dW1 v0.2s
+#define dW2 v1.2s
+#define dW3 v2.2s
+
+#define dXr0 v4.2s
+#define dXi0 v5.2s
+#define dXr1 v6.2s
+#define dXi1 v7.2s
+#define dXr2 v8.2s
+#define dXi2 v9.2s
+#define dXr3 v10.2s
+#define dXi3 v11.2s
+#define dYr0 v12.2s
+#define dYi0 v13.2s
+#define dYr1 v14.2s
+#define dYi1 v15.2s
+#define dYr2 v16.2s
+#define dYi2 v17.2s
+#define dYr3 v18.2s
+#define dYi3 v19.2s
+#define dZr0 v20.2s
+#define dZi0 v21.2s
+#define dZr1 v22.2s
+#define dZi1 v23.2s
+#define dZr2 v24.2s
+#define dZi2 v25.2s
+#define dZr3 v26.2s
+#define dZi3 v27.2s
+
+ .MACRO FFTSTAGE scaled, inverse , name
+
+ // Define stack arguments
+
+ // Move args values into our work registers
+ ldr subFFTNum, [pSubFFTNum]
+ ldr subFFTSize, [pSubFFTSize]
+
+ // Update grpCount and grpSize rightaway inorder to reuse
+ // pGrpCount and pGrpSize regs
+
+ LSL grpCount,subFFTSize,#2
+ LSR subFFTNum,subFFTNum,#2
+ MOV subFFTSize,grpCount
+
+ ld1 {dW1},[pTwiddle] //[wi | wr]
+ // pT0+1 increments pT0 by 8 bytes
+ // pT0+pointStep = increment of 8*pointStep bytes = 2*grpSize bytes
+ lsl pointStep,subFFTNum, #1
+
+ // pOut0+1 increments pOut0 by 8 bytes
+ // pOut0+outPointStep == increment of 8*outPointStep bytes
+ // = 2*size bytes
+
+ MOV stepTwiddle,#0
+ ld1 {dW2},[pTwiddle] //[wi | wr]
+ smull outPointStep,grpCount32,pointStep32
+
+ LSL pointStep,pointStep,#2 // 2*grpSize
+
+ ld1 {dW3},[pTwiddle] //[wi | wr]
+ lsl srcStep,pointStep, #1 // srcStep = 2*pointStep
+
+ ADD setStep,srcStep,pointStep // setStep = 3*pointStep
+
+ rsb setStep,setStep,#0 // setStep = - 3*pointStep
+ SUB srcStep,srcStep,#16 // srcStep = 2*pointStep-16
+
+ lsl dstStep,outPointStep, #1
+
+ ADD dstStep,dstStep,outPointStep // dstStep = 3*outPointStep
+ // dstStep = - 3*outPointStep+16
+ rsb dstStep,dstStep,#16
+
+
+radix4GrpLoop\name :
+
+ ld2 {dXr0,dXi0},[pSrc],pointStep // data[0]
+ ADD stepTwiddle,stepTwiddle,pointStep
+ ld2 {dXr1,dXi1},[pSrc],pointStep // data[1]
+ // set pTwiddle to the first point
+ ADD pTwiddle,pTwiddle,stepTwiddle
+ ld2 {dXr2,dXi2},[pSrc],pointStep // data[2]
+ lsl twStep,stepTwiddle, #2
+
+ // data[3] & update pSrc for the next set
+ ld2 {dXr3,dXi3},[pSrc],setStep
+ SUB twStep,stepTwiddle,twStep // twStep = -3*stepTwiddle
+
+ lsr setCount,pointStep, #3
+
+ // set pSrc to data[0] of the next set
+ ADD pSrc,pSrc,#16
+ // increment to data[1] of the next set
+ ADD pSrc,pSrc,pointStep
+
+
+ // Loop on the sets
+
+radix4SetLoop\name :
+
+
+
+ .ifeqs "\inverse", "TRUE"
+ fmul dZr1,dXr1,dW1[0]
+ fmul dZi1,dXi1,dW1[0]
+ fmul dZr2,dXr2,dW2[0]
+ fmul dZi2,dXi2,dW2[0]
+ fmul dZr3,dXr3,dW3[0]
+ fmul dZi3,dXi3,dW3[0]
+
+ fmla dZr1,dXi1,dW1[1] // real part
+ fmls dZi1,dXr1,dW1[1] // imag part
+
+ // data[1] for next iteration
+ ld2 {dXr1,dXi1},[pSrc],pointStep
+
+ fmla dZr2,dXi2,dW2[1] // real part
+ fmls dZi2,dXr2,dW2[1] // imag part
+
+ // data[2] for next iteration
+ ld2 {dXr2,dXi2},[pSrc],pointStep
+
+ fmla dZr3,dXi3,dW3[1] // real part
+ fmls dZi3,dXr3,dW3[1] // imag part
+ .else
+ fmul dZr1,dXr1,dW1[0]
+ fmul dZi1,dXi1,dW1[0]
+ fmul dZr2,dXr2,dW2[0]
+ fmul dZi2,dXi2,dW2[0]
+ fmul dZr3,dXr3,dW3[0]
+ fmul dZi3,dXi3,dW3[0]
+
+ fmls dZr1,dXi1,dW1[1] // real part
+ fmla dZi1,dXr1,dW1[1] // imag part
+
+ // data[1] for next iteration
+ ld2 {dXr1,dXi1},[pSrc],pointStep
+
+ fmls dZr2,dXi2,dW2[1] // real part
+ fmla dZi2,dXr2,dW2[1] // imag part
+
+ // data[2] for next iteration
+ ld2 {dXr2,dXi2},[pSrc],pointStep
+
+ fmls dZr3,dXi3,dW3[1] // real part
+ fmla dZi3,dXr3,dW3[1] // imag part
+ .endif
+
+ // data[3] & update pSrc to data[0]
+ // But don't read on the very last iteration because that reads past
+ // the end of pSrc. The last iteration is grpCount = 4, setCount = 2.
+ cmp grpCount, #4
+
+ b.ne skipUpdate\name
+ cmp setCount, #2
+ b.ne skipUpdate\name
+ add pSrc, pSrc, setStep
+ beq radix4SkipRead\name
+skipUpdate\name:
+ ld2 {dXr3,dXi3},[pSrc],setStep
+radix4SkipRead\name:
+
+ SUBS setCount,setCount,#2
+
+ // finish first stage of 4 point FFT
+ // fadd qY0,qX0,qZ2
+ // fsub qY2,qX0,qZ2
+ fadd dYr0,dXr0,dZr2
+ fsub dYr2,dXr0,dZr2
+ fadd dYi0,dXi0,dZi2
+ fsub dYi2,dXi0,dZi2
+
+ // data[0] for next iteration
+ ld2 {dXr0,dXi0},[pSrc], #16
+ // fadd qY1,qZ1,qZ3
+ // fsub qY3,qZ1,qZ3
+ fadd dYr1,dZr1,dZr3
+ fsub dYr3,dZr1,dZr3
+ fadd dYi1,dZi1,dZi3
+ fsub dYi3,dZi1,dZi3
+
+ // finish second stage of 4 point FFT
+
+ // fsub qZ0,qY2,qY1
+ fsub dZr0,dYr2,dYr1
+ fsub dZi0,dYi2,dYi1
+
+ .ifeqs "\inverse", "TRUE"
+
+ fadd dZr3,dYr0,dYi3
+ st2 {dZr0,dZi0},[pDst],outPointStep
+ fsub dZi3,dYi0,dYr3
+
+ // fadd qZ2,qY2,qY1
+ fadd dZr2,dYr2,dYr1
+ fadd dZi2,dYi2,dYi1
+
+ st2 {dZr3,dZi3},[pDst],outPointStep
+
+ fsub dZr1,dYr0,dYi3
+ st2 {dZr2,dZi2},[pDst],outPointStep
+ fadd dZi1,dYi0,dYr3
+
+ st2 {dZr1,dZi1},[pDst],dstStep
+
+
+ .else
+
+ fsub dZr1,dYr0,dYi3
+ st2 {dZr0,dZi0},[pDst],outPointStep
+ fadd dZi1,dYi0,dYr3
+
+ // fadd qZ2,qY2,qY1
+ fadd dZr2,dYr2,dYr1
+ fadd dZi2,dYi2,dYi1
+
+ st2 {dZr1,dZi1},[pDst],outPointStep
+
+ fadd dZr3,dYr0,dYi3
+ st2 {dZr2,dZi2},[pDst],outPointStep
+ fsub dZi3,dYi0,dYr3
+
+ st2 {dZr3,dZi3},[pDst],dstStep
+
+
+ .endif
+
+ // increment to data[1] of the next set
+ ADD pSrc,pSrc,pointStep
+ BGT radix4SetLoop\name
+
+
+ ld1 {dW1},[pTwiddle],stepTwiddle //[wi | wr]
+ // subtract 4 since grpCount multiplied by 4
+ SUBS grpCount,grpCount,#4
+ ld1 {dW2},[pTwiddle],stepTwiddle //[wi | wr]
+ // increment pSrc for the next grp
+ ADD pSrc,pSrc,srcStep
+ ld1 {dW3},[pTwiddle],twStep //[wi | wr]
+ BGT radix4GrpLoop\name
+
+ str subFFTNum, [pSubFFTNum]
+ str subFFTSize, [pSubFFTSize]
+
+ .endm
+
+
+ M_START armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace,,d15
+ FFTSTAGE "FALSE","FALSE",FWD
+ M_END
+
+
+ M_START armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace,,d15
+ FFTSTAGE "FALSE","TRUE",INV
+ M_END
+
+
+ .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S
new file mode 100644
index 0000000..f348e6a
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S
@@ -0,0 +1,473 @@
+//
+// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+//
+// This is a modification of armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.s
+// to support float instead of SC32.
+//
+
+//
+// Description:
+// Compute a first stage Radix 8 FFT stage for a N point complex signal
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+// Import symbols required from other files
+// (For example tables)
+
+
+// Set debugging level
+//DEBUG_ON SETL {TRUE}
+
+
+
+// Guarding implementation by the processor name
+
+
+
+
+// Guarding implementation by the processor name
+
+//Input Registers
+
+#define pSrc x0
+#define pDst x1
+#define pTwiddle x2
+#define pSubFFTNum x3
+#define pSubFFTSize x4
+
+
+//Output Registers
+
+
+//Local Scratch Registers
+
+#define subFFTNum x5
+#define subFFTSize x6
+#define grpSize x7
+// Reuse grpSize as setCount
+#define setCount x7
+#define pointStep x8
+#define outPointStep x8
+#define setStep x9
+#define step1 x10
+#define step2 x11
+#define t0 w12
+
+
+// Neon Registers
+
+#define dXr0 v0.2s
+#define dXi0 v1.2s
+#define dXr1 v2.2s
+#define dXi1 v3.2s
+#define dXr2 v4.2s
+#define dXi2 v5.2s
+#define dXr3 v6.2s
+#define dXi3 v7.2s
+#define dXr4 v8.2s
+#define dXi4 v9.2s
+#define dXr5 v10.2s
+#define dXi5 v11.2s
+#define dXr6 v12.2s
+#define dXi6 v13.2s
+#define dXr7 v14.2s
+#define dXi7 v15.2s
+#define qX0 v0.4s
+#define qX1 v1.4s
+#define qX2 v2.4s
+#define qX3 v3.4s
+#define qX4 v4.4s
+#define qX5 v5.4s
+#define qX6 v6.4s
+#define qX7 v7.4s
+
+#define dUr0 v16.2s
+#define dUi0 v17.2s
+#define dUr2 v18.2s
+#define dUi2 v19.2s
+#define dUr4 v20.2s
+#define dUi4 v21.2s
+#define dUr6 v22.2s
+#define dUi6 v23.2s
+#define dUr1 v24.2s
+#define dUi1 v25.2s
+#define dUr3 v26.2s
+#define dUi3 v27.2s
+#define dUr5 v28.2s
+#define dUi5 v29.2s
+// reuse dXr7 and dXi7
+#define dUr7 v30.2s
+#define dUi7 v31.2s
+#define qU0 v8.4s
+#define qU1 v12.4s
+#define qU2 v9.4s
+#define qU3 v13.4s
+#define qU4 v10.4s
+#define qU5 v14.4s
+#define qU6 v11.4s
+#define qU7 v15.4s
+
+
+#define dVr0 v24.2s
+#define dVi0 v25.2s
+#define dVr2 v26.2s
+#define dVi2 v27.2s
+#define dVr4 v28.2s
+#define dVi4 v29.2s
+#define dVr6 v30.2s
+#define dVi6 v31.2s
+#define dVr1 v16.2s
+#define dVi1 v17.2s
+#define dVr3 v18.2s
+#define dVi3 v19.2s
+#define dVr5 v20.2s
+#define dVi5 v21.2s
+#define dVr7 v22.2s
+#define dVi7 v23.2s
+#define qV0 v12.4s
+#define qV1 v8.4s
+#define qV2 v13.4s
+#define qV3 v9.4s
+#define qV4 v14.4s
+#define qV5 v10.4s
+#define qV6 v15.4s
+#define qV7 v11.4s
+
+#define dYr0 v16.2s
+#define dYi0 v17.2s
+#define dYr2 v18.2s
+#define dYi2 v19.2s
+#define dYr4 v20.2s
+#define dYi4 v21.2s
+#define dYr6 v22.2s
+#define dYi6 v23.2s
+#define dYr1 v24.2s
+#define dYi1 v25.2s
+#define dYr3 v26.2s
+#define dYi3 v27.2s
+#define dYr5 v28.2s
+#define dYi5 v29.2s
+#define dYr7 v30.2s
+#define dYi7 v31.2s
+#define qY0 v8.4s
+#define qY1 v12.4s
+#define qY2 v9.4s
+#define qY3 v13.4s
+#define qY4 v10.4s
+#define qY5 v14.4s
+#define qY6 v11.4s
+#define qY7 v15.4s
+
+#define dT0 v14.2s
+#define dT0s v14.s
+#define dT1 v15.2s
+
+ .MACRO FFTSTAGE scaled, inverse, name
+
+ // Define stack arguments
+
+ // Move args values into our work registers
+ ldr subFFTNum, [pSubFFTNum]
+ ldr subFFTSize, [pSubFFTSize]
+
+ // Update pSubFFTSize and pSubFFTNum regs
+ // subFFTSize = 1 for the first stage
+
+ movz t0, 0x3f35, lsl #16 // High half word of sqrt(1/2).
+ movk t0, 0x04f3 // Low half word of sqrt(1/2).
+ MOV subFFTSize,#8
+
+ // Note: setCount = subFFTNum/8 (reuse the grpSize reg for setCount)
+ LSR grpSize,subFFTNum,#3
+ MOV subFFTNum,grpSize
+
+
+ // pT0+1 increments pT0 by 8 bytes
+ // pT0+pointStep = increment of 8*pointStep bytes = grpSize bytes
+ // Note: outPointStep = pointStep for firststage
+
+ lsl pointStep,grpSize, #3
+
+
+ // Calculate the step of input data for the next set
+ //MOV step1,pointStep,LSL #1 // step1 = 2*pointStep
+ ld2 {dXr0,dXi0},[pSrc],pointStep // data[0]
+ lsl step1,grpSize, #4
+ lsl step2,pointStep, #3
+
+ ld2 {dXr1,dXi1},[pSrc],pointStep // data[1]
+ SUB step2,step2,pointStep // step2 = 7*pointStep
+ // setStep = - 7*pointStep+16
+ rsb setStep,step2,#16
+
+ ld2 {dXr2,dXi2},[pSrc],pointStep // data[2]
+ ld2 {dXr3,dXi3},[pSrc],pointStep // data[3]
+ ld2 {dXr4,dXi4},[pSrc],pointStep // data[4]
+ ld2 {dXr5,dXi5},[pSrc],pointStep // data[5]
+ ld2 {dXr6,dXi6},[pSrc],pointStep // data[6]
+ // data[7] & update pSrc for the next set
+ // setStep = -7*pointStep + 16
+ ld2 {dXr7,dXi7},[pSrc],setStep
+ // grp = 0 a special case since all the twiddle factors are 1
+ // Loop on the sets
+
+radix8fsGrpZeroSetLoop\name :
+
+ // Decrement setcount
+ SUBS setCount,setCount,#2
+
+
+ // finish first stage of 8 point FFT
+
+ // fadd qU0,qX0,qX4
+ // fadd qU2,qX1,qX5
+ // fadd qU4,qX2,qX6
+ // fadd qU6,qX3,qX7
+ fadd dUr0,dXr0,dXr4
+ fadd dUr2,dXr1,dXr5
+ fadd dUr4,dXr2,dXr6
+ fadd dUr6,dXr3,dXr7
+ fadd dUi0,dXi0,dXi4
+ fadd dUi2,dXi1,dXi5
+ fadd dUi4,dXi2,dXi6
+ fadd dUi6,dXi3,dXi7
+
+ // finish second stage of 8 point FFT
+
+ // fadd qV0,qU0,qU4
+ // fsub qV2,qU0,qU4
+ // fadd qV4,qU2,qU6
+ // fsub qV6,qU2,qU6
+ fadd dVr0,dUr0,dUr4
+ fsub dVr2,dUr0,dUr4
+ fadd dVr4,dUr2,dUr6
+ fsub dVr6,dUr2,dUr6
+ fadd dVi0,dUi0,dUi4
+ fsub dVi2,dUi0,dUi4
+ fadd dVi4,dUi2,dUi6
+ fsub dVi6,dUi2,dUi6
+
+ // finish third stage of 8 point FFT
+
+ // fadd qY0,qV0,qV4
+ // fsub qY4,qV0,qV4
+ fadd dYr0,dVr0,dVr4
+ fsub dYr4,dVr0,dVr4
+ fadd dYi0,dVi0,dVi4
+ fsub dYi4,dVi0,dVi4
+
+ st2 {dYr0,dYi0},[pDst],step1 // store y0
+
+ .ifeqs "\inverse", "TRUE"
+
+ fsub dYr2,dVr2,dVi6
+ fadd dYi2,dVi2,dVr6
+
+ fadd dYr6,dVr2,dVi6
+ st2 {dYr2,dYi2},[pDst],step1 // store y2
+ fsub dYi6,dVi2,dVr6
+
+ // fsub qU1,qX0,qX4
+ fsub dUr1,dXr0,dXr4
+ fsub dUi1,dXi0,dXi4
+
+ st2 {dYr4,dYi4},[pDst],step1 // store y4
+
+ // fsub qU3,qX1,qX5
+ // fsub qU5,qX2,qX6
+ fsub dUr3,dXr1,dXr5
+ fsub dUr5,dXr2,dXr6
+ fsub dUi3,dXi1,dXi5
+ fsub dUi5,dXi2,dXi6
+
+ st2 {dYr6,dYi6},[pDst],step1 // store y6
+
+ .ELSE
+
+ fadd dYr6,dVr2,dVi6
+ fsub dYi6,dVi2,dVr6
+
+ fsub dYr2,dVr2,dVi6
+ st2 {dYr6,dYi6},[pDst],step1 // store y2
+ fadd dYi2,dVi2,dVr6
+
+
+ // fsub qU1,qX0,qX4
+ fsub dUr1,dXr0,dXr4
+ fsub dUi1,dXi0,dXi4
+
+ st2 {dYr4,dYi4},[pDst],step1 // store y4
+
+ // fsub qU3,qX1,qX5
+ // fsub qU5,qX2,qX6
+ fsub dUr3,dXr1,dXr5
+ fsub dUr5,dXr2,dXr6
+ fsub dUi3,dXi1,dXi5
+ fsub dUi5,dXi2,dXi6
+
+ st2 {dYr2,dYi2},[pDst],step1 // store y6
+
+
+ .ENDIF
+
+ // finish first stage of 8 point FFT
+
+ // fsub qU7,qX3,qX7
+ fsub dUr7,dXr3,dXr7
+ fsub dUi7,dXi3,dXi7
+
+ mov dT0s[0], t0
+
+ // finish second stage of 8 point FFT
+
+ fsub dVr1,dUr1,dUi5
+ // data[0] for next iteration
+ ld2 {dXr0,dXi0},[pSrc],pointStep
+ fadd dVi1,dUi1,dUr5
+ fadd dVr3,dUr1,dUi5
+ ld2 {dXr1,dXi1},[pSrc],pointStep // data[1]
+ fsub dVi3,dUi1,dUr5
+
+ fsub dVr5,dUr3,dUi7
+ ld2 {dXr2,dXi2},[pSrc],pointStep // data[2]
+ fadd dVi5,dUi3,dUr7
+ fadd dVr7,dUr3,dUi7
+ ld2 {dXr3,dXi3},[pSrc],pointStep // data[3]
+ fsub dVi7,dUi3,dUr7
+
+ // finish third stage of 8 point FFT
+
+ .ifeqs "\inverse", "TRUE"
+
+ // calculate a*v5
+ fmul dT1,dVr5,dT0[0] // use dVi0 for dT1
+
+ ld2 {dXr4,dXi4},[pSrc],pointStep // data[4]
+ fmul dVi5,dVi5,dT0[0]
+
+ ld2 {dXr5,dXi5},[pSrc],pointStep // data[5]
+ fsub dVr5,dT1,dVi5 // a * V5
+ fadd dVi5,dT1,dVi5
+
+ ld2 {dXr6,dXi6},[pSrc],pointStep // data[6]
+
+ // calculate b*v7
+ fmul dT1,dVr7,dT0[0]
+ fmul dVi7,dVi7,dT0[0]
+
+ // fadd qY1,qV1,qV5
+ // fsub qY5,qV1,qV5
+ fadd dYr1,dVr1,dVr5
+ fsub dYr5,dVr1,dVr5
+ fadd dYi1,dVi1,dVi5
+ fsub dYi5,dVi1,dVi5
+
+ fadd dVr7,dT1,dVi7 // b * V7
+ fsub dVi7,dVi7,dT1
+ SUB pDst, pDst, step2 // set pDst to y1
+
+ // On the last iteration, this will read past the end of pSrc,
+ // so skip this read.
+ BEQ radix8SkipLastUpdateInv\name
+ ld2 {dXr7,dXi7},[pSrc],setStep // data[7]
+radix8SkipLastUpdateInv\name:
+
+ fsub dYr3,dVr3,dVr7
+ fsub dYi3,dVi3,dVi7
+ st2 {dYr1,dYi1},[pDst],step1 // store y1
+ fadd dYr7,dVr3,dVr7
+ fadd dYi7,dVi3,dVi7
+
+
+ st2 {dYr3,dYi3},[pDst],step1 // store y3
+ st2 {dYr5,dYi5},[pDst],step1 // store y5
+ st2 {dYr7,dYi7},[pDst] // store y7
+ ADD pDst, pDst, #16
+
+ .ELSE
+
+ // calculate b*v7
+ fmul dT1,dVr7,dT0[0]
+ ld2 {dXr4,dXi4},[pSrc],pointStep // data[4]
+ fmul dVi7,dVi7,dT0[0]
+
+ ld2 {dXr5,dXi5},[pSrc],pointStep // data[5]
+ fadd dVr7,dT1,dVi7 // b * V7
+ fsub dVi7,dVi7,dT1
+
+ ld2 {dXr6,dXi6},[pSrc],pointStep // data[6]
+
+ // calculate a*v5
+ fmul dT1,dVr5,dT0[0] // use dVi0 for dT1
+ fmul dVi5,dVi5,dT0[0]
+
+ fadd dYr7,dVr3,dVr7
+ fadd dYi7,dVi3,dVi7
+ SUB pDst, pDst, step2 // set pDst to y1
+
+ fsub dVr5,dT1,dVi5 // a * V5
+ fadd dVi5,dT1,dVi5
+
+ // On the last iteration, this will read past the end of pSrc,
+ // so skip this read.
+ BEQ radix8SkipLastUpdateFwd\name
+ ld2 {dXr7,dXi7},[pSrc],setStep // data[7]
+radix8SkipLastUpdateFwd\name:
+
+ // fsub qY5,qV1,qV5
+ fsub dYr5,dVr1,dVr5
+ fsub dYi5,dVi1,dVi5
+
+ fsub dYr3,dVr3,dVr7
+ st2 {dYr7,dYi7},[pDst],step1 // store y1
+ fsub dYi3,dVi3,dVi7
+
+ // fadd qY1,qV1,qV5
+ fadd dYr1,dVr1,dVr5
+ fadd dYi1,dVi1,dVi5
+
+ st2 {dYr5,dYi5},[pDst],step1 // store y3
+ st2 {dYr3,dYi3},[pDst],step1 // store y5
+ st2 {dYr1,dYi1},[pDst],#16 // store y7
+
+ .ENDIF
+
+
+ // update pDst for the next set
+ SUB pDst, pDst, step2
+ BGT radix8fsGrpZeroSetLoop\name
+
+ // Save subFFTNum and subFFTSize for next stage
+ str subFFTNum, [pSubFFTNum]
+ str subFFTSize, [pSubFFTSize]
+
+ .endm
+
+
+ // Allocate stack memory required by the function
+
+
+ M_START armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace,,d15
+ FFTSTAGE "FALSE","FALSE",FWD
+ M_END
+
+
+ M_START armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace,,d15
+ FFTSTAGE "FALSE","TRUE",INV
+ M_END
+
+
+
+ .end
diff --git a/dl/sp/src/arm/arm64/omxSP_FFTFwd_CToC_FC32.c b/dl/sp/src/arm/arm64/omxSP_FFTFwd_CToC_FC32.c
new file mode 100644
index 0000000..f29796b
--- /dev/null
+++ b/dl/sp/src/arm/arm64/omxSP_FFTFwd_CToC_FC32.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+
+extern void armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+/**
+ * Function: omxSP_FFTFwd_CToC_FC32_Sfs (2.2.4.2.2)
+ *
+ * Description:
+ * Compute an FFT for a complex signal of length of 2^order,
+ * where 0 <= order <= 15.
+ * Transform length is determined by the specification structure, which
+ * must be initialized prior to calling the FFT function using the appropriate
+ * helper, i.e., <FFTInit_C_sc32> or <FFTInit_C_SC16>. The relationship
+ * between the input and output sequences can be expressed in terms of the
+ * DFT, i.e.,
+ *
+ * X[k] = SUM[n=0...N-1]x[n].e^(-jnk.2.pi/N)
+ * k = 0,1,2,..., N-1
+ * N = 2^order
+ *
+ * Input Arguments:
+ * pSrc - pointer to the input signal, a complex-valued vector of length
+ * 2^order; must be aligned on a 32 byte boundary.
+ * pFFTSpec - pointer to the preallocated and initialized specification
+ * structure
+ *
+ * Output Arguments:
+ * pDst - pointer to the complex-valued output vector, of length 2^order;
+ * must be aligned on an 32-byte boundary.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - returned if one or more of the following conditions
+ * is true:
+ * - one or more of the following pointers is NULL: pSrc, pDst, or
+ * pFFTSpec.
+ * - pSrc or pDst is not 32-byte aligned
+ *
+ */
+
+OMXResult omxSP_FFTFwd_CToC_FC32_Sfs(const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ const OMXFFTSpec_C_FC32* pFFTSpec) {
+ ARMsFFTSpec_FC32* spec = (ARMsFFTSpec_FC32*)pFFTSpec;
+ int order;
+ long subFFTSize;
+ long subFFTNum;
+ OMX_FC32* pTwiddle;
+ OMX_FC32* pOut;
+
+ /*
+ * Check args are not NULL and the source and destination pointers
+ * are properly aligned.
+ */
+ if (!validateParametersFC32(pSrc, pDst, spec))
+ return OMX_Sts_BadArgErr;
+
+ order = fastlog2(spec->N);
+
+ subFFTSize = 1;
+ subFFTNum = spec->N;
+ pTwiddle = spec->pTwiddle;
+ pOut = spec->pBuf;
+
+ if (order > 3) {
+ OMX_FC32* argDst;
+
+ /*
+ * Set up argDst and pOut appropriately so that pOut = pDst for
+ * the very last FFT stage.
+ */
+ if ((order & 2) == 0) {
+ argDst = pOut;
+ pOut = pDst;
+ } else {
+ argDst = pDst;
+ }
+
+ /*
+ * Odd order uses a radix 8 first stage; even order, a radix 4
+ * first stage.
+ */
+ if (order & 1) {
+ armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace(
+ pSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+ } else {
+ armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace(
+ pSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+ }
+
+ /*
+ * Now use radix 4 stages to finish rest of the FFT
+ */
+ if (subFFTNum >= 4) {
+ while (subFFTNum > 4) {
+ OMX_FC32* tmp;
+
+ armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace(
+ argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+ /*
+ * Swap argDst and pOut
+ */
+ tmp = pOut;
+ pOut = argDst;
+ argDst = tmp;
+ }
+
+ armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace(
+ argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+ }
+ } else if (order == 3) {
+ armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+ pSrc, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+ armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace(
+ pDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+ armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace(
+ pOut, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+ } else if (order == 2) {
+ armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+ pSrc, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+ armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace(
+ pOut, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+ } else {
+ /* Order = 1 */
+ armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+ pSrc, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+ }
+
+ return OMX_Sts_NoErr;
+}
diff --git a/dl/sp/src/arm/arm64/omxSP_FFTFwd_RToCCS_F32.c b/dl/sp/src/arm/arm64/omxSP_FFTFwd_RToCCS_F32.c
new file mode 100644
index 0000000..f1e503e
--- /dev/null
+++ b/dl/sp/src/arm/arm64/omxSP_FFTFwd_RToCCS_F32.c
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+
+extern void armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void ComplexToRealFixup(OMX_FC32* pSrc,
+ OMX_F32* pDst,
+ const OMX_FC32* pTwiddle,
+ OMX_F32* pBuf,
+ long N);
+
+/**
+ * Function: omxSP_FFTFwd_CToC_FC32_Sfs (2.2.4.2.2)
+ *
+ * Description:
+ * Compute an FFT for a complex signal of length of 2^order,
+ * where 0 <= order <= 15.
+ * Transform length is determined by the specification structure, which
+ * must be initialized prior to calling the FFT function using the appropriate
+ * helper, i.e., <FFTInit_C_sc32> or <FFTInit_C_SC16>. The relationship
+ * between the input and output sequences can be expressed in terms of the
+ * DFT, i.e.,
+ *
+ * X[k] = SUM[n=0...N-1]x[n].e^(-jnk.2.pi/N)
+ * k = 0,1,2,..., N-1
+ * N = 2^order
+ *
+ * Input Arguments:
+ * pSrc - pointer to the input signal, a complex-valued vector of length
+ * 2^order; must be aligned on a 32 byte boundary.
+ * pFFTSpec - pointer to the preallocated and initialized specification
+ * structure
+ *
+ * Output Arguments:
+ * pDst - pointer to the complex-valued output vector, of length 2^order;
+ * must be aligned on an 32-byte boundary.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - returned if one or more of the following conditions
+ * is true:
+ * - one or more of the following pointers is NULL: pSrc, pDst, or
+ * pFFTSpec.
+ * - pSrc or pDst is not 32-byte aligned
+ *
+ */
+
+OMXResult omxSP_FFTFwd_RToCCS_F32_Sfs(const OMX_F32* pSrc,
+ OMX_F32* pDst,
+ const OMXFFTSpec_R_F32* pFFTSpec) {
+ ARMsFFTSpec_R_FC32* spec = (ARMsFFTSpec_R_FC32*)pFFTSpec;
+ int order;
+ long subFFTSize;
+ long subFFTNum;
+ OMX_FC32* pTwiddle;
+ OMX_FC32* pOut;
+ OMX_FC32* pComplexSrc = (OMX_FC32*) pSrc;
+ OMX_FC32* pComplexDst = (OMX_FC32*) pDst;
+
+ /*
+ * Check args are not NULL and the source and destination pointers
+ * are properly aligned.
+ */
+ if (!validateParametersF32(pSrc, pDst, spec))
+ return OMX_Sts_BadArgErr;
+
+ /*
+ * Compute the RFFT using a complex FFT of one less order, so set
+ * order to be the order of the complex FFT.
+ */
+ order = fastlog2(spec->N) - 1;
+
+ subFFTSize = 1;
+ subFFTNum = spec->N >> 1;
+ pTwiddle = spec->pTwiddle;
+ pOut = (OMX_FC32*) spec->pBuf;
+
+ if (order > 3) {
+ OMX_FC32* argDst;
+ OMX_FC32* pComplexDst = (OMX_FC32*) pDst;
+
+ /*
+ * Set up argDst and pOut appropriately so that pOut = pDst for
+ * ComplexToRealFixup.
+ */
+ if ((order & 2) != 0) {
+ argDst = pOut;
+ pOut = pComplexDst;
+ } else {
+ argDst = pComplexDst;
+ }
+
+ /*
+ * Odd order uses a radix 8 first stage; even order, a radix 4
+ * first stage.
+ */
+ if (order & 1) {
+ armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace(
+ pComplexSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+ } else {
+ armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace(
+ pComplexSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+ }
+
+ /*
+ * Now use radix 4 stages to finish rest of the FFT
+ */
+ if (subFFTNum >= 4) {
+ while (subFFTNum > 4) {
+ OMX_FC32* tmp;
+
+ armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace(
+ argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+ /*
+ * Swap argDst and pOut
+ */
+ tmp = pOut;
+ pOut = argDst;
+ argDst = tmp;
+ }
+
+ armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace(
+ argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+ }
+ } else if (order == 3) {
+ armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+ pComplexSrc, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+ armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace(
+ pOut, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize);
+ armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace(
+ pComplexDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+ } else if (order == 2) {
+ armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+ pComplexSrc, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize);
+ armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace(
+ pComplexDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+ } else if (order == 1) {
+ armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+ pComplexSrc, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+ } else {
+ /* Handle complex order 0 specially */
+ pOut->Re = pSrc[0];
+ pOut->Im = pSrc[1];
+ }
+
+ /*
+ * Complex FFT done. Fix up the complex result to give the correct
+ * RFFT.
+ */
+
+ ComplexToRealFixup(pOut, pDst, pTwiddle, spec->pBuf, spec->N);
+
+ return OMX_Sts_NoErr;
+}
diff --git a/dl/sp/src/arm/arm64/omxSP_FFTInv_CCSToR_F32.c b/dl/sp/src/arm/arm64/omxSP_FFTInv_CCSToR_F32.c
new file mode 100644
index 0000000..84de9cf
--- /dev/null
+++ b/dl/sp/src/arm/arm64/omxSP_FFTInv_CCSToR_F32.c
@@ -0,0 +1,259 @@
+/*
+ * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+
+extern void armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTInv_CCSToR_F32_preTwiddleRadix2(
+ const OMX_F32* pSrc,
+ const OMX_FC32* pTwiddle,
+ OMX_F32* pBuf,
+ long N);
+
+/*
+ * Scale FFT data by 1/|length|. |length| must be a power of two
+ */
+static inline ScaleRFFTData(OMX_F32* fftData, unsigned length) {
+ float32_t* data = (float32_t*)fftData;
+ float32_t scale = 2.0f / length;
+
+ if (length >= 4) {
+ /*
+ * Do 4 float elements at a time because |length| is always a
+ * multiple of 4 when |length| >= 4.
+ *
+ * TODO(rtoy): Figure out how to process 8 elements at a time
+ * using intrinsics or replace this with inline assembly.
+ */
+ do {
+ float32x4_t x = vld1q_f32(data);
+
+ length -= 4;
+ x = vmulq_n_f32(x, scale);
+ vst1q_f32(data, x);
+ data += 4;
+ } while (length > 0);
+ } else if (length == 2) {
+ float32x2_t x = vld1_f32(data);
+ x = vmul_n_f32(x, scale);
+ vst1_f32(data, x);
+ } else {
+ fftData[0] *= scale;
+ }
+}
+
+/**
+ * Function: omxSP_FFTInv_CCSToR_F32_Sfs
+ *
+ * Description:
+ * These functions compute the inverse FFT for a conjugate-symmetric input
+ * sequence. Transform length is determined by the specification structure,
+ * which must be initialized prior to calling the FFT function using
+ * <FFTInit_R_F32>. For a transform of length M, the input sequence is
+ * represented using a packed CCS vector of length M+2, and is organized
+ * as follows:
+ *
+ * Index: 0 1 2 3 4 5 . . . M-2 M-1 M M+1
+ * Comp: R[0] 0 R[1] I[1] R[2] I[2] . . . R[M/2-1] I[M/2-1] R[M/2] 0
+ *
+ * where R[n] and I[n], respectively, denote the real and imaginary
+ * components for FFT bin n. Bins are numbered from 0 to M/2, where M
+ * is the FFT length. Bin index 0 corresponds to the DC component,
+ * and bin index M/2 corresponds to the foldover frequency.
+ *
+ * Input Arguments:
+ * pSrc - pointer to the complex-valued input sequence represented
+ * using CCS format, of length (2^order) + 2; must be aligned on a
+ * 32-byte boundary.
+ * pFFTSpec - pointer to the preallocated and initialized
+ * specification structure
+ *
+ * Output Arguments:
+ * pDst - pointer to the real-valued output sequence, of length
+ * 2^order ; must be aligned on a 32-byte boundary.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+
+ * OMX_Sts_BadArgErr - bad arguments if one or more of the
+ * following is true:
+ * - pSrc, pDst, or pFFTSpec is NULL
+ * - pSrc or pDst is not aligned on a 32-byte boundary
+ *
+ */
+OMXResult omxSP_FFTInv_CCSToR_F32_Sfs(
+ const OMX_F32* pSrc,
+ OMX_F32* pDst,
+ const OMXFFTSpec_R_F32* pFFTSpec) {
+ ARMsFFTSpec_R_FC32* spec = (ARMsFFTSpec_R_FC32*)pFFTSpec;
+ int order;
+ long subFFTSize;
+ long subFFTNum;
+ OMX_FC32* pTwiddle;
+ OMX_FC32* pOut;
+ OMX_FC32* pComplexSrc;
+ OMX_FC32* pComplexDst = (OMX_FC32*) pDst;
+
+ /*
+ * Check args are not NULL and the source and destination pointers
+ * are properly aligned.
+ */
+ if (!validateParametersF32(pSrc, pDst, spec))
+ return OMX_Sts_BadArgErr;
+
+ /*
+ * Preprocess the input before calling the complex inverse FFT. The
+ * result is actually stored in the second half of the temp buffer
+ * in pFFTSpec.
+ */
+ if (spec->N > 1)
+ armSP_FFTInv_CCSToR_F32_preTwiddleRadix2(
+ pSrc, spec->pTwiddle, spec->pBuf, spec->N);
+
+ /*
+ * Do a complex inverse FFT of half size.
+ */
+ order = fastlog2(spec->N) - 1;
+
+ subFFTSize = 1;
+ subFFTNum = spec->N >> 1;
+ pTwiddle = spec->pTwiddle;
+ /*
+ * The pBuf is split in half. The first half is the temp buffer. The
+ * second half holds the source data that was placed there by
+ * armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe.
+ */
+ pOut = (OMX_FC32*) spec->pBuf;
+ pComplexSrc = pOut + (1 << order);
+
+
+ if (order > 3) {
+ OMX_FC32* argDst;
+
+ /*
+ * Set up argDst and pOut appropriately so that pOut = pDst for
+ * the very last FFT stage.
+ */
+ if ((order & 2) == 0) {
+ argDst = pOut;
+ pOut = pComplexDst;
+ } else {
+ argDst = pComplexDst;
+ }
+
+ /*
+ * Odd order uses a radix 8 first stage; even order, a radix 4
+ * first stage.
+ */
+ if (order & 1) {
+ armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace(
+ pComplexSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+ } else {
+ armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace(
+ pComplexSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+ }
+
+ /*
+ * Now use radix 4 stages to finish rest of the FFT
+ */
+ if (subFFTNum >= 4) {
+ while (subFFTNum > 4) {
+ OMX_FC32* tmp;
+
+ armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace(
+ argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+ /*
+ * Swap argDst and pOut
+ */
+ tmp = pOut;
+ pOut = argDst;
+ argDst = tmp;
+ }
+
+ armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace(
+ argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+ }
+ } else if (order == 3) {
+ armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+ pComplexSrc, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize);
+ armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace(
+ pComplexDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+ armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace(
+ pOut, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize);
+ } else if (order == 2) {
+ armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+ pComplexSrc, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+ armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace(
+ pOut, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize);
+ } else if (order == 1) {
+ armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+ pComplexSrc, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize);
+ } else {
+ /* Order = 0 */
+ *pComplexDst = *pComplexSrc;
+ }
+
+ ScaleRFFTData(pDst, spec->N);
+ return OMX_Sts_NoErr;
+}
+
diff --git a/dl/sp/src/arm/arm64/omxSP_FFTInv_CToC_FC32.c b/dl/sp/src/arm/arm64/omxSP_FFTInv_CToC_FC32.c
new file mode 100644
index 0000000..eec05e9
--- /dev/null
+++ b/dl/sp/src/arm/arm64/omxSP_FFTInv_CToC_FC32.c
@@ -0,0 +1,214 @@
+/*
+ * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+
+extern void armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+/*
+ * Scale FFT data by 1/|length|. |length| must be a power of two
+ */
+static inline ScaleFFTData(OMX_FC32* fftData, unsigned length) {
+ float32_t* data = (float32_t*)fftData;
+ float32_t scale = 1.0f / length;
+
+ /*
+ * Do two complex elements at a time because |length| is always
+ * greater than or equal to 2 (order >= 1)
+ */
+ do {
+ float32x4_t x = vld1q_f32(data);
+
+ length -= 2;
+ x = vmulq_n_f32(x, scale);
+ vst1q_f32(data, x);
+ data += 4;
+ } while (length > 0);
+}
+
+/**
+ * Function: omxSP_FFTInv_CToC_FC32
+ *
+ * Description:
+ * These functions compute an inverse FFT for a complex signal of
+ * length of 2^order, where 0 <= order <= 15. Transform length is
+ * determined by the specification structure, which must be
+ * initialized prior to calling the FFT function using the appropriate
+ * helper, i.e., <FFTInit_C_FC32>. The relationship between the input
+ * and output sequences can be expressed in terms of the IDFT, i.e.:
+ *
+ * x[n] = SUM[k=0,...,N-1] X[k].e^(jnk.2.pi/N)
+ * n=0,1,2,...N-1
+ * N=2^order.
+ *
+ * Input Arguments:
+ * pSrc - pointer to the complex-valued input signal, of length 2^order ;
+ * must be aligned on a 32-byte boundary.
+ * pFFTSpec - pointer to the preallocated and initialized specification
+ * structure
+ *
+ * Output Arguments:
+ * order
+ * pDst - pointer to the complex-valued output signal, of length 2^order;
+ * must be aligned on a 32-byte boundary.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - returned if one or more of the following conditions
+ * is true:
+ * - one or more of the following pointers is NULL: pSrc, pDst, or
+ * pFFTSpec.
+ * - pSrc or pDst is not 32-byte aligned
+ *
+ */
+
+OMXResult omxSP_FFTInv_CToC_FC32_Sfs(const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ const OMXFFTSpec_C_FC32* pFFTSpec) {
+ ARMsFFTSpec_FC32* spec = (ARMsFFTSpec_FC32*)pFFTSpec;
+ int order;
+ long subFFTSize;
+ long subFFTNum;
+ OMX_FC32* pTwiddle;
+ OMX_FC32* pOut;
+
+ /*
+ * Check args are not NULL and the source and destination pointers
+ * are properly aligned.
+ */
+ if (!validateParametersFC32(pSrc, pDst, spec))
+ return OMX_Sts_BadArgErr;
+
+ order = fastlog2(spec->N);
+
+ subFFTSize = 1;
+ subFFTNum = spec->N;
+ pTwiddle = spec->pTwiddle;
+ pOut = spec->pBuf;
+
+ if (order > 3) {
+ OMX_FC32* argDst;
+
+ /*
+ * Set up argDst and pOut appropriately so that pOut = pDst for
+ * the very last FFT stage.
+ */
+ if ((order & 2) == 0) {
+ argDst = pOut;
+ pOut = pDst;
+ } else {
+ argDst = pDst;
+ }
+
+ /*
+ * Odd order uses a radix 8 first stage; even order, a radix 4
+ * first stage.
+ */
+ if (order & 1) {
+ armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace(
+ pSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+ } else {
+ armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace(
+ pSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+ }
+
+ /*
+ * Now use radix 4 stages to finish rest of the FFT
+ */
+ if (subFFTNum >= 4) {
+ while (subFFTNum > 4) {
+ OMX_FC32* tmp;
+
+ armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace(
+ argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+ /*
+ * Swap argDst and pOut
+ */
+ tmp = pOut;
+ pOut = argDst;
+ argDst = tmp;
+ }
+
+ armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace(
+ argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+ }
+ } else if (order == 3) {
+ armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+ pSrc, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+ armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace(
+ pDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+ armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace(
+ pOut, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+ } else if (order == 2) {
+ armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+ pSrc, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+ armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace(
+ pOut, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+ } else {
+ /* Order = 1 */
+ armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+ pSrc, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+ }
+
+ ScaleFFTData(pDst, spec->N);
+ return OMX_Sts_NoErr;
+}