13 files changed, 3336 insertions, 0 deletions
diff --git a/dl/sp/src/arm/arm64/ComplexToRealFixup.S b/dl/sp/src/arm/arm64/ComplexToRealFixup.S
new file mode 100644
index 0000000..9b30093
--- /dev/null
+++ b/dl/sp/src/arm/arm64/ComplexToRealFixup.S
@@ -0,0 +1,261 @@
+//
+//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+//  Use of this source code is governed by a BSD-style license
+//  that can be found in the LICENSE file in the root of the source
+//  tree. An additional intellectual property rights grant can be found
+//  in the file PATENTS.  All contributing project authors may
+//  be found in the AUTHORS file in the root of the source tree.
+//
+//  This is a modification of omxSP_FFTFwd_RToCCS_S32_Sfs_s.s
+//  to support float instead of SC32.
+//
+
+//
+// Description:
+// Compute FFT for a real signal
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+
+// Import symbols required from other files
+// (For example tables)
+
+// Set debugging level
+//DEBUG_ON    SETL {TRUE}
+
+
+
+// Guarding implementation by the processor name
+
+
+
+    // Guarding implementation by the processor name
+
+// Import symbols required from other files
+
+
+//Input Registers
+
+#define pSrc            x0
+#define pDst            x1
+#define pTwiddle        x2
+#define	pOut		x3	
+#define	subFFTNum	x4
+
+// Output registers
+
+//Local Scratch Registers
+
+#define argTwiddle      x5
+#define argDst          x6
+#define subFFTSize      x7
+#define N               subFFTNum
+#define order           x14
+#define step            x8
+#define step1           pTwiddle
+#define twStep          x9
+#define zero            w10
+#define pTwiddleTmp     pOut
+
+// Neon registers
+
+#define dX0       v0.2s
+#define dX0s      v0.s
+#define dX0r      v2.2s
+#define dX0rs     v2.s
+#define dX0i      v3.2s
+#define dX0is     v3.s
+#define dX1r      v4.2s
+#define dX1i      v5.2s
+#define dT0       v6.2s
+#define dT1       v7.2s
+#define dT2       v8.2s
+#define dT3       v9.2s
+#define qT0       v10.2s
+#define qT1       v12.2s
+#define dW0r      v14.2s
+#define dW0r8b    v14.8b
+#define dW0i      v15.2s
+#define dW1r      v16.2s
+#define dW1r8b    v16.8b
+#define dW1i      v17.2s
+#define dY0r      v14.2s
+#define dY0i      v15.2s
+#define dY1r      v16.2s
+#define dY1i      v17.2s
+#define qT2       v18.2s
+#define qT3       v20.2s
+
+#define half      v0.2s
+#define dZip      v21.2s
+#define dZip8b    v21.8b
+        
+    // Allocate stack memory required by the function
+
+    // Write function header
+        M_START     ComplexToRealFixup,,d15
+
+        asr     N, N, #1
+
+        clz     order, subFFTNum                    // N = 2^order
+
+        RSB     order,order,#63
+        MOV     subFFTSize,subFFTNum            // subFFTSize = N/2
+        //MOV     subFFTNum,N
+        mov     argDst, pDst
+        mov     argTwiddle, pTwiddle
+
+        // F(0) = 1/2[Z(0) + Z'(0)] - j [Z(0) - Z'(0)]
+        // 1/2[(a+jb) + (a-jb)] - j  [(a+jb) - (a-jb)]
+        // 1/2[2a+j0] - j [0+j2b]
+        // (a+b, 0)
+
+        // F(N/2) = 1/2[Z(0) + Z'(0)] + j [Z(0) - Z'(0)]
+        // 1/2[(a+jb) + (a-jb)] + j  [(a+jb) - (a-jb)]
+        // 1/2[2a+j0] + j [0+j2b]
+        // (a-b, 0)
+
+        // F(0) and F(N/2)
+        ld2     {dX0rs,dX0is}[0],[pSrc], #8
+        MOV     zero,#0
+        mov    dX0rs[1],zero
+        lsl     step,subFFTSize, #3               // step = N/2 * 8 bytes
+        mov    dX0i[1],zero
+        // twStep = 3N/8 * 8 bytes pointing to W^1
+        SUB     twStep,step,subFFTSize,LSL #1
+
+        fadd    dY0r,dX0r,dX0i                    // F(0) = ((Z0.r+Z0.i) , 0)
+        lsl     step1,subFFTSize, #2              // step1 = N/2 * 4 bytes
+        fsub    dY0i,dX0r,dX0i                    // F(N/2) = ((Z0.r-Z0.i) , 0)
+        SUBS    subFFTSize,subFFTSize,#2
+
+        st1     {dY0r},[argDst],step
+        ADD     pTwiddleTmp,argTwiddle,#8         // W^2
+        st1     {dY0i},[argDst], #8
+        ADD     argTwiddle,argTwiddle,twStep      // W^1
+
+//        dup     dzero,zero
+        SUB     argDst,argDst,step
+
+        BLT     End
+        BEQ     lastElement
+        SUB     step,step,#24
+        SUB     step1,step1,#8                    // (N/4-1)*8 bytes
+
+        // F(k) = 1/2[Z(k) +  Z'(N/2-k)] -j*W^(k) [Z(k) -  Z'(N/2-k)]
+        // Note: W^k is stored as negative values in the table
+        // Process 4 elements at a time. E.g: F(1),F(2) and F(N/2-2),F(N/2-1)
+        // since both of them require Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
+
+        fmov     half, #0.5
+
+evenOddButterflyLoop:
+
+
+        ld1     {dW0r},[argTwiddle],step1
+        ld1     {dW1r},[argTwiddle], #8
+
+        ld2     {dX0r,dX0i},[pSrc],step
+        SUB     argTwiddle,argTwiddle,step1
+        ld2     {dX1r,dX1i},[pSrc], #16
+
+
+
+        SUB     step1,step1,#8                    // (N/4-2)*8 bytes
+        ld1     {dW0i},[pTwiddleTmp],step1
+        ld1     {dW1i},[pTwiddleTmp], #8
+        SUB     pSrc,pSrc,step
+
+        SUB     pTwiddleTmp,pTwiddleTmp,step1
+        rev64   dX1r,dX1r
+        rev64   dX1i,dX1i
+        SUBS    subFFTSize,subFFTSize,#4
+
+
+
+        fsub    dT2,dX0r,dX1r                     // a-c
+        SUB     step1,step1,#8
+        fadd    dT0,dX0r,dX1r                     // a+c
+        fsub    dT1,dX0i,dX1i                     // b-d
+        fadd    dT3,dX0i,dX1i                     // b+d
+        fmul   dT0,dT0,half[0]
+        fmul   dT1,dT1,half[0]
+        // VZIP    dW1r,dW1i
+        // VZIP    dW0r,dW0i
+        zip1    dZip, dW1r, dW1i
+        zip2    dW1i, dW1r, dW1i
+        mov     dW1r8b, dZip8b
+        zip1    dZip, dW0r, dW0i
+        zip2    dW0i, dW0r, dW0i
+        mov     dW0r8b, dZip8b
+
+        fmul   qT0,dW1r,dT2
+        fmul   qT1,dW1r,dT3
+        fmul   qT2,dW0r,dT2
+        fmul   qT3,dW0r,dT3
+
+        fmla   qT0,dW1i,dT3
+        fmls   qT1,dW1i,dT2
+
+        fmls   qT2,dW0i,dT3
+        fmla   qT3,dW0i,dT2
+
+
+        fmul  dX1r,qT0,half[0]
+        fmul  dX1i,qT1,half[0]
+
+        fsub    dY1r,dT0,dX1i                     // F(N/2 -1)
+        fadd    dY1i,dT1,dX1r
+        fneg    dY1i,dY1i
+
+        rev64   dY1r,dY1r
+        rev64   dY1i,dY1i
+
+
+        fmul  dX0r,qT2,half[0]
+        fmul  dX0i,qT3,half[0]
+
+        fsub    dY0r,dT0,dX0i                     // F(1)
+        fadd    dY0i,dT1,dX0r
+
+
+        st2     {dY0r,dY0i},[argDst],step
+        st2     {dY1r,dY1i},[argDst], #16
+        SUB     argDst,argDst,step
+        SUB     step,step,#32                     // (N/2-4)*8 bytes
+
+
+        BGT     evenOddButterflyLoop
+
+        // set both the ptrs to the last element
+        SUB     pSrc,pSrc,#8
+        SUB     argDst,argDst,#8
+
+
+
+        // Last element can be expanded as follows
+        // 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)]
+        // 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)]
+        // 1/2[2a+j0] + j (c+jd) [0+j2b]
+        // (a-bc, -bd)
+        // Since (c,d) = (0,1) for the last element, result is just (a,-b)
+
+lastElement:
+        ld1     {dX0r},[pSrc]
+
+        st1     {dX0rs}[0],[argDst], #4
+        fneg    dX0r,dX0r
+        st1     {dX0rs}[1],[argDst], #4
+End:
+
+        // Write function tail
+        M_END
+
+        .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S b/dl/sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S
new file mode 100644
index 0000000..da68314
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S
@@ -0,0 +1,280 @@
+//
+//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+//  Use of this source code is governed by a BSD-style license
+//  that can be found in the LICENSE file in the root of the source
+//  tree. An additional intellectual property rights grant can be found
+//  in the file PATENTS.  All contributing project authors may
+//  be found in the AUTHORS file in the root of the source tree.
+//
+//  This is a modification of
+//  armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.s to support float
+//  instead of SC32.
+//
+
+//
+// Description:
+// Compute the "preTwiddleRadix2" stage prior to the call to the complexFFT
+// It does a Z(k) = Feven(k) + jW^(-k) FOdd(k); k=0,1,2,...N/2-1 computation
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+
+// Import symbols required from other files
+// (For example tables)
+
+
+// Set debugging level
+//DEBUG_ON    SETL {TRUE}
+
+
+
+// Guarding implementation by the processor name
+
+
+
+      // Guarding implementation by the processor name
+
+
+
+//Input Registers
+
+#define pSrc            x0
+#define pTwiddle        x1
+#define	pOut		x2	
+#define	subFFTNum	x3
+
+// Output registers
+
+//Local Scratch Registers
+
+#define argTwiddle      x5
+#define argDst          x6
+#define subFFTSize      x7
+#define N               subFFTNum
+
+#define pOut1           x13
+	
+#define size            x7
+#define step            x8
+#define step1           x9
+#define twStep          x10
+#define pTwiddleTmp     x11
+#define argTwiddle1     x12
+
+// Neon registers
+
+#define dX0     v0.2s
+#define dX0s    v0.s
+#define dShift  v1.2s
+#define dX1     v1.2s
+#define dX1s    v1.s
+#define dY0     v2.2s
+#define dY08b   v2.8b
+#define dY1     v3.2s
+#define dX0r    v0.2s
+#define dX0rs   v0.s
+#define dX0i    v1.2s
+#define dX1r    v2.2s
+#define dX1i    v3.2s
+#define dW0r    v4.2s
+#define dW0r8b  v4.8b
+#define dW0i    v5.2s
+#define dW1r    v6.2s
+#define dW1r8b  v6.8b
+#define dW1i    v7.2s
+#define dT0     v8.2s
+#define dT1     v9.2s
+#define dT2     v10.2s
+#define dT3     v11.2s
+#define qT0     v12.2s
+#define qT1     v14.2s
+#define qT2     v16.2s
+#define qT3     v18.2s
+#define dY0r    v4.2s
+#define dY0i    v5.2s
+#define dY1r    v6.2s
+#define dY1i    v7.2s
+
+#define dY2     v4.2s
+#define dY3     v5.2s
+#define dW0     v6.2s
+#define dW1     v7.2s
+#define dW0Tmp  v10.2s
+#define dW1Neg  v11.2s
+
+#define dZip    v19.2s
+#define dZip8b  v19.8b
+#define half    v13.2s
+
+        .MACRO FFTSTAGE scaled, inverse, name
+
+        fmov    half, 0.5
+
+        asr     size, subFFTNum, #1           // preserve the contents of N = subFFTNum
+        lsl     step, subFFTNum, #2           // step = N/2 * 8 bytes
+
+
+        // Z(k) = 1/2 {[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]}
+        // Note: W^(k) is stored as negated value and also need to
+        // conjugate the values from the table
+
+        // Z(0) : no need of twiddle multiply
+        // Z(0) = 1/2 { [F(0) +  F'(N/2)] +j [F(0) -  F'(N/2)] }
+
+        ld1     {dX0},[pSrc],step
+        ADD     pOut1,pOut,step               // pOut1 = pOut+ N/2*8 bytes
+
+        ld1     {dX1},[pSrc], #8
+        // twStep = 3N/8 * 8 bytes pointing to W^1
+        SUB     twStep,step,size,LSL #1
+
+        lsl     step1,size, #2                // step1 = N/4 * 8 = N/2*4 bytes
+        SUB     step1,step1,#8                // (N/4-1)*8 bytes
+
+        fadd    dY0,dX0,dX1                   // [b+d | a+c]
+        fsub    dY1,dX0,dX1                   // [b-d | a-c]
+        fmul    dY0, dY0, half[0]
+        fmul    dY1, dY1, half[0]
+
+        // dY0= [a-c | a+c] ;dY1= [b-d | b+d]
+        // VZIP    dY0,dY1
+        zip1    dZip,dY0,dY1
+        zip2    dY1,dY0,dY1
+        mov     dY08b, dZip8b
+
+        fsub   dX0,dY0,dY1
+        SUBS   size,size,#2
+        fadd   dX1,dY0,dY1
+
+        SUB     pSrc,pSrc,step
+
+        st1     {dX0s}[0],[pOut1], #4
+        ADD     pTwiddleTmp,pTwiddle,#8       // W^2
+        st1     {dX1s}[1],[pOut1], #4
+        ADD     argTwiddle1,pTwiddle,twStep   // W^1
+
+
+        BLT     decrementScale\name
+        BEQ     lastElement\name
+
+
+        // Z(k) = 1/2[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]
+        // Note: W^k is stored as negative values in the table and also
+        // need to conjugate the values from the table.
+        //
+        // Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
+        // since both of them require F(1),F(2) and F(N/2-2),F(N/2-1)
+
+
+        SUB     step,step,#24
+evenOddButterflyLoop\name :
+
+
+        ld1     {dW0r},[argTwiddle1],step1
+        ld1     {dW1r},[argTwiddle1], #8
+
+        ld2     {dX0r,dX0i},[pSrc],step
+        SUB     argTwiddle1,argTwiddle1,step1
+        ld2     {dX1r,dX1i},[pSrc], #16
+
+        SUB     step1,step1,#8                // (N/4-2)*8 bytes
+        ld1     {dW0i},[pTwiddleTmp],step1
+        ld1     {dW1i},[pTwiddleTmp], #8
+        SUB     pSrc,pSrc,step
+
+        SUB     pTwiddleTmp,pTwiddleTmp,step1
+        rev64   dX1r,dX1r
+        rev64   dX1i,dX1i
+        SUBS    size,size,#4
+
+
+        fsub    dT2,dX0r,dX1r                 // a-c
+        fadd    dT3,dX0i,dX1i                 // b+d
+        fadd    dT0,dX0r,dX1r                 // a+c
+        fsub    dT1,dX0i,dX1i                 // b-d
+        SUB     step1,step1,#8
+
+        fmul    dT2, dT2, half[0]
+        fmul    dT3, dT3, half[0]
+
+        fmul    dT0, dT0, half[0]
+        fmul    dT1, dT1, half[0]
+
+        // VZIP    dW1r,dW1i
+        // VZIP    dW0r,dW0i
+        zip1    dZip, dW1r,dW1i
+        zip2    dW1i,dW1r,dW1i
+        mov     dW1r8b, dZip8b
+        zip1    dZip,dW0r,dW0i
+        zip2    dW0i,dW0r,dW0i
+        mov     dW0r8b, dZip8b
+
+        fmul   dX1r,dW1r,dT2
+        fmul   dX1i,dW1r,dT3
+        fmul   dX0r,dW0r,dT2
+        fmul   dX0i,dW0r,dT3
+
+        fmls   dX1r,dW1i,dT3
+        fmla   dX1i,dW1i,dT2
+
+        fmla   dX0r,dW0i,dT3
+        fmls   dX0i,dW0i,dT2
+
+
+        fadd    dY1r,dT0,dX1i                 // F(N/2 -1)
+        fsub    dY1i,dX1r,dT1
+
+        rev64   dY1r,dY1r
+        rev64   dY1i,dY1i
+
+
+        fadd    dY0r,dT0,dX0i                 // F(1)
+        fsub    dY0i,dT1,dX0r
+
+
+        st2     {dY0r,dY0i},[pOut1],step
+        st2     {dY1r,dY1i},[pOut1], #16
+        SUB     pOut1,pOut1,step
+        SUB     step,step,#32                 // (N/2-4)*8 bytes
+
+
+        BGT     evenOddButterflyLoop\name
+
+
+        // set both the ptrs to the last element
+        SUB     pSrc,pSrc,#8
+        SUB     pOut1,pOut1,#8
+
+        // Last element can be expanded as follows
+        // 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (since W^k is stored as
+        // -ve)
+        // 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
+        // 1/2[2a+j0] - j (c-jd) [0+j2b]
+        // (a+bc, -bd)
+        // Since (c,d) = (0,1) for the last element, result is just (a,-b)
+
+lastElement\name :
+        ld1     {dX0r},[pSrc]
+
+        st1     {dX0rs}[0],[pOut1], #4
+        fneg    dX0r,dX0r
+        st1     {dX0rs}[1],[pOut1]
+
+
+
+decrementScale\name :
+
+        .endm
+
+        M_START armSP_FFTInv_CCSToR_F32_preTwiddleRadix2,,d15
+            FFTSTAGE "FALSE","TRUE",Inv
+        M_END
+
+        .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_fs_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_fs_s.S
new file mode 100644
index 0000000..b22912d
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_fs_s.S
@@ -0,0 +1,136 @@
+//
+//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+//  Use of this source code is governed by a BSD-style license
+//  that can be found in the LICENSE file in the root of the source
+//  tree. An additional intellectual property rights grant can be found
+//  in the file PATENTS.  All contributing project authors may
+//  be found in the AUTHORS file in the root of the source tree.
+//
+//  This is a modification of armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S
+//  to support float instead of SC32.
+//
+
+//
+// Description:
+// Compute the first stage of a Radix 2 DIT in-order out-of-place FFT
+// stage for a N point complex signal.
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+
+// Import symbols required from other files
+// (For example tables)
+
+
+
+
+// Set debugging level
+//DEBUG_ON    SETL {TRUE}
+
+
+
+// Guarding implementation by the processor name
+
+
+
+// Guarding implementation by the processor name
+
+
+//Input Registers
+
+#define pSrc            x0
+#define pDst            x1
+#define pTwiddle        x2
+#define	pSubFFTNum	x3
+#define pSubFFTSize	x4	
+
+
+//Output Registers
+
+
+//Local Scratch Registers
+
+#define subFFTNum       x5
+#define subFFTSize      x6
+#define pointStep       x7
+#define outPointStep    x7
+#define grpSize         x8
+#define setCount        x8
+#define step            x9
+#define dstStep         x9
+
+// Neon Registers
+#define dX0     v0.2s
+#define dX1     v1.2s
+#define dY0     v2.2s
+#define dY1     v3.2s
+
+        .MACRO FFTSTAGE scaled, inverse, name
+
+        // Define stack arguments
+
+        // Move args values into our work registers
+        ldr     subFFTNum, [pSubFFTNum]
+        ldr     subFFTSize, [pSubFFTSize]
+
+        // update subFFTSize and subFFTNum into RN6 and RN7 for the next stage
+
+
+        MOV        subFFTSize,#2
+        LSR        grpSize,subFFTNum,#1
+        MOV        subFFTNum,grpSize
+
+
+        // pT0+1 increments pT0 by 8 bytes
+        // pT0+pointStep = increment of 8*pointStep bytes = 4*grpSize bytes
+        // Note: outPointStep = pointStep for firststage
+        // Note: setCount = grpSize/2 (reuse the updated grpSize for setCount)
+
+        lsl     pointStep, grpSize, #3
+        rsb     step, pointStep, #8
+
+        // Loop on the sets for grp zero
+
+grpZeroSetLoop\name :
+
+        LD1    {dX0},[pSrc],pointStep
+        LD1    {dX1},[pSrc],step                   // step = -pointStep + 8
+
+        SUBS    setCount,setCount,#1
+
+        fadd    dY0,dX0,dX1
+        fsub    dY1,dX0,dX1
+
+        ST1    {dY0},[pDst],outPointStep
+        // dstStep =  step = -pointStep + 8
+        ST1    {dY1},[pDst],dstStep
+
+        BGT     grpZeroSetLoop\name
+
+
+        // Save subFFTNum and subFFTSize for next stage
+        str     subFFTNum, [pSubFFTNum]
+        str     subFFTSize, [pSubFFTSize]
+
+        .endm
+
+
+
+        M_START armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace
+        FFTSTAGE "FALSE","FALSE",fwd
+        M_END
+
+
+
+        M_START armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace
+        FFTSTAGE "FALSE","TRUE",inv
+        M_END
+
+        .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_ls_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_ls_s.S
new file mode 100644
index 0000000..e7de11e
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_ls_s.S
@@ -0,0 +1,149 @@
+//
+//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+//  Use of this source code is governed by a BSD-style license
+//  that can be found in the LICENSE file in the root of the source
+//  tree. An additional intellectual property rights grant can be found
+//  in the file PATENTS.  All contributing project authors may
+//  be found in the AUTHORS file in the root of the source tree.
+//
+//  This is a modification of armSP_FFT_CToC_SC32_Radix2_ls_unsafe_s.S
+//  to support float instead of SC32.
+//
+
+//
+// Description:
+// Compute the last stage of a Radix 2 DIT in-order out-of-place FFT
+// stage for a N point complex signal.
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+
+// Import symbols required from other files
+// (For example tables)
+
+
+
+
+// Set debugging level
+//DEBUG_ON    SETL {TRUE}
+
+
+// Guarding implementation by the processor name
+
+
+//Input Registers
+
+#define pSrc            x0
+#define pDst            x1
+#define pTwiddle        x2
+#define	pSubFFTNum	x3
+#define pSubFFTSize	x4	
+
+
+//Output Registers
+
+
+//Local Scratch Registers
+
+
+#define subFFTNum       x5
+#define subFFTSize      x6
+#define outPointStep    x8
+#define grpCount        x9
+#define dstStep         x10
+
+// Neon Registers
+
+#define dWr     v0.2s
+#define dWi     v1.2s
+#define dXr0    v2.2s
+#define dXi0    v3.2s
+#define dXr1    v4.2s
+#define dXi1    v5.2s
+#define dYr0    v6.2s
+#define dYi0    v7.2s
+#define dYr1    v8.2s
+#define dYi1    v9.2s
+#define qT0     v10.2s
+#define qT1     v12.2s
+
+        .MACRO FFTSTAGE scaled, inverse, name
+
+        // Move parameters into our work registers
+        ldr     subFFTSize, [pSubFFTSize]
+
+        lsl     outPointStep, subFFTSize, #3
+
+        // Update grpCount and grpSize rightaway
+
+        MOV     subFFTNum,#1                          //after the last stage
+        LSL     grpCount,subFFTSize,#1
+
+        // update subFFTSize for the next stage
+        MOV     subFFTSize,grpCount
+
+        rsb     dstStep,outPointStep,#16
+
+        // Loop on 2 grps at a time for the last stage
+
+radix2lsGrpLoop\name :
+        // dWr = [pTwiddle[0].Re, pTwiddle[1].Re]
+        // dWi = [pTwiddle[0].Im, pTwiddle[1].Im]
+        ld2     {dWr,dWi},[pTwiddle], #16
+
+        // dXr0 = [pSrc[0].Re, pSrc[2].Re]
+        // dXi0 = [pSrc[0].Im, pSrc[2].Im]
+        // dXr1 = [pSrc[1].Re, pSrc[3].Re]
+        // dXi1 = [pSrc[1].Im, pSrc[3].Im]
+        ld4     {dXr0,dXi0,dXr1,dXi1}, [pSrc], #32
+
+        SUBS    grpCount,grpCount,#4                  // grpCount is multiplied by 2
+
+        .ifeqs  "\inverse", "TRUE"
+            fmul   qT0,dWr,dXr1
+            fmla   qT0,dWi,dXi1                       // real part
+            fmul   qT1,dWr,dXi1
+            fmls   qT1,dWi,dXr1                       // imag part
+
+        .else
+
+            fmul   qT0,dWr,dXr1
+            fmls   qT0,dWi,dXi1                       // real part
+            fmul   qT1,dWr,dXi1
+            fmla   qT1,dWi,dXr1                       // imag part
+
+        .endif
+
+        fsub    dYr0,dXr0,qT0
+        fsub    dYi0,dXi0,qT1
+        fadd    dYr1,dXr0,qT0
+        fadd    dYi1,dXi0,qT1
+
+        st2     {dYr0,dYi0},[pDst],outPointStep
+        st2     {dYr1,dYi1},[pDst],dstStep            // dstStep =  step = -outPointStep + 16
+
+        BGT     radix2lsGrpLoop\name
+
+
+        .endm
+
+
+
+        M_START armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace,,d12
+        FFTSTAGE "FALSE","FALSE",fwd
+        M_END
+
+
+
+        M_START armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace,,d12
+        FFTSTAGE "FALSE","TRUE",inv
+        M_END
+
+        .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_s.S
new file mode 100644
index 0000000..530a815
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_s.S
@@ -0,0 +1,185 @@
+//
+//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+//  Use of this source code is governed by a BSD-style license
+//  that can be found in the LICENSE file in the root of the source
+//  tree. An additional intellectual property rights grant can be found
+//  in the file PATENTS.  All contributing project authors may
+//  be found in the AUTHORS file in the root of the source tree.
+//
+//  This is a modification of armSP_FFT_CToC_SC32_Radix2_unsafe_s.s
+//  to support float instead of SC32.
+//
+
+// Description:
+// Compute a Radix 2 DIT in-order out-of-place FFT stage for an N point
+// complex signal.  This handles the general stage, not the first or last
+// stage.
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+
+// Import symbols required from other files
+// (For example tables)
+
+
+
+// Set debugging level
+//DEBUG_ON    SETL {TRUE}
+
+
+
+// Guarding implementation by the processor name
+
+
+
+
+// Guarding implementation by the processor name
+
+//Input Registers
+
+#define pSrc            x0
+#define pDst            x1
+#define pTwiddle        x2
+#define	pSubFFTNum	x3
+#define pSubFFTSize	x4	
+
+
+//Output Registers
+
+
+//Local Scratch Registers
+
+#define subFFTNum       x5
+#define subFFTSize      x6
+#define outPointStep    x8
+#define pointStep       x9
+#define pointStep32     w9
+#define grpCount        x10
+#define grpCount32      w10
+#define setCount        x13
+#define step            x15
+#define dstStep         x11
+
+// Neon Registers
+
+#define dW      v0.2s
+#define dX0     v2.2s
+#define dX1     v3.2s
+#define dX2     v4.2s
+#define dX3     v5.2s
+#define dY0     v6.2s
+#define dY1     v7.2s
+#define dY2     v8.2s
+#define dY3     v9.2s
+#define qT0     v10.2s
+#define qT1     v11.2s
+
+        .MACRO FFTSTAGE scaled, inverse, name
+
+        // Define stack arguments
+
+        // Move args values into our work registers
+        ldr     subFFTNum, [pSubFFTNum]
+        ldr     subFFTSize, [pSubFFTSize]
+
+        // Update grpCount and grpSize rightaway inorder to reuse pGrpCount
+        // and pGrpSize regs
+
+        LSR     subFFTNum,subFFTNum,#1                 //grpSize
+        LSL     grpCount,subFFTSize,#1
+
+
+        // pT0+1 increments pT0 by 8 bytes
+        // pT0+pointStep = increment of 8*pointStep bytes = 4*grpSize bytes
+        lsl     pointStep, subFFTNum, #2
+
+        // update subFFTSize for the next stage
+        MOV     subFFTSize,grpCount
+
+        // pOut0+1 increments pOut0 by 8 bytes
+        // pOut0+outPointStep == increment of 8*outPointStep bytes =
+        //    4*size bytes
+        smull   outPointStep, grpCount32, pointStep32
+
+        LSL     pointStep,pointStep,#1
+
+
+        rsb      step,pointStep,#16
+        rsb      dstStep,outPointStep,#16
+
+        // Loop on the groups
+
+radix2GrpLoop\name :
+        lsr     setCount, pointStep, #3
+        LD1     {dW},[pTwiddle],pointStep              //[wi | wr]
+
+
+        // Loop on the sets
+
+
+radix2SetLoop\name :
+
+
+        // point0: dX0-real part dX1-img part
+        LD2    {dX0,dX1},[pSrc],pointStep
+        // point1: dX2-real part dX3-img part
+        LD2    {dX2,dX3},[pSrc],step
+
+        SUBS    setCount,setCount,#2
+
+        .ifeqs  "\inverse", "TRUE"
+            fmul   qT0,dX2,dW[0]
+            fmla   qT0,dX3,dW[1]                       // real part
+            fmul   qT1,dX3,dW[0]
+            fmls   qT1,dX2,dW[1]                       // imag part
+
+        .else
+
+            fmul   qT0,dX2,dW[0]
+            fmls   qT0,dX3,dW[1]                       // real part
+            fmul   qT1,dX3,dW[0]
+            fmla   qT1,dX2,dW[1]                       // imag part
+
+        .endif
+
+        fsub    dY0,dX0,qT0
+        fsub    dY1,dX1,qT1
+        fadd    dY2,dX0,qT0
+        fadd    dY3,dX1,qT1
+
+        st2    {dY0,dY1},[pDst],outPointStep
+        // dstStep = -outPointStep + 16
+        st2    {dY2,dY3},[pDst],dstStep
+
+        BGT     radix2SetLoop\name
+
+        SUBS    grpCount,grpCount,#2
+        ADD     pSrc,pSrc,pointStep
+        BGT     radix2GrpLoop\name
+
+
+        str     subFFTNum, [pSubFFTNum]
+        str     subFFTSize, [pSubFFTSize]
+        .endm
+
+
+
+        M_START armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace,,d11
+        FFTSTAGE "FALSE","FALSE",FWD
+        M_END
+
+
+
+        M_START armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace,,d11
+        FFTSTAGE "FALSE","TRUE",INV
+        M_END
+
+
+        .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_fs_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_fs_s.S
new file mode 100644
index 0000000..624ef3e
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_fs_s.S
@@ -0,0 +1,266 @@
+//
+//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+//  Use of this source code is governed by a BSD-style license
+//  that can be found in the LICENSE file in the root of the source
+//  tree. An additional intellectual property rights grant can be found
+//  in the file PATENTS.  All contributing project authors may
+//  be found in the AUTHORS file in the root of the source tree.
+//
+//
+//  This is a modification of armSP_FFT_CToC_SC32_Radix4_fs_unsafe_s.s
+//  to support float instead of SC32.
+//
+
+//
+// Description:
+// Compute a first stage Radix 4 FFT stage for a N point complex signal
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+// Import symbols required from other files
+// (For example tables)
+
+
+
+
+// Set debugging level
+//DEBUG_ON    SETL {TRUE}
+
+
+
+// Guarding implementation by the processor name
+
+
+
+// Guarding implementation by the processor name
+
+//Input Registers
+
+#define pSrc            x0
+#define pDst            x1
+#define pTwiddle        x2
+#define	pSubFFTNum	x3
+#define pSubFFTSize	x4	
+
+
+//Output Registers
+
+
+//Local Scratch Registers
+
+#define subFFTNum       x5
+#define subFFTSize      x6
+#define grpSize         x7
+// Reuse grpSize as setCount
+#define setCount        x7
+#define pointStep       x8
+#define outPointStep    x8
+#define setStep         x9
+#define step1           x10
+#define step3           x11
+
+// Neon Registers
+
+#define dXr0    v0.2s
+#define dXi0    v1.2s
+#define dXr1    v2.2s
+#define dXi1    v3.2s
+#define dXr2    v4.2s
+#define dXi2    v5.2s
+#define dXr3    v6.2s
+#define dXi3    v7.2s
+#define dYr0    v8.2s
+#define dYi0    v9.2s
+#define dYr1    v10.2s
+#define dYi1    v11.2s
+#define dYr2    v12.2s
+#define dYi2    v13.2s
+#define dYr3    v14.2s
+#define dYi3    v15.2s
+#define dZr0    v16.2s
+#define dZi0    v17.2s
+#define dZr1    v18.2s
+#define dZi1    v19.2s
+#define dZr2    v20.2s
+#define dZi2    v21.2s
+#define dZr3    v22.2s
+#define dZi3    v23.2s
+
+
+        .MACRO FFTSTAGE scaled, inverse, name
+
+        // Define stack arguments
+
+        // Move args values into our work registers
+        ldr     subFFTNum, [pSubFFTNum]
+        ldr     subFFTSize, [pSubFFTSize]
+        
+        // pT0+1 increments pT0 by 8 bytes
+        // pT0+pointStep = increment of 8*pointStep bytes = 2*grpSize bytes
+        // Note: outPointStep = pointStep for firststage
+
+        lsl     pointStep, subFFTNum, #1
+
+        // Update pSubFFTSize and pSubFFTNum regs
+        ld2     {dXr0,dXi0}, [pSrc], pointStep          // data[0]
+
+        // subFFTSize = 1 for the first stage
+        MOV     subFFTSize,#4
+
+        // Note: setCount = subFFTNum/4 (reuse the grpSize reg for setCount)
+        LSR     grpSize,subFFTNum,#2
+        ld2     {dXr1,dXi1}, [pSrc], pointStep          //  data[1]
+        MOV     subFFTNum,grpSize
+
+
+        // Calculate the step of input data for the next set
+        //MOV     setStep,pointStep,LSL #1
+        lsl     setStep, grpSize, #4
+        ld2     {dXr2,dXi2}, [pSrc], pointStep          //  data[2]
+
+        // setStep = 3*pointStep
+        ADD     setStep,setStep,pointStep
+        // setStep = - 3*pointStep+16
+
+        rsb     setStep,setStep,#16
+        //  data[3] & update pSrc for the next set
+        ld2     {dXr3,dXi3}, [pSrc], setStep
+
+        // step1 = 2*pointStep
+        lsl     step1, pointStep, #1
+
+        // fadd qY0, qX0, qX2
+        fadd    dYr0, dXr0, dXr2        
+        fadd    dYi0, dXi0, dXi2
+        // step3 = -pointStep
+        neg     step3, pointStep
+
+        // grp = 0 a special case since all the twiddle factors are 1
+        // Loop on the sets : 2 sets at a time
+
+radix4fsGrpZeroSetLoop\name :
+
+
+
+        // Decrement setcount
+        SUBS    setCount,setCount,#2
+
+
+        // finish first stage of 4 point FFT
+
+
+        // fsub qy2,qx0,qx2
+        fsub    dYr2, dXr0, dXr2
+        fsub    dYi2, dXi0, dXi2
+
+        ld2     {dXr0,dXi0}, [pSrc], step1              //  data[0]
+        // fadd qy1,qx1,qx3     
+        fadd    dYr1, dXr1, dXr3                    
+        fadd    dYi1, dXi1, dXi3
+        ld2     {dXr2,dXi2}, [pSrc], step3              //  data[2]
+        // fsub qy3,qx1,qx3     
+        fsub    dYr3, dXr1, dXr3                    
+        fsub    dYi3, dXi1, dXi3
+
+
+        // finish second stage of 4 point FFT
+
+        .ifeqs "\inverse", "TRUE"
+
+            ld2     {dXr1,dXi1}, [pSrc], step1          //  data[1]
+            // fadd  qz0,qy0,qy1
+            fadd    dZr0, dYr0, dYr1                    
+            fadd    dZi0, dYi0, dYi1
+
+            //  data[3] & update pSrc for the next set, but not if it's the
+            //  last iteration so that we don't read past the end of the 
+            //  input array.
+            BEQ     radix4SkipLastUpdateInv\name
+            ld2     {dXr3,dXi3}, [pSrc], setStep
+
+radix4SkipLastUpdateInv\name:
+            FSUB    dZr3,dYr2,dYi3
+
+            st2    {dZr0,dZi0},[pDst],outPointStep
+            FADD    dZi3,dYi2,dYr3
+
+            // fsub qZ1,qY0,qY1 
+            FSUB    dZr1, dYr0, dYr1
+            FSUB    dZi1, dYi0, dYi1
+            st2    {dZr3,dZi3},[pDst],outPointStep
+
+            FADD    dZr2,dYr2,dYi3
+            st2    {dZr1,dZi1},[pDst],outPointStep
+            FSUB    dZi2,dYi2,dYr3
+
+            //  fadd qY0, qX0, qX2
+            FADD    dYr0, dXr0, dXr2                    // u0 for next iteration
+            FADD    dYi0, dXi0, dXi2
+            st2    {dZr2,dZi2},[pDst],setStep
+
+
+        .else
+
+            ld2     {dXr1,dXi1}, [pSrc], step1          //  data[1]
+            // fadd qZ0,qY0,qY1
+            fadd    dZr0, dYr0, dYr1
+            fadd    dZi0, dYi0, dYi1
+
+            //  data[3] & update pSrc for the next set, but not if it's the
+            //  last iteration so that we don't read past the end of the 
+            //  input array.
+            BEQ     radix4SkipLastUpdateFwd\name
+            ld2     {dXr3,dXi3}, [pSrc], setStep
+
+radix4SkipLastUpdateFwd\name:
+            FADD    dZr2,dYr2,dYi3
+
+            st2    {dZr0,dZi0},[pDst],outPointStep
+            FSUB    dZi2,dYi2,dYr3
+
+            // fsub  qz1,qy0,qy1
+            fsub    dZr1, dYr0, dYr1
+            fsub    dZi1, dYi0, dYi1
+            st2    {dZr2,dZi2},[pDst],outPointStep
+
+            FSUB    dZr3,dYr2,dYi3
+            st2    {dZr1,dZi1},[pDst],outPointStep
+            FADD    dZi3,dYi2,dYr3
+
+            //  fadd  qy0,qx0,qx2
+            fadd    dYr0, dXr0, dXr2                    // u0 for next iteration
+            fadd    dYi0, dXi0, dXi2
+
+            st2    {dZr3,dZi3},[pDst],setStep
+
+        .endif
+
+        BGT     radix4fsGrpZeroSetLoop\name
+
+        // Save subFFTNum and subFFTSize for next stage
+        str     subFFTNum, [pSubFFTNum]
+        str     subFFTSize, [pSubFFTSize]
+        
+        .endm
+
+
+
+        M_START armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace,,d15
+        FFTSTAGE "FALSE","FALSE",fwd
+        M_END
+
+
+
+        M_START armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace,,d15
+        FFTSTAGE "FALSE","TRUE",inv
+        M_END
+
+
+        .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_ls_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_ls_s.S
new file mode 100644
index 0000000..2fc2e60
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_ls_s.S
@@ -0,0 +1,371 @@
+//
+//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+//  Use of this source code is governed by a BSD-style license
+//  that can be found in the LICENSE file in the root of the source
+//  tree. An additional intellectual property rights grant can be found
+//  in the file PATENTS.  All contributing project authors may
+//  be found in the AUTHORS file in the root of the source tree.
+//
+//  This is a modification of armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.s
+//  to support float instead of SC32.
+//
+
+//
+// Description:
+// Compute a Radix 4 FFT stage for a N point complex signal
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+// Import symbols required from other files
+// (For example tables)
+
+
+
+
+// Set debugging level
+//DEBUG_ON    SETL {TRUE}
+
+
+// Guarding implementation by the processor name
+
+
+// Import symbols required from other files
+// (For example tables)
+    //IMPORT  armAAC_constTable
+
+//Input Registers
+
+#define pSrc            x0
+#define pDst            x1
+#define pTwiddle        x2
+#define	pSubFFTNum	x3
+#define pSubFFTSize	x4	
+
+
+
+//Output Registers
+
+
+//Local Scratch Registers
+
+#define subFFTNum       x5
+#define subFFTSize      x6
+#define outPointStep    x8
+#define grpCount        x9
+#define dstStep         x10
+#define grpTwStep       x13
+#define stepTwiddle     x14
+#define twStep          x15
+#define step16          x11
+#define step24          x12
+
+
+// Neon Registers
+
+#define dButterfly1Real02       v0.2s
+#define dButterfly1Real028b     v0.8b
+#define dButterfly1Imag02       v1.2s
+#define dButterfly1Imag028b     v1.8b
+#define dButterfly1Real13       v2.2s
+#define dButterfly1Real138b     v2.8b
+#define dButterfly1Imag13       v3.2s
+#define dButterfly1Imag138b     v3.8b
+#define dButterfly2Real02       v4.2s
+#define dButterfly2Imag02       v5.2s
+#define dButterfly2Real13       v6.2s
+#define dButterfly2Imag13       v7.2s
+#define dXr0                    v0.2s
+#define dXi0                    v1.2s
+#define dXr08b                  v0.8b
+#define dXi08b                  v1.8b
+#define dXr1                    v2.2s
+#define dXi1                    v3.2s
+#define dXr2                    v4.2s
+#define dXi2                    v5.2s
+#define dXr3                    v6.2s
+#define dXi3                    v7.2s
+
+#define dYr0                    v16.2s
+#define dYi0                    v17.2s
+#define dYr1                    v18.2s
+#define dYi1                    v19.2s
+#define dYr2                    v20.2s
+#define dYi2                    v21.2s
+#define dYr3                    v22.2s
+#define dYi3                    v23.2s
+
+#define dW1r                    v8.2s
+#define dW1i                    v9.2s
+#define dW2r                    v10.2s
+#define dW2r8b                  v10.8b
+#define dW2i                    v11.2s
+#define dW3r                    v12.2s
+#define dW3r8b                  v12.8b
+#define dW3i                    v13.2s
+
+#define dZr0                    v14.2s
+#define dZi0                    v15.2s
+#define dZr08b                  v14.8b
+#define dZi08b                  v15.8b
+#define dZr1                    v26.2s
+#define dZi1                    v27.2s
+#define dZr2                    v28.2s
+#define dZi2                    v29.2s
+#define dZr3                    v30.2s
+#define dZi3                    v31.2s
+
+#define dZip                    v24.2s
+#define dZip8b                  v24.8b
+
+        .MACRO FFTSTAGE scaled, inverse , name
+
+        // Define stack arguments
+
+        // Move args values into our work registers
+        ldr     subFFTNum, [pSubFFTNum]
+        ldr     subFFTSize, [pSubFFTSize]
+
+        // pOut0+1 increments pOut0 by 8 bytes
+        // pOut0+outPointStep == increment of 8*outPointStep bytes
+        lsl     outPointStep,subFFTSize, #3
+
+        // Update grpCount and grpSize rightaway
+
+        ld2    {dW1r,dW1i},[pTwiddle]             // [wi|wr]
+        MOV     step16,#16
+        LSL     grpCount,subFFTSize,#2
+
+        ld1    {dW2r},[pTwiddle]                  // [wi|wr]
+        MOV     subFFTNum,#1                      //after the last stage
+
+        ld1    {dW3r},[pTwiddle],step16           // [wi|wr]
+        MOV     stepTwiddle,#0
+
+        ld1    {dW2i},[pTwiddle],#8               // [wi|wr]
+        SUB     grpTwStep,stepTwiddle,#8          // grpTwStep = -8 to start with
+
+        // update subFFTSize for the next stage
+        MOV     subFFTSize,grpCount
+        ld1    {dW3i},[pTwiddle],grpTwStep        // [wi|wr]
+        lsl     dstStep,outPointStep, #1
+
+        // AC.r AC.i BD.r BD.i
+        ld4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc], #32
+        ADD     dstStep,dstStep,outPointStep      // dstStep = 3*outPointStep
+
+        rsb     dstStep,dstStep,#16               // dstStep = - 3*outPointStep+16
+        MOV     step24,#24
+
+        // AC.r AC.i BD.r BD.i
+        ld4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc], #32
+
+
+        // Process two groups at a time
+
+radix4lsGrpLoop\name :
+
+        // VZIP    dW2r,dW2i
+        zip1    dZip, dW2r, dW2i
+        zip2    dW2i, dW2r, dW2i
+        mov     dW2r8b, dZip8b
+
+        ADD     stepTwiddle,stepTwiddle,#16
+
+        // VZIP    dW3r,dW3i
+        zip1    dZip, dW3r,dW3i
+        zip2    dW3i, dW3r, dW3i
+        mov     dW3r8b, dZip8b
+        ADD     grpTwStep,stepTwiddle,#4
+
+        // VUZP     dButterfly1Real13, dButterfly2Real13      // B.r D.r
+        uzp1     dZip, dButterfly1Real13, dButterfly2Real13   // B.r D.r
+        uzp2     dButterfly2Real13, dButterfly1Real13, dButterfly2Real13   // B.r D.r
+        mov      dButterfly1Real138b, dZip8b
+
+        SUB     twStep,stepTwiddle,#16                        // -16+stepTwiddle
+
+        // VUZP     dButterfly1Imag13, dButterfly2Imag13      // B.i D.i
+        uzp1     dZip, dButterfly1Imag13, dButterfly2Imag13   // B.i D.i
+        uzp2     dButterfly2Imag13, dButterfly1Imag13, dButterfly2Imag13   // B.i D.i
+        mov      dButterfly1Imag138b, dZip8b
+        lsl     grpTwStep,grpTwStep,#1
+
+        // VUZP     dButterfly1Real02, dButterfly2Real02      // A.r C.r
+        uzp1     dZip, dButterfly1Real02, dButterfly2Real02   // A.r C.r
+        uzp2     dButterfly2Real02, dButterfly1Real02, dButterfly2Real02   // A.r C.r
+        mov      dButterfly1Real028b, dZip8b
+        rsb     grpTwStep,grpTwStep,#0                        // -8-2*stepTwiddle
+
+        // VUZP     dButterfly1Imag02, dButterfly2Imag02      // A.i C.i
+        uzp1     dZip, dButterfly1Imag02, dButterfly2Imag02   // A.i C.i
+        uzp2     dButterfly2Imag02, dButterfly1Imag02, dButterfly2Imag02   // A.i C.i
+        mov      dButterfly1Imag028b, dZip8b
+
+
+        // grpCount is multiplied by 4
+        SUBS    grpCount,grpCount,#8
+
+        .ifeqs  "\inverse", "TRUE"
+            fmul   dZr1,dW1r,dXr1
+            fmla   dZr1,dW1i,dXi1                       // real part
+            fmul   dZi1,dW1r,dXi1
+            fmls   dZi1,dW1i,dXr1                       // imag part
+
+        .else
+
+            fmul   dZr1,dW1r,dXr1
+            fmls   dZr1,dW1i,dXi1                       // real part
+            fmul   dZi1,dW1r,dXi1
+            fmla   dZi1,dW1i,dXr1                       // imag part
+
+        .endif
+
+        ld2    {dW1r,dW1i},[pTwiddle],stepTwiddle       // [wi|wr]
+
+        .ifeqs  "\inverse", "TRUE"
+            fmul   dZr2,dW2r,dXr2
+            fmla   dZr2,dW2i,dXi2                       // real part
+            fmul   dZi2,dW2r,dXi2
+            ld1   {dW2r},[pTwiddle],step16              // [wi|wr]
+            fmls   dZi2,dW2i,dXr2                       // imag part
+
+        .else
+
+            fmul   dZr2,dW2r,dXr2
+            fmls   dZr2,dW2i,dXi2                       // real part
+            fmul   dZi2,dW2r,dXi2
+            ld1    {dW2r},[pTwiddle],step16             // [wi|wr]
+            fmla   dZi2,dW2i,dXr2                       // imag part
+
+        .endif
+
+
+        ld1    {dW2i},[pTwiddle],twStep                 // [wi|wr]
+
+        // move qX0 so as to load for the next iteration
+        // MOV     qZ0,qX0
+        mov     dZr08b, dXr08b
+        mov     dZi08b, dXi08b
+
+        .ifeqs  "\inverse", "TRUE"
+            fmul   dZr3,dW3r,dXr3
+            fmla   dZr3,dW3i,dXi3                       // real part
+            fmul   dZi3,dW3r,dXi3
+            ld1    {dW3r},[pTwiddle],step24
+            fmls   dZi3,dW3i,dXr3                       // imag part
+
+        .else
+
+            fmul   dZr3,dW3r,dXr3
+            fmls   dZr3,dW3i,dXi3                       // real part
+            fmul   dZi3,dW3r,dXi3
+            ld1    {dW3r},[pTwiddle],step24
+            fmla   dZi3,dW3i,dXr3                       // imag part
+
+        .endif
+
+        ld1    {dW3i},[pTwiddle],grpTwStep              // [wi|wr]
+
+        // Don't do the load on the last iteration so we don't read past the end
+        // of pSrc.
+        bne     skipIncrement\name
+        add     pSrc, pSrc, #64
+skipIncrement\name:     
+        beq     radix4lsSkipRead\name
+        // AC.r AC.i BD.r BD.i
+        ld4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc], #32
+
+        // AC.r AC.i BD.r BD.i
+        ld4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc], #32
+radix4lsSkipRead\name:
+
+        // finish first stage of 4 point FFT
+
+        // fadd    qY0,qZ0,qZ2
+        fadd    dYr0,dZr0,dZr2
+        fadd    dYi0,dZi0,dZi2
+        // fsub    qY2,qZ0,qZ2
+        fsub    dYr2,dZr0,dZr2
+        fsub    dYi2,dZi0,dZi2
+        // fadd    qY1,qZ1,qZ3
+        fadd    dYr1,dZr1,dZr3
+        fadd    dYi1,dZi1,dZi3
+        // fsub    qY3,qZ1,qZ3
+        fsub    dYr3,dZr1,dZr3
+        fsub    dYi3,dZi1,dZi3
+
+
+        // finish second stage of 4 point FFT
+
+        .ifeqs  "\inverse", "TRUE"
+
+            // fsub    qZ0,qY2,qY1
+            fsub    dZr0,dYr2,dYr1
+            fsub    dZi0,dYi2,dYi1
+            fadd    dZr3,dYr0,dYi3
+            st2    {dZr0,dZi0},[pDst],outPointStep
+            fsub    dZi3,dYi0,dYr3
+
+            // fadd    qZ2,qY2,qY1
+            fadd    dZr2,dYr2,dYr1
+            fadd    dZi2,dYi2,dYi1
+
+            st2    {dZr3,dZi3},[pDst],outPointStep
+
+            fsub    dZr1,dYr0,dYi3
+            st2    {dZr2,dZi2},[pDst],outPointStep
+            fadd    dZi1,dYi0,dYr3
+
+            // dstStep = -outPointStep + 16
+            st2    {dZr1,dZi1},[pDst],dstStep
+
+
+        .else
+
+            // fsub    qZ0,qY2,qY1
+            fsub    dZr0,dYr2,dYr1
+            fsub    dZi0,dYi2,dYi1
+
+            fsub    dZr1,dYr0,dYi3
+            st2    {dZr0,dZi0},[pDst],outPointStep
+            fadd    dZi1,dYi0,dYr3
+
+            // fadd    qZ2,qY2,qY1
+            fadd    dZr2,dYr2,dYr1
+            fadd    dZi2,dYi2,dYi1
+
+            st2    {dZr1,dZi1},[pDst],outPointStep
+
+            fadd    dZr3,dYr0,dYi3
+            st2    {dZr2,dZi2},[pDst],outPointStep
+            fsub    dZi3,dYi0,dYr3
+
+            // dstStep = -outPointStep + 16
+            st2    {dZr3,dZi3},[pDst],dstStep
+
+
+        .endif
+
+        BGT     radix4lsGrpLoop\name
+
+        .endm
+
+
+        M_START armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace,,d15
+        FFTSTAGE "FALSE","FALSE",fwd
+        M_END
+
+
+        M_START armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace,,d15
+        FFTSTAGE "FALSE","TRUE",inv
+        M_END
+
+
+        .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_s.S
new file mode 100644
index 0000000..830fd16
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_s.S
@@ -0,0 +1,339 @@
+//
+//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+//  Use of this source code is governed by a BSD-style license
+//  that can be found in the LICENSE file in the root of the source
+//  tree. An additional intellectual property rights grant can be found
+//  in the file PATENTS.  All contributing project authors may
+//  be found in the AUTHORS file in the root of the source tree.
+//
+//
+//  This is a modification of armSP_FFT_CToC_SC32_Radix4_unsafe_s.s
+//  to support float instead of SC32.
+//
+
+//
+// Description:
+// Compute a Radix 4 FFT stage for a N point complex signal
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+
+// Import symbols required from other files
+// (For example tables)
+
+
+
+
+// Set debugging level
+//DEBUG_ON    SETL {TRUE}
+
+
+
+// Guarding implementation by the processor name
+
+
+
+
+// Guarding implementation by the processor name
+
+
+// Import symbols required from other files
+// (For example tables)
+
+
+//Input Registers
+
+#define pSrc            x0
+#define pDst            x1
+#define pTwiddle        x2
+#define	pSubFFTNum	x3
+#define pSubFFTSize	x4	
+
+
+
+//Output Registers
+
+
+//Local Scratch Registers
+
+#define subFFTNum       x5
+#define subFFTSize      x6
+#define grpCount        x7
+#define grpCount32      w7
+#define pointStep       x8
+#define pointStep32     w8
+#define outPointStep    x9
+#define stepTwiddle     x10
+#define setCount        x11
+#define srcStep         x12
+#define setStep         x13
+#define dstStep         x14
+#define twStep          x15
+
+// Neon Registers
+
+#define dW1     v0.2s
+#define dW2     v1.2s
+#define dW3     v2.2s
+
+#define dXr0    v4.2s
+#define dXi0    v5.2s
+#define dXr1    v6.2s
+#define dXi1    v7.2s
+#define dXr2    v8.2s
+#define dXi2    v9.2s
+#define dXr3    v10.2s
+#define dXi3    v11.2s
+#define dYr0    v12.2s
+#define dYi0    v13.2s
+#define dYr1    v14.2s
+#define dYi1    v15.2s
+#define dYr2    v16.2s
+#define dYi2    v17.2s
+#define dYr3    v18.2s
+#define dYi3    v19.2s
+#define dZr0    v20.2s
+#define dZi0    v21.2s
+#define dZr1    v22.2s
+#define dZi1    v23.2s
+#define dZr2    v24.2s
+#define dZi2    v25.2s
+#define dZr3    v26.2s
+#define dZi3    v27.2s
+
+        .MACRO FFTSTAGE scaled, inverse , name
+
+        // Define stack arguments
+
+        // Move args values into our work registers
+        ldr     subFFTNum, [pSubFFTNum]
+        ldr     subFFTSize, [pSubFFTSize]
+
+        // Update grpCount and grpSize rightaway inorder to reuse
+        // pGrpCount and pGrpSize regs
+
+        LSL     grpCount,subFFTSize,#2
+        LSR     subFFTNum,subFFTNum,#2
+        MOV     subFFTSize,grpCount
+
+        ld1      {dW1},[pTwiddle]                    //[wi | wr]
+        // pT0+1 increments pT0 by 8 bytes
+        // pT0+pointStep = increment of 8*pointStep bytes = 2*grpSize bytes
+        lsl     pointStep,subFFTNum, #1
+
+        // pOut0+1 increments pOut0 by 8 bytes
+        // pOut0+outPointStep == increment of 8*outPointStep bytes
+        //   = 2*size bytes
+
+        MOV     stepTwiddle,#0
+        ld1      {dW2},[pTwiddle]                    //[wi | wr]
+        smull   outPointStep,grpCount32,pointStep32
+
+        LSL     pointStep,pointStep,#2             // 2*grpSize
+
+        ld1      {dW3},[pTwiddle]                  //[wi | wr]
+        lsl     srcStep,pointStep, #1              // srcStep = 2*pointStep
+
+        ADD     setStep,srcStep,pointStep          // setStep = 3*pointStep
+
+        rsb     setStep,setStep,#0                 // setStep = - 3*pointStep
+        SUB     srcStep,srcStep,#16                // srcStep = 2*pointStep-16
+
+        lsl     dstStep,outPointStep, #1
+
+        ADD     dstStep,dstStep,outPointStep       // dstStep = 3*outPointStep
+        // dstStep = - 3*outPointStep+16
+        rsb     dstStep,dstStep,#16
+
+
+radix4GrpLoop\name :
+
+        ld2     {dXr0,dXi0},[pSrc],pointStep       //  data[0]
+        ADD      stepTwiddle,stepTwiddle,pointStep
+        ld2     {dXr1,dXi1},[pSrc],pointStep       //  data[1]
+        // set pTwiddle to the first point
+        ADD      pTwiddle,pTwiddle,stepTwiddle
+        ld2     {dXr2,dXi2},[pSrc],pointStep       //  data[2]
+        lsl      twStep,stepTwiddle, #2
+
+        //  data[3] & update pSrc for the next set
+        ld2     {dXr3,dXi3},[pSrc],setStep
+        SUB      twStep,stepTwiddle,twStep         // twStep = -3*stepTwiddle
+
+        lsr      setCount,pointStep, #3
+
+        // set pSrc to data[0] of the next set
+        ADD     pSrc,pSrc,#16
+        // increment to data[1] of the next set
+        ADD     pSrc,pSrc,pointStep
+
+
+        // Loop on the sets
+
+radix4SetLoop\name :
+
+
+
+        .ifeqs  "\inverse", "TRUE"
+            fmul   dZr1,dXr1,dW1[0]
+            fmul   dZi1,dXi1,dW1[0]
+            fmul   dZr2,dXr2,dW2[0]
+            fmul   dZi2,dXi2,dW2[0]
+            fmul   dZr3,dXr3,dW3[0]
+            fmul   dZi3,dXi3,dW3[0]
+
+            fmla   dZr1,dXi1,dW1[1]                // real part
+            fmls   dZi1,dXr1,dW1[1]                // imag part
+
+            //  data[1] for next iteration
+            ld2     {dXr1,dXi1},[pSrc],pointStep
+
+            fmla   dZr2,dXi2,dW2[1]                // real part
+            fmls   dZi2,dXr2,dW2[1]                // imag part
+
+            //  data[2] for next iteration
+            ld2     {dXr2,dXi2},[pSrc],pointStep
+
+            fmla   dZr3,dXi3,dW3[1]                // real part
+            fmls   dZi3,dXr3,dW3[1]                // imag part
+        .else
+            fmul   dZr1,dXr1,dW1[0]
+            fmul   dZi1,dXi1,dW1[0]
+            fmul   dZr2,dXr2,dW2[0]
+            fmul   dZi2,dXi2,dW2[0]
+            fmul   dZr3,dXr3,dW3[0]
+            fmul   dZi3,dXi3,dW3[0]
+
+            fmls   dZr1,dXi1,dW1[1]                // real part
+            fmla   dZi1,dXr1,dW1[1]                // imag part
+
+            //  data[1] for next iteration
+            ld2     {dXr1,dXi1},[pSrc],pointStep
+
+            fmls   dZr2,dXi2,dW2[1]                // real part
+            fmla   dZi2,dXr2,dW2[1]                // imag part
+
+            //  data[2] for next iteration
+            ld2     {dXr2,dXi2},[pSrc],pointStep
+
+            fmls   dZr3,dXi3,dW3[1]                // real part
+            fmla   dZi3,dXr3,dW3[1]                // imag part
+        .endif
+
+        //  data[3] & update pSrc to data[0]
+        // But don't read on the very last iteration because that reads past 
+        // the end of pSrc. The last iteration is grpCount = 4, setCount = 2.
+        cmp     grpCount, #4
+        
+        b.ne    skipUpdate\name
+        cmp     setCount, #2
+        b.ne    skipUpdate\name
+        add     pSrc, pSrc, setStep
+        beq     radix4SkipRead\name
+skipUpdate\name:
+        ld2     {dXr3,dXi3},[pSrc],setStep
+radix4SkipRead\name:
+
+        SUBS    setCount,setCount,#2
+
+        // finish first stage of 4 point FFT
+        // fadd    qY0,qX0,qZ2
+        // fsub    qY2,qX0,qZ2
+        fadd    dYr0,dXr0,dZr2
+        fsub    dYr2,dXr0,dZr2
+        fadd    dYi0,dXi0,dZi2
+        fsub    dYi2,dXi0,dZi2
+
+        //  data[0] for next iteration
+        ld2     {dXr0,dXi0},[pSrc], #16
+        // fadd    qY1,qZ1,qZ3
+        // fsub    qY3,qZ1,qZ3
+        fadd    dYr1,dZr1,dZr3
+        fsub    dYr3,dZr1,dZr3
+        fadd    dYi1,dZi1,dZi3
+        fsub    dYi3,dZi1,dZi3
+
+        // finish second stage of 4 point FFT
+
+        // fsub    qZ0,qY2,qY1
+        fsub    dZr0,dYr2,dYr1
+        fsub    dZi0,dYi2,dYi1
+
+        .ifeqs  "\inverse", "TRUE"
+
+            fadd    dZr3,dYr0,dYi3
+            st2     {dZr0,dZi0},[pDst],outPointStep
+            fsub    dZi3,dYi0,dYr3
+
+            // fadd    qZ2,qY2,qY1
+            fadd    dZr2,dYr2,dYr1
+            fadd    dZi2,dYi2,dYi1
+
+            st2     {dZr3,dZi3},[pDst],outPointStep
+
+            fsub    dZr1,dYr0,dYi3
+            st2     {dZr2,dZi2},[pDst],outPointStep
+            fadd    dZi1,dYi0,dYr3
+
+            st2     {dZr1,dZi1},[pDst],dstStep
+
+
+        .else
+
+            fsub    dZr1,dYr0,dYi3
+            st2     {dZr0,dZi0},[pDst],outPointStep
+            fadd    dZi1,dYi0,dYr3
+
+            // fadd    qZ2,qY2,qY1
+            fadd    dZr2,dYr2,dYr1
+            fadd    dZi2,dYi2,dYi1
+
+            st2     {dZr1,dZi1},[pDst],outPointStep
+
+            fadd    dZr3,dYr0,dYi3
+            st2     {dZr2,dZi2},[pDst],outPointStep
+            fsub    dZi3,dYi0,dYr3
+
+            st2     {dZr3,dZi3},[pDst],dstStep
+
+
+        .endif
+
+        // increment to data[1] of the next set
+        ADD     pSrc,pSrc,pointStep
+        BGT     radix4SetLoop\name
+
+
+        ld1      {dW1},[pTwiddle],stepTwiddle    //[wi | wr]
+        // subtract 4 since grpCount multiplied by 4
+        SUBS    grpCount,grpCount,#4
+        ld1      {dW2},[pTwiddle],stepTwiddle    //[wi | wr]
+        // increment pSrc for the next grp
+        ADD     pSrc,pSrc,srcStep
+        ld1      {dW3},[pTwiddle],twStep         //[wi | wr]
+        BGT     radix4GrpLoop\name
+
+        str     subFFTNum, [pSubFFTNum]
+        str     subFFTSize, [pSubFFTSize]
+
+        .endm
+
+
+        M_START armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace,,d15
+            FFTSTAGE "FALSE","FALSE",FWD
+        M_END
+
+
+        M_START armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace,,d15
+            FFTSTAGE "FALSE","TRUE",INV
+        M_END
+
+
+        .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S
new file mode 100644
index 0000000..f348e6a
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S
@@ -0,0 +1,473 @@
+//
+//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+//  Use of this source code is governed by a BSD-style license
+//  that can be found in the LICENSE file in the root of the source
+//  tree. An additional intellectual property rights grant can be found
+//  in the file PATENTS.  All contributing project authors may
+//  be found in the AUTHORS file in the root of the source tree.
+//
+//  This is a modification of armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.s
+//  to support float instead of SC32.
+//
+
+//
+// Description:
+// Compute a first stage Radix 8 FFT stage for a N point complex signal
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+// Import symbols required from other files
+// (For example tables)
+
+
+// Set debugging level
+//DEBUG_ON    SETL {TRUE}
+
+
+
+// Guarding implementation by the processor name
+
+
+
+
+// Guarding implementation by the processor name
+
+//Input Registers
+
+#define pSrc            x0
+#define pDst            x1
+#define pTwiddle        x2
+#define	pSubFFTNum	x3
+#define pSubFFTSize	x4	
+
+
+//Output Registers
+
+
+//Local Scratch Registers
+
+#define subFFTNum       x5
+#define subFFTSize      x6
+#define grpSize         x7
+// Reuse grpSize as setCount
+#define setCount        x7
+#define pointStep       x8
+#define outPointStep    x8
+#define setStep         x9
+#define step1           x10
+#define step2           x11
+#define t0              w12
+
+
+// Neon Registers
+
+#define dXr0    v0.2s
+#define dXi0    v1.2s
+#define dXr1    v2.2s
+#define dXi1    v3.2s
+#define dXr2    v4.2s
+#define dXi2    v5.2s
+#define dXr3    v6.2s
+#define dXi3    v7.2s
+#define dXr4    v8.2s
+#define dXi4    v9.2s
+#define dXr5    v10.2s
+#define dXi5    v11.2s
+#define dXr6    v12.2s
+#define dXi6    v13.2s
+#define dXr7    v14.2s
+#define dXi7    v15.2s
+#define qX0     v0.4s
+#define qX1     v1.4s
+#define qX2     v2.4s
+#define qX3     v3.4s
+#define qX4     v4.4s
+#define qX5     v5.4s
+#define qX6     v6.4s
+#define qX7     v7.4s
+
+#define dUr0    v16.2s
+#define dUi0    v17.2s
+#define dUr2    v18.2s
+#define dUi2    v19.2s
+#define dUr4    v20.2s
+#define dUi4    v21.2s
+#define dUr6    v22.2s
+#define dUi6    v23.2s
+#define dUr1    v24.2s
+#define dUi1    v25.2s
+#define dUr3    v26.2s
+#define dUi3    v27.2s
+#define dUr5    v28.2s
+#define dUi5    v29.2s
+// reuse dXr7 and dXi7
+#define dUr7    v30.2s
+#define dUi7    v31.2s
+#define qU0     v8.4s
+#define qU1     v12.4s
+#define qU2     v9.4s
+#define qU3     v13.4s
+#define qU4     v10.4s
+#define qU5     v14.4s
+#define qU6     v11.4s
+#define qU7     v15.4s
+
+
+#define dVr0    v24.2s
+#define dVi0    v25.2s
+#define dVr2    v26.2s
+#define dVi2    v27.2s
+#define dVr4    v28.2s
+#define dVi4    v29.2s
+#define dVr6    v30.2s
+#define dVi6    v31.2s
+#define dVr1    v16.2s
+#define dVi1    v17.2s
+#define dVr3    v18.2s
+#define dVi3    v19.2s
+#define dVr5    v20.2s
+#define dVi5    v21.2s
+#define dVr7    v22.2s
+#define dVi7    v23.2s
+#define qV0     v12.4s
+#define qV1     v8.4s
+#define qV2     v13.4s
+#define qV3     v9.4s
+#define qV4     v14.4s
+#define qV5     v10.4s
+#define qV6     v15.4s
+#define qV7     v11.4s
+
+#define dYr0    v16.2s
+#define dYi0    v17.2s
+#define dYr2    v18.2s
+#define dYi2    v19.2s
+#define dYr4    v20.2s
+#define dYi4    v21.2s
+#define dYr6    v22.2s
+#define dYi6    v23.2s
+#define dYr1    v24.2s
+#define dYi1    v25.2s
+#define dYr3    v26.2s
+#define dYi3    v27.2s
+#define dYr5    v28.2s
+#define dYi5    v29.2s
+#define dYr7    v30.2s
+#define dYi7    v31.2s
+#define qY0     v8.4s
+#define qY1     v12.4s
+#define qY2     v9.4s
+#define qY3     v13.4s
+#define qY4     v10.4s
+#define qY5     v14.4s
+#define qY6     v11.4s
+#define qY7     v15.4s
+
+#define dT0     v14.2s
+#define dT0s    v14.s
+#define dT1     v15.2s
+
+        .MACRO FFTSTAGE scaled, inverse, name
+
+        // Define stack arguments
+
+        // Move args values into our work registers
+        ldr     subFFTNum, [pSubFFTNum]
+        ldr     subFFTSize, [pSubFFTSize]
+
+        // Update pSubFFTSize and pSubFFTNum regs
+        // subFFTSize = 1 for the first stage
+
+        movz    t0, 0x3f35, lsl #16               // High half word of sqrt(1/2).
+        movk    t0, 0x04f3                        // Low half word of sqrt(1/2).
+        MOV     subFFTSize,#8
+
+        // Note: setCount = subFFTNum/8 (reuse the grpSize reg for setCount)
+        LSR     grpSize,subFFTNum,#3
+        MOV     subFFTNum,grpSize
+
+
+        // pT0+1 increments pT0 by 8 bytes
+        // pT0+pointStep = increment of 8*pointStep bytes = grpSize bytes
+        // Note: outPointStep = pointStep for firststage
+
+        lsl     pointStep,grpSize, #3
+
+
+        // Calculate the step of input data for the next set
+        //MOV     step1,pointStep,LSL #1             // step1 = 2*pointStep
+        ld2     {dXr0,dXi0},[pSrc],pointStep         //  data[0]
+        lsl     step1,grpSize, #4
+        lsl     step2,pointStep, #3
+
+        ld2     {dXr1,dXi1},[pSrc],pointStep         //  data[1]
+        SUB     step2,step2,pointStep                // step2 = 7*pointStep
+        // setStep = - 7*pointStep+16
+        rsb     setStep,step2,#16
+
+        ld2     {dXr2,dXi2},[pSrc],pointStep         //  data[2]
+        ld2     {dXr3,dXi3},[pSrc],pointStep         //  data[3]
+        ld2     {dXr4,dXi4},[pSrc],pointStep         //  data[4]
+        ld2     {dXr5,dXi5},[pSrc],pointStep         //  data[5]
+        ld2     {dXr6,dXi6},[pSrc],pointStep         //  data[6]
+        //  data[7] & update pSrc for the next set
+        //  setStep = -7*pointStep + 16
+        ld2     {dXr7,dXi7},[pSrc],setStep
+        // grp = 0 a special case since all the twiddle factors are 1
+        // Loop on the sets
+
+radix8fsGrpZeroSetLoop\name :
+
+        // Decrement setcount
+        SUBS    setCount,setCount,#2
+
+
+        // finish first stage of 8 point FFT
+
+        // fadd    qU0,qX0,qX4
+        // fadd    qU2,qX1,qX5
+        // fadd    qU4,qX2,qX6
+        // fadd    qU6,qX3,qX7
+        fadd    dUr0,dXr0,dXr4
+        fadd    dUr2,dXr1,dXr5
+        fadd    dUr4,dXr2,dXr6
+        fadd    dUr6,dXr3,dXr7
+        fadd    dUi0,dXi0,dXi4
+        fadd    dUi2,dXi1,dXi5
+        fadd    dUi4,dXi2,dXi6
+        fadd    dUi6,dXi3,dXi7
+
+        // finish second stage of 8 point FFT
+
+        // fadd    qV0,qU0,qU4
+        // fsub    qV2,qU0,qU4
+        // fadd    qV4,qU2,qU6
+        // fsub    qV6,qU2,qU6
+        fadd    dVr0,dUr0,dUr4
+        fsub    dVr2,dUr0,dUr4
+        fadd    dVr4,dUr2,dUr6
+        fsub    dVr6,dUr2,dUr6
+        fadd    dVi0,dUi0,dUi4
+        fsub    dVi2,dUi0,dUi4
+        fadd    dVi4,dUi2,dUi6
+        fsub    dVi6,dUi2,dUi6
+
+        // finish third stage of 8 point FFT
+
+        // fadd    qY0,qV0,qV4
+        // fsub    qY4,qV0,qV4
+        fadd    dYr0,dVr0,dVr4
+        fsub    dYr4,dVr0,dVr4
+        fadd    dYi0,dVi0,dVi4
+        fsub    dYi4,dVi0,dVi4
+
+        st2     {dYr0,dYi0},[pDst],step1         // store y0
+
+        .ifeqs  "\inverse", "TRUE"
+
+            fsub    dYr2,dVr2,dVi6
+            fadd    dYi2,dVi2,dVr6
+
+            fadd    dYr6,dVr2,dVi6
+            st2     {dYr2,dYi2},[pDst],step1     // store y2
+            fsub    dYi6,dVi2,dVr6
+
+            // fsub    qU1,qX0,qX4
+            fsub    dUr1,dXr0,dXr4
+            fsub    dUi1,dXi0,dXi4
+
+            st2     {dYr4,dYi4},[pDst],step1     // store y4
+
+            // fsub    qU3,qX1,qX5
+            // fsub    qU5,qX2,qX6
+            fsub    dUr3,dXr1,dXr5
+            fsub    dUr5,dXr2,dXr6
+            fsub    dUi3,dXi1,dXi5
+            fsub    dUi5,dXi2,dXi6
+
+            st2     {dYr6,dYi6},[pDst],step1     // store y6
+
+        .ELSE
+
+            fadd    dYr6,dVr2,dVi6
+            fsub    dYi6,dVi2,dVr6
+
+            fsub    dYr2,dVr2,dVi6
+            st2     {dYr6,dYi6},[pDst],step1     // store y2
+            fadd    dYi2,dVi2,dVr6
+
+
+            // fsub    qU1,qX0,qX4
+            fsub    dUr1,dXr0,dXr4
+            fsub    dUi1,dXi0,dXi4
+
+            st2     {dYr4,dYi4},[pDst],step1     // store y4
+
+            // fsub    qU3,qX1,qX5
+            // fsub    qU5,qX2,qX6
+            fsub    dUr3,dXr1,dXr5
+            fsub    dUr5,dXr2,dXr6
+            fsub    dUi3,dXi1,dXi5
+            fsub    dUi5,dXi2,dXi6
+
+            st2     {dYr2,dYi2},[pDst],step1     // store y6
+
+
+        .ENDIF
+
+        // finish first stage of 8 point FFT
+
+        // fsub    qU7,qX3,qX7
+        fsub    dUr7,dXr3,dXr7
+        fsub    dUi7,dXi3,dXi7
+
+        mov     dT0s[0], t0
+
+        // finish second stage of 8 point FFT
+
+        fsub    dVr1,dUr1,dUi5
+        //  data[0] for next iteration
+        ld2     {dXr0,dXi0},[pSrc],pointStep
+        fadd    dVi1,dUi1,dUr5
+        fadd    dVr3,dUr1,dUi5
+        ld2     {dXr1,dXi1},[pSrc],pointStep     //  data[1]
+        fsub    dVi3,dUi1,dUr5
+
+        fsub    dVr5,dUr3,dUi7
+        ld2     {dXr2,dXi2},[pSrc],pointStep     //  data[2]
+        fadd    dVi5,dUi3,dUr7
+        fadd    dVr7,dUr3,dUi7
+        ld2     {dXr3,dXi3},[pSrc],pointStep     //  data[3]
+        fsub    dVi7,dUi3,dUr7
+
+        // finish third stage of 8 point FFT
+
+        .ifeqs  "\inverse", "TRUE"
+
+            // calculate a*v5
+            fmul    dT1,dVr5,dT0[0]              // use dVi0 for dT1
+
+            ld2     {dXr4,dXi4},[pSrc],pointStep //  data[4]
+            fmul    dVi5,dVi5,dT0[0]
+
+            ld2     {dXr5,dXi5},[pSrc],pointStep //  data[5]
+            fsub    dVr5,dT1,dVi5                // a * V5
+            fadd    dVi5,dT1,dVi5
+
+            ld2     {dXr6,dXi6},[pSrc],pointStep //  data[6]
+
+            // calculate  b*v7
+            fmul    dT1,dVr7,dT0[0]
+            fmul    dVi7,dVi7,dT0[0]
+
+            // fadd    qY1,qV1,qV5
+            // fsub    qY5,qV1,qV5
+            fadd    dYr1,dVr1,dVr5
+            fsub    dYr5,dVr1,dVr5
+            fadd    dYi1,dVi1,dVi5
+            fsub    dYi5,dVi1,dVi5
+
+            fadd    dVr7,dT1,dVi7                // b * V7
+            fsub    dVi7,dVi7,dT1
+            SUB     pDst, pDst, step2            // set pDst to y1
+
+            // On the last iteration,  this will read past the end of pSrc, 
+            // so skip this read.
+            BEQ     radix8SkipLastUpdateInv\name
+            ld2     {dXr7,dXi7},[pSrc],setStep   //  data[7]
+radix8SkipLastUpdateInv\name:
+
+            fsub    dYr3,dVr3,dVr7
+            fsub    dYi3,dVi3,dVi7
+            st2     {dYr1,dYi1},[pDst],step1     // store y1
+            fadd    dYr7,dVr3,dVr7
+            fadd    dYi7,dVi3,dVi7
+
+
+            st2     {dYr3,dYi3},[pDst],step1     // store y3
+            st2     {dYr5,dYi5},[pDst],step1     // store y5
+            st2     {dYr7,dYi7},[pDst]           // store y7
+            ADD pDst, pDst, #16
+
+        .ELSE
+
+            // calculate  b*v7
+            fmul    dT1,dVr7,dT0[0]
+            ld2     {dXr4,dXi4},[pSrc],pointStep //  data[4]
+            fmul    dVi7,dVi7,dT0[0]
+
+            ld2     {dXr5,dXi5},[pSrc],pointStep //  data[5]
+            fadd    dVr7,dT1,dVi7                     // b * V7
+            fsub    dVi7,dVi7,dT1
+
+            ld2     {dXr6,dXi6},[pSrc],pointStep //  data[6]
+
+            // calculate a*v5
+            fmul    dT1,dVr5,dT0[0]              // use dVi0 for dT1
+            fmul    dVi5,dVi5,dT0[0]
+
+            fadd    dYr7,dVr3,dVr7
+            fadd    dYi7,dVi3,dVi7
+            SUB     pDst, pDst, step2            // set pDst to y1
+
+            fsub    dVr5,dT1,dVi5                // a * V5
+            fadd    dVi5,dT1,dVi5
+
+            // On the last iteration,  this will read past the end of pSrc, 
+            // so skip this read.
+            BEQ     radix8SkipLastUpdateFwd\name
+            ld2     {dXr7,dXi7},[pSrc],setStep   //  data[7]
+radix8SkipLastUpdateFwd\name:
+
+            // fsub    qY5,qV1,qV5
+            fsub    dYr5,dVr1,dVr5
+            fsub    dYi5,dVi1,dVi5
+
+            fsub    dYr3,dVr3,dVr7
+            st2     {dYr7,dYi7},[pDst],step1     // store y1
+            fsub    dYi3,dVi3,dVi7
+
+            // fadd    qY1,qV1,qV5
+            fadd    dYr1,dVr1,dVr5
+            fadd    dYi1,dVi1,dVi5
+
+            st2     {dYr5,dYi5},[pDst],step1     // store y3
+            st2     {dYr3,dYi3},[pDst],step1     // store y5
+            st2     {dYr1,dYi1},[pDst],#16       // store y7
+
+        .ENDIF
+
+
+        // update pDst for the next set
+        SUB     pDst, pDst, step2
+        BGT     radix8fsGrpZeroSetLoop\name
+
+        // Save subFFTNum and subFFTSize for next stage
+        str     subFFTNum, [pSubFFTNum]
+        str     subFFTSize, [pSubFFTSize]
+        
+        .endm
+
+
+        // Allocate stack memory required by the function
+
+
+        M_START armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace,,d15
+            FFTSTAGE "FALSE","FALSE",FWD
+        M_END
+
+
+        M_START armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace,,d15
+            FFTSTAGE "FALSE","TRUE",INV
+        M_END
+
+
+
+        .end
diff --git a/dl/sp/src/arm/arm64/omxSP_FFTFwd_CToC_FC32.c b/dl/sp/src/arm/arm64/omxSP_FFTFwd_CToC_FC32.c
new file mode 100644
index 0000000..f29796b
--- /dev/null
+++ b/dl/sp/src/arm/arm64/omxSP_FFTFwd_CToC_FC32.c
@@ -0,0 +1,190 @@
+/*
+ *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+
+extern void armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+/**
+ * Function:  omxSP_FFTFwd_CToC_FC32_Sfs   (2.2.4.2.2)
+ *
+ * Description:
+ * Compute an FFT for a complex signal of length of 2^order, 
+ * where 0 <= order <= 15. 
+ * Transform length is determined by the specification structure, which 
+ * must be initialized prior to calling the FFT function using the appropriate 
+ * helper, i.e., <FFTInit_C_sc32> or <FFTInit_C_SC16>. The relationship 
+ * between the input and output sequences can be expressed in terms of the 
+ * DFT, i.e., 
+ *
+ *      X[k] = SUM[n=0...N-1]x[n].e^(-jnk.2.pi/N)
+ *      k = 0,1,2,..., N-1
+ *      N = 2^order
+ *
+ * Input Arguments:
+ *   pSrc - pointer to the input signal, a complex-valued vector of length
+ *            2^order; must be aligned on a 32 byte boundary. 
+ *   pFFTSpec - pointer to the preallocated and initialized specification 
+ *            structure 
+ *
+ * Output Arguments:
+ *   pDst - pointer to the complex-valued output vector, of length 2^order;
+ *            must be aligned on an 32-byte boundary. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - returned if one or more of the following conditions 
+ *              is true: 
+ *    -   one or more of the following pointers is NULL: pSrc, pDst, or 
+ *              pFFTSpec. 
+ *    -    pSrc or pDst is not 32-byte aligned 
+ *
+ */
+
+OMXResult omxSP_FFTFwd_CToC_FC32_Sfs(const OMX_FC32* pSrc,
+                                     OMX_FC32* pDst,
+                                     const OMXFFTSpec_C_FC32* pFFTSpec) {
+  ARMsFFTSpec_FC32* spec = (ARMsFFTSpec_FC32*)pFFTSpec;
+  int order;
+  long subFFTSize;
+  long subFFTNum;
+  OMX_FC32* pTwiddle;
+  OMX_FC32* pOut;
+
+  /*
+   * Check args are not NULL and the source and destination pointers
+   * are properly aligned.
+   */
+  if (!validateParametersFC32(pSrc, pDst, spec))
+    return OMX_Sts_BadArgErr;
+
+  order = fastlog2(spec->N);
+
+  subFFTSize = 1;
+  subFFTNum = spec->N;
+  pTwiddle = spec->pTwiddle;
+  pOut = spec->pBuf;
+
+  if (order > 3) {
+    OMX_FC32* argDst;
+
+    /*
+     * Set up argDst and pOut appropriately so that pOut = pDst for
+     * the very last FFT stage.
+     */
+    if ((order & 2) == 0) {
+      argDst = pOut;
+      pOut = pDst;
+    } else {
+      argDst = pDst;
+    }
+
+    /*
+     * Odd order uses a radix 8 first stage; even order, a radix 4
+     * first stage.
+     */
+    if (order & 1) {
+      armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace(
+          pSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+    } else {
+      armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace(
+          pSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+    }
+
+    /*
+     * Now use radix 4 stages to finish rest of the FFT
+     */
+    if (subFFTNum >= 4) {
+      while (subFFTNum > 4) {
+        OMX_FC32* tmp;
+
+        armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace(
+            argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+        /*
+         * Swap argDst and pOut
+         */
+        tmp = pOut;
+        pOut = argDst;
+        argDst = tmp;
+      }
+
+      armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace(
+          argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+    }
+  } else if (order == 3) {
+    armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+        pSrc, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+    armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace(
+        pDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+    armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace(
+        pOut, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+  } else if (order == 2) {
+    armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+        pSrc, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+    armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace(
+        pOut, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+  } else {
+    /* Order = 1 */
+    armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+        pSrc, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+  }
+
+  return OMX_Sts_NoErr;
+}
diff --git a/dl/sp/src/arm/arm64/omxSP_FFTFwd_RToCCS_F32.c b/dl/sp/src/arm/arm64/omxSP_FFTFwd_RToCCS_F32.c
new file mode 100644
index 0000000..f1e503e
--- /dev/null
+++ b/dl/sp/src/arm/arm64/omxSP_FFTFwd_RToCCS_F32.c
@@ -0,0 +1,213 @@
+/*
+ *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+
+extern void armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void ComplexToRealFixup(OMX_FC32* pSrc,
+                               OMX_F32* pDst,
+                               const OMX_FC32* pTwiddle,
+                               OMX_F32* pBuf,
+                               long N);
+
+/**
+ * Function:  omxSP_FFTFwd_CToC_FC32_Sfs   (2.2.4.2.2)
+ *
+ * Description:
+ * Compute an FFT for a complex signal of length of 2^order,
+ * where 0 <= order <= 15.
+ * Transform length is determined by the specification structure, which
+ * must be initialized prior to calling the FFT function using the appropriate
+ * helper, i.e., <FFTInit_C_sc32> or <FFTInit_C_SC16>. The relationship
+ * between the input and output sequences can be expressed in terms of the
+ * DFT, i.e.,
+ *
+ *      X[k] = SUM[n=0...N-1]x[n].e^(-jnk.2.pi/N)
+ *      k = 0,1,2,..., N-1
+ *      N = 2^order
+ *
+ * Input Arguments:
+ *   pSrc - pointer to the input signal, a complex-valued vector of length
+ *            2^order; must be aligned on a 32 byte boundary.
+ *   pFFTSpec - pointer to the preallocated and initialized specification
+ *            structure
+ *
+ * Output Arguments:
+ *   pDst - pointer to the complex-valued output vector, of length 2^order;
+ *            must be aligned on an 32-byte boundary.
+ *
+ * Return Value:
+ *
+ *    OMX_Sts_NoErr - no error
+ *    OMX_Sts_BadArgErr - returned if one or more of the following conditions
+ *              is true:
+ *    -   one or more of the following pointers is NULL: pSrc, pDst, or
+ *              pFFTSpec.
+ *    -    pSrc or pDst is not 32-byte aligned
+ *
+ */
+
+OMXResult omxSP_FFTFwd_RToCCS_F32_Sfs(const OMX_F32* pSrc,
+                                      OMX_F32* pDst,
+                                      const OMXFFTSpec_R_F32* pFFTSpec) {
+  ARMsFFTSpec_R_FC32* spec = (ARMsFFTSpec_R_FC32*)pFFTSpec;
+  int order;
+  long subFFTSize;
+  long subFFTNum;
+  OMX_FC32* pTwiddle;
+  OMX_FC32* pOut;
+  OMX_FC32* pComplexSrc = (OMX_FC32*) pSrc;
+  OMX_FC32* pComplexDst = (OMX_FC32*) pDst;
+
+  /*
+   * Check args are not NULL and the source and destination pointers
+   * are properly aligned.
+   */
+  if (!validateParametersF32(pSrc, pDst, spec))
+    return OMX_Sts_BadArgErr;
+
+  /*
+   * Compute the RFFT using a complex FFT of one less order, so set
+   * order to be the order of the complex FFT.
+   */
+  order = fastlog2(spec->N) - 1;
+
+  subFFTSize = 1;
+  subFFTNum = spec->N >> 1;
+  pTwiddle = spec->pTwiddle;
+  pOut = (OMX_FC32*) spec->pBuf;
+
+  if (order > 3) {
+    OMX_FC32* argDst;
+    OMX_FC32* pComplexDst = (OMX_FC32*) pDst;
+
+    /*
+     * Set up argDst and pOut appropriately so that pOut = pDst for
+     * ComplexToRealFixup.
+     */
+    if ((order & 2) != 0) {
+      argDst = pOut;
+      pOut = pComplexDst;
+    } else {
+      argDst = pComplexDst;
+    }
+
+    /*
+     * Odd order uses a radix 8 first stage; even order, a radix 4
+     * first stage.
+     */
+    if (order & 1) {
+      armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace(
+          pComplexSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+    } else {
+      armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace(
+          pComplexSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+    }
+
+    /*
+     * Now use radix 4 stages to finish rest of the FFT
+     */
+    if (subFFTNum >= 4) {
+      while (subFFTNum > 4) {
+        OMX_FC32* tmp;
+
+        armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace(
+            argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+        /*
+         * Swap argDst and pOut
+         */
+        tmp = pOut;
+        pOut = argDst;
+        argDst = tmp;
+      }
+
+      armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace(
+          argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+    }
+  } else if (order == 3) {
+    armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+        pComplexSrc, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+    armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace(
+        pOut, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize);
+    armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace(
+        pComplexDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+  } else if (order == 2) {
+    armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+        pComplexSrc, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize);
+    armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace(
+        pComplexDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+  } else if (order == 1) {
+    armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+        pComplexSrc, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+  } else {
+    /* Handle complex order 0 specially */
+    pOut->Re = pSrc[0];
+    pOut->Im = pSrc[1];
+  }
+
+  /*
+   * Complex FFT done. Fix up the complex result to give the correct
+   * RFFT.
+   */
+
+  ComplexToRealFixup(pOut, pDst, pTwiddle, spec->pBuf, spec->N);
+
+  return OMX_Sts_NoErr;
+}
diff --git a/dl/sp/src/arm/arm64/omxSP_FFTInv_CCSToR_F32.c b/dl/sp/src/arm/arm64/omxSP_FFTInv_CCSToR_F32.c
new file mode 100644
index 0000000..84de9cf
--- /dev/null
+++ b/dl/sp/src/arm/arm64/omxSP_FFTInv_CCSToR_F32.c
@@ -0,0 +1,259 @@
+/*
+ *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+
+extern void armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTInv_CCSToR_F32_preTwiddleRadix2(
+    const OMX_F32* pSrc,
+    const OMX_FC32* pTwiddle,
+    OMX_F32* pBuf,
+    long N);
+
+/*
+ * Scale FFT data by 1/|length|. |length| must be a power of two
+ */
+static inline ScaleRFFTData(OMX_F32* fftData, unsigned length) {
+  float32_t* data = (float32_t*)fftData;
+  float32_t scale = 2.0f / length;
+
+  if (length >= 4) {
+    /*
+     * Do 4 float elements at a time because |length| is always a
+     * multiple of 4 when |length| >= 4.
+     *
+     * TODO(rtoy): Figure out how to process 8 elements at a time
+     * using intrinsics or replace this with inline assembly.
+     */
+    do {
+      float32x4_t x = vld1q_f32(data);
+
+      length -= 4;
+      x = vmulq_n_f32(x, scale);
+      vst1q_f32(data, x);
+      data += 4;
+    } while (length > 0);
+  } else if (length == 2) {
+    float32x2_t x = vld1_f32(data);
+    x = vmul_n_f32(x, scale);
+    vst1_f32(data, x);
+  } else {
+    fftData[0] *= scale;
+  }
+}
+
+/**
+ * Function:  omxSP_FFTInv_CCSToR_F32_Sfs
+ *
+ * Description:
+ * These functions compute the inverse FFT for a conjugate-symmetric input 
+ * sequence.  Transform length is determined by the specification structure, 
+ * which must be initialized prior to calling the FFT function using 
+ * <FFTInit_R_F32>. For a transform of length M, the input sequence is 
+ * represented using a packed CCS vector of length M+2, and is organized 
+ * as follows: 
+ *
+ *   Index:   0  1  2    3    4    5    . . .  M-2       M-1      M      M+1 
+ *   Comp:  R[0] 0  R[1] I[1] R[2] I[2] . . .  R[M/2-1]  I[M/2-1] R[M/2] 0 
+ *
+ * where R[n] and I[n], respectively, denote the real and imaginary
+ * components for FFT bin n. Bins are numbered from 0 to M/2, where M
+ * is the FFT length.  Bin index 0 corresponds to the DC component,
+ * and bin index M/2 corresponds to the foldover frequency.
+ *
+ * Input Arguments:
+ *   pSrc - pointer to the complex-valued input sequence represented
+ *          using CCS format, of length (2^order) + 2; must be aligned on a
+ *          32-byte boundary.
+ *   pFFTSpec - pointer to the preallocated and initialized
+ *              specification structure
+ *
+ * Output Arguments:
+ *   pDst - pointer to the real-valued output sequence, of length
+ *          2^order ; must be aligned on a 32-byte boundary.
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+
+ *    OMX_Sts_BadArgErr - bad arguments if one or more of the
+ *      following is true:
+ *    -    pSrc, pDst, or pFFTSpec is NULL 
+ *    -    pSrc or pDst is not aligned on a 32-byte boundary 
+ *
+ */
+OMXResult omxSP_FFTInv_CCSToR_F32_Sfs(
+    const OMX_F32* pSrc,
+    OMX_F32* pDst,
+    const OMXFFTSpec_R_F32* pFFTSpec) {
+  ARMsFFTSpec_R_FC32* spec = (ARMsFFTSpec_R_FC32*)pFFTSpec;
+  int order;
+  long subFFTSize;
+  long subFFTNum;
+  OMX_FC32* pTwiddle;
+  OMX_FC32* pOut;
+  OMX_FC32* pComplexSrc;
+  OMX_FC32* pComplexDst = (OMX_FC32*) pDst;
+
+  /*
+   * Check args are not NULL and the source and destination pointers
+   * are properly aligned.
+   */
+  if (!validateParametersF32(pSrc, pDst, spec))
+    return OMX_Sts_BadArgErr;
+
+  /*
+   * Preprocess the input before calling the complex inverse FFT. The
+   * result is actually stored in the second half of the temp buffer
+   * in pFFTSpec.
+   */
+  if (spec->N > 1)
+    armSP_FFTInv_CCSToR_F32_preTwiddleRadix2(
+        pSrc, spec->pTwiddle, spec->pBuf, spec->N);
+  
+  /*
+   * Do a complex inverse FFT of half size.
+   */
+  order = fastlog2(spec->N) - 1;
+
+  subFFTSize = 1;
+  subFFTNum = spec->N >> 1;
+  pTwiddle = spec->pTwiddle;
+  /*
+   * The pBuf is split in half. The first half is the temp buffer. The
+   * second half holds the source data that was placed there by
+   * armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe.
+   */
+  pOut = (OMX_FC32*) spec->pBuf;
+  pComplexSrc = pOut + (1 << order);
+
+
+  if (order > 3) {
+    OMX_FC32* argDst;
+
+    /*
+     * Set up argDst and pOut appropriately so that pOut = pDst for
+     * the very last FFT stage.
+     */
+    if ((order & 2) == 0) {
+      argDst = pOut;
+      pOut = pComplexDst;
+    } else {
+      argDst = pComplexDst;
+    }
+
+    /*
+     * Odd order uses a radix 8 first stage; even order, a radix 4
+     * first stage.
+     */
+    if (order & 1) {
+      armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace(
+          pComplexSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+    } else {
+      armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace(
+          pComplexSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+    }
+
+    /*
+     * Now use radix 4 stages to finish rest of the FFT
+     */
+    if (subFFTNum >= 4) {
+      while (subFFTNum > 4) {
+        OMX_FC32* tmp;
+
+        armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace(
+            argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+        /*
+         * Swap argDst and pOut
+         */
+        tmp = pOut;
+        pOut = argDst;
+        argDst = tmp;
+      }
+
+      armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace(
+          argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+    }
+  } else if (order == 3) {
+    armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+        pComplexSrc, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize);
+    armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace(
+        pComplexDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+    armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace(
+        pOut, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize);
+  } else if (order == 2) {
+    armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+        pComplexSrc, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+    armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace(
+        pOut, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize);
+  } else if (order == 1) {
+    armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+        pComplexSrc, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize);
+  } else {
+    /* Order = 0 */
+    *pComplexDst = *pComplexSrc;
+  }
+
+  ScaleRFFTData(pDst, spec->N);
+  return OMX_Sts_NoErr;
+}
+
diff --git a/dl/sp/src/arm/arm64/omxSP_FFTInv_CToC_FC32.c b/dl/sp/src/arm/arm64/omxSP_FFTInv_CToC_FC32.c
new file mode 100644
index 0000000..eec05e9
--- /dev/null
+++ b/dl/sp/src/arm/arm64/omxSP_FFTInv_CToC_FC32.c
@@ -0,0 +1,214 @@
+/*
+ *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+
+extern void armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+/*
+ * Scale FFT data by 1/|length|. |length| must be a power of two
+ */
+static inline ScaleFFTData(OMX_FC32* fftData, unsigned length) {
+  float32_t* data = (float32_t*)fftData;
+  float32_t scale = 1.0f / length;
+
+  /*
+   * Do two complex elements at a time because |length| is always
+   * greater than or equal to 2 (order >= 1)
+   */
+  do {
+    float32x4_t x = vld1q_f32(data);
+
+    length -= 2;
+    x = vmulq_n_f32(x, scale);
+    vst1q_f32(data, x);
+    data += 4;
+  } while (length > 0);
+}
+
+/**
+ * Function:  omxSP_FFTInv_CToC_FC32
+ *
+ * Description:
+ * These functions compute an inverse FFT for a complex signal of
+ * length of 2^order, where 0 <= order <= 15. Transform length is
+ * determined by the specification structure, which must be
+ * initialized prior to calling the FFT function using the appropriate
+ * helper, i.e., <FFTInit_C_FC32>. The relationship between the input
+ * and output sequences can be expressed in terms of the IDFT, i.e.:
+ *
+ *     x[n] = SUM[k=0,...,N-1] X[k].e^(jnk.2.pi/N)
+ *     n=0,1,2,...N-1
+ *     N=2^order.
+ *
+ * Input Arguments:
+ *   pSrc - pointer to the complex-valued input signal, of length 2^order ;
+ *          must be aligned on a 32-byte boundary.
+ *   pFFTSpec - pointer to the preallocated and initialized specification
+ *            structure
+ *
+ * Output Arguments:
+ *   order
+ *   pDst - pointer to the complex-valued output signal, of length 2^order;
+ *          must be aligned on a 32-byte boundary.
+ *
+ * Return Value:
+ *
+ *    OMX_Sts_NoErr - no error
+ *    OMX_Sts_BadArgErr - returned if one or more of the following conditions
+ *              is true:
+ *    -   one or more of the following pointers is NULL: pSrc, pDst, or
+ *              pFFTSpec.
+ *    -   pSrc or pDst is not 32-byte aligned
+ *
+ */
+
+OMXResult omxSP_FFTInv_CToC_FC32_Sfs(const OMX_FC32* pSrc,
+                                     OMX_FC32* pDst,
+                                     const OMXFFTSpec_C_FC32* pFFTSpec) {
+  ARMsFFTSpec_FC32* spec = (ARMsFFTSpec_FC32*)pFFTSpec;
+  int order;
+  long subFFTSize;
+  long subFFTNum;
+  OMX_FC32* pTwiddle;
+  OMX_FC32* pOut;
+
+  /*
+   * Check args are not NULL and the source and destination pointers
+   * are properly aligned.
+   */
+  if (!validateParametersFC32(pSrc, pDst, spec))
+    return OMX_Sts_BadArgErr;
+
+  order = fastlog2(spec->N);
+
+  subFFTSize = 1;
+  subFFTNum = spec->N;
+  pTwiddle = spec->pTwiddle;
+  pOut = spec->pBuf;
+
+  if (order > 3) {
+    OMX_FC32* argDst;
+
+    /*
+     * Set up argDst and pOut appropriately so that pOut = pDst for
+     * the very last FFT stage.
+     */
+    if ((order & 2) == 0) {
+      argDst = pOut;
+      pOut = pDst;
+    } else {
+      argDst = pDst;
+    }
+
+    /*
+     * Odd order uses a radix 8 first stage; even order, a radix 4
+     * first stage.
+     */
+    if (order & 1) {
+      armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace(
+          pSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+    } else {
+      armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace(
+          pSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+    }
+
+    /*
+     * Now use radix 4 stages to finish rest of the FFT
+     */
+    if (subFFTNum >= 4) {
+      while (subFFTNum > 4) {
+        OMX_FC32* tmp;
+
+        armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace(
+            argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+        /*
+         * Swap argDst and pOut
+         */
+        tmp = pOut;
+        pOut = argDst;
+        argDst = tmp;
+      }
+
+      armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace(
+          argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+    }
+  } else if (order == 3) {
+    armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+        pSrc, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+    armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace(
+        pDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+    armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace(
+        pOut, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+  } else if (order == 2) {
+    armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+        pSrc, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+    armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace(
+        pOut, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+  } else {
+    /* Order = 1 */
+    armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+        pSrc, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+  }
+
+  ScaleFFTData(pDst, spec->N);
+  return OMX_Sts_NoErr;
+}