diff options
Diffstat (limited to 'dl/sp/src/arm/arm64')
-rw-r--r-- | dl/sp/src/arm/arm64/ComplexToRealFixup.S | 261 | ||||
-rw-r--r-- | dl/sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S | 280 | ||||
-rw-r--r-- | dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_fs_s.S | 136 | ||||
-rw-r--r-- | dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_ls_s.S | 149 | ||||
-rw-r--r-- | dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_s.S | 185 | ||||
-rw-r--r-- | dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_fs_s.S | 266 | ||||
-rw-r--r-- | dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_ls_s.S | 371 | ||||
-rw-r--r-- | dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_s.S | 339 | ||||
-rw-r--r-- | dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S | 473 | ||||
-rw-r--r-- | dl/sp/src/arm/arm64/omxSP_FFTFwd_CToC_FC32.c | 190 | ||||
-rw-r--r-- | dl/sp/src/arm/arm64/omxSP_FFTFwd_RToCCS_F32.c | 213 | ||||
-rw-r--r-- | dl/sp/src/arm/arm64/omxSP_FFTInv_CCSToR_F32.c | 259 | ||||
-rw-r--r-- | dl/sp/src/arm/arm64/omxSP_FFTInv_CToC_FC32.c | 214 |
13 files changed, 3336 insertions, 0 deletions
diff --git a/dl/sp/src/arm/arm64/ComplexToRealFixup.S b/dl/sp/src/arm/arm64/ComplexToRealFixup.S new file mode 100644 index 0000000..9b30093 --- /dev/null +++ b/dl/sp/src/arm/arm64/ComplexToRealFixup.S @@ -0,0 +1,261 @@ +// +// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the LICENSE file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// +// This is a modification of omxSP_FFTFwd_RToCCS_S32_Sfs_s.s +// to support float instead of SC32. +// + +// +// Description: +// Compute FFT for a real signal +// +// + + +// Include standard headers + +#include "dl/api/arm/arm64COMM_s.h" +#include "dl/api/arm/omxtypes_s.h" + + +// Import symbols required from other files +// (For example tables) + +// Set debugging level +//DEBUG_ON SETL {TRUE} + + + +// Guarding implementation by the processor name + + + + // Guarding implementation by the processor name + +// Import symbols required from other files + + +//Input Registers + +#define pSrc x0 +#define pDst x1 +#define pTwiddle x2 +#define pOut x3 +#define subFFTNum x4 + +// Output registers + +//Local Scratch Registers + +#define argTwiddle x5 +#define argDst x6 +#define subFFTSize x7 +#define N subFFTNum +#define order x14 +#define step x8 +#define step1 pTwiddle +#define twStep x9 +#define zero w10 +#define pTwiddleTmp pOut + +// Neon registers + +#define dX0 v0.2s +#define dX0s v0.s +#define dX0r v2.2s +#define dX0rs v2.s +#define dX0i v3.2s +#define dX0is v3.s +#define dX1r v4.2s +#define dX1i v5.2s +#define dT0 v6.2s +#define dT1 v7.2s +#define dT2 v8.2s +#define dT3 v9.2s +#define qT0 v10.2s +#define qT1 v12.2s +#define dW0r v14.2s +#define dW0r8b v14.8b +#define dW0i v15.2s +#define dW1r v16.2s +#define dW1r8b v16.8b +#define dW1i v17.2s +#define dY0r v14.2s +#define dY0i v15.2s +#define dY1r v16.2s +#define dY1i v17.2s +#define qT2 v18.2s +#define qT3 v20.2s + +#define half v0.2s +#define dZip v21.2s +#define dZip8b v21.8b + + // Allocate stack memory required by the function + + // Write function header + M_START ComplexToRealFixup,,d15 + + asr N, N, #1 + + clz order, subFFTNum // N = 2^order + + RSB order,order,#63 + MOV subFFTSize,subFFTNum // subFFTSize = N/2 + //MOV subFFTNum,N + mov argDst, pDst + mov argTwiddle, pTwiddle + + // F(0) = 1/2[Z(0) + Z'(0)] - j [Z(0) - Z'(0)] + // 1/2[(a+jb) + (a-jb)] - j [(a+jb) - (a-jb)] + // 1/2[2a+j0] - j [0+j2b] + // (a+b, 0) + + // F(N/2) = 1/2[Z(0) + Z'(0)] + j [Z(0) - Z'(0)] + // 1/2[(a+jb) + (a-jb)] + j [(a+jb) - (a-jb)] + // 1/2[2a+j0] + j [0+j2b] + // (a-b, 0) + + // F(0) and F(N/2) + ld2 {dX0rs,dX0is}[0],[pSrc], #8 + MOV zero,#0 + mov dX0rs[1],zero + lsl step,subFFTSize, #3 // step = N/2 * 8 bytes + mov dX0i[1],zero + // twStep = 3N/8 * 8 bytes pointing to W^1 + SUB twStep,step,subFFTSize,LSL #1 + + fadd dY0r,dX0r,dX0i // F(0) = ((Z0.r+Z0.i) , 0) + lsl step1,subFFTSize, #2 // step1 = N/2 * 4 bytes + fsub dY0i,dX0r,dX0i // F(N/2) = ((Z0.r-Z0.i) , 0) + SUBS subFFTSize,subFFTSize,#2 + + st1 {dY0r},[argDst],step + ADD pTwiddleTmp,argTwiddle,#8 // W^2 + st1 {dY0i},[argDst], #8 + ADD argTwiddle,argTwiddle,twStep // W^1 + +// dup dzero,zero + SUB argDst,argDst,step + + BLT End + BEQ lastElement + SUB step,step,#24 + SUB step1,step1,#8 // (N/4-1)*8 bytes + + // F(k) = 1/2[Z(k) + Z'(N/2-k)] -j*W^(k) [Z(k) - Z'(N/2-k)] + // Note: W^k is stored as negative values in the table + // Process 4 elements at a time. E.g: F(1),F(2) and F(N/2-2),F(N/2-1) + // since both of them require Z(1),Z(2) and Z(N/2-2),Z(N/2-1) + + fmov half, #0.5 + +evenOddButterflyLoop: + + + ld1 {dW0r},[argTwiddle],step1 + ld1 {dW1r},[argTwiddle], #8 + + ld2 {dX0r,dX0i},[pSrc],step + SUB argTwiddle,argTwiddle,step1 + ld2 {dX1r,dX1i},[pSrc], #16 + + + + SUB step1,step1,#8 // (N/4-2)*8 bytes + ld1 {dW0i},[pTwiddleTmp],step1 + ld1 {dW1i},[pTwiddleTmp], #8 + SUB pSrc,pSrc,step + + SUB pTwiddleTmp,pTwiddleTmp,step1 + rev64 dX1r,dX1r + rev64 dX1i,dX1i + SUBS subFFTSize,subFFTSize,#4 + + + + fsub dT2,dX0r,dX1r // a-c + SUB step1,step1,#8 + fadd dT0,dX0r,dX1r // a+c + fsub dT1,dX0i,dX1i // b-d + fadd dT3,dX0i,dX1i // b+d + fmul dT0,dT0,half[0] + fmul dT1,dT1,half[0] + // VZIP dW1r,dW1i + // VZIP dW0r,dW0i + zip1 dZip, dW1r, dW1i + zip2 dW1i, dW1r, dW1i + mov dW1r8b, dZip8b + zip1 dZip, dW0r, dW0i + zip2 dW0i, dW0r, dW0i + mov dW0r8b, dZip8b + + fmul qT0,dW1r,dT2 + fmul qT1,dW1r,dT3 + fmul qT2,dW0r,dT2 + fmul qT3,dW0r,dT3 + + fmla qT0,dW1i,dT3 + fmls qT1,dW1i,dT2 + + fmls qT2,dW0i,dT3 + fmla qT3,dW0i,dT2 + + + fmul dX1r,qT0,half[0] + fmul dX1i,qT1,half[0] + + fsub dY1r,dT0,dX1i // F(N/2 -1) + fadd dY1i,dT1,dX1r + fneg dY1i,dY1i + + rev64 dY1r,dY1r + rev64 dY1i,dY1i + + + fmul dX0r,qT2,half[0] + fmul dX0i,qT3,half[0] + + fsub dY0r,dT0,dX0i // F(1) + fadd dY0i,dT1,dX0r + + + st2 {dY0r,dY0i},[argDst],step + st2 {dY1r,dY1i},[argDst], #16 + SUB argDst,argDst,step + SUB step,step,#32 // (N/2-4)*8 bytes + + + BGT evenOddButterflyLoop + + // set both the ptrs to the last element + SUB pSrc,pSrc,#8 + SUB argDst,argDst,#8 + + + + // Last element can be expanded as follows + // 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)] + // 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)] + // 1/2[2a+j0] + j (c+jd) [0+j2b] + // (a-bc, -bd) + // Since (c,d) = (0,1) for the last element, result is just (a,-b) + +lastElement: + ld1 {dX0r},[pSrc] + + st1 {dX0rs}[0],[argDst], #4 + fneg dX0r,dX0r + st1 {dX0rs}[1],[argDst], #4 +End: + + // Write function tail + M_END + + .end diff --git a/dl/sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S b/dl/sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S new file mode 100644 index 0000000..da68314 --- /dev/null +++ b/dl/sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S @@ -0,0 +1,280 @@ +// +// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the LICENSE file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// +// This is a modification of +// armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.s to support float +// instead of SC32. +// + +// +// Description: +// Compute the "preTwiddleRadix2" stage prior to the call to the complexFFT +// It does a Z(k) = Feven(k) + jW^(-k) FOdd(k); k=0,1,2,...N/2-1 computation +// +// + + +// Include standard headers + +#include "dl/api/arm/arm64COMM_s.h" +#include "dl/api/arm/omxtypes_s.h" + + +// Import symbols required from other files +// (For example tables) + + +// Set debugging level +//DEBUG_ON SETL {TRUE} + + + +// Guarding implementation by the processor name + + + + // Guarding implementation by the processor name + + + +//Input Registers + +#define pSrc x0 +#define pTwiddle x1 +#define pOut x2 +#define subFFTNum x3 + +// Output registers + +//Local Scratch Registers + +#define argTwiddle x5 +#define argDst x6 +#define subFFTSize x7 +#define N subFFTNum + +#define pOut1 x13 + +#define size x7 +#define step x8 +#define step1 x9 +#define twStep x10 +#define pTwiddleTmp x11 +#define argTwiddle1 x12 + +// Neon registers + +#define dX0 v0.2s +#define dX0s v0.s +#define dShift v1.2s +#define dX1 v1.2s +#define dX1s v1.s +#define dY0 v2.2s +#define dY08b v2.8b +#define dY1 v3.2s +#define dX0r v0.2s +#define dX0rs v0.s +#define dX0i v1.2s +#define dX1r v2.2s +#define dX1i v3.2s +#define dW0r v4.2s +#define dW0r8b v4.8b +#define dW0i v5.2s +#define dW1r v6.2s +#define dW1r8b v6.8b +#define dW1i v7.2s +#define dT0 v8.2s +#define dT1 v9.2s +#define dT2 v10.2s +#define dT3 v11.2s +#define qT0 v12.2s +#define qT1 v14.2s +#define qT2 v16.2s +#define qT3 v18.2s +#define dY0r v4.2s +#define dY0i v5.2s +#define dY1r v6.2s +#define dY1i v7.2s + +#define dY2 v4.2s +#define dY3 v5.2s +#define dW0 v6.2s +#define dW1 v7.2s +#define dW0Tmp v10.2s +#define dW1Neg v11.2s + +#define dZip v19.2s +#define dZip8b v19.8b +#define half v13.2s + + .MACRO FFTSTAGE scaled, inverse, name + + fmov half, 0.5 + + asr size, subFFTNum, #1 // preserve the contents of N = subFFTNum + lsl step, subFFTNum, #2 // step = N/2 * 8 bytes + + + // Z(k) = 1/2 {[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]} + // Note: W^(k) is stored as negated value and also need to + // conjugate the values from the table + + // Z(0) : no need of twiddle multiply + // Z(0) = 1/2 { [F(0) + F'(N/2)] +j [F(0) - F'(N/2)] } + + ld1 {dX0},[pSrc],step + ADD pOut1,pOut,step // pOut1 = pOut+ N/2*8 bytes + + ld1 {dX1},[pSrc], #8 + // twStep = 3N/8 * 8 bytes pointing to W^1 + SUB twStep,step,size,LSL #1 + + lsl step1,size, #2 // step1 = N/4 * 8 = N/2*4 bytes + SUB step1,step1,#8 // (N/4-1)*8 bytes + + fadd dY0,dX0,dX1 // [b+d | a+c] + fsub dY1,dX0,dX1 // [b-d | a-c] + fmul dY0, dY0, half[0] + fmul dY1, dY1, half[0] + + // dY0= [a-c | a+c] ;dY1= [b-d | b+d] + // VZIP dY0,dY1 + zip1 dZip,dY0,dY1 + zip2 dY1,dY0,dY1 + mov dY08b, dZip8b + + fsub dX0,dY0,dY1 + SUBS size,size,#2 + fadd dX1,dY0,dY1 + + SUB pSrc,pSrc,step + + st1 {dX0s}[0],[pOut1], #4 + ADD pTwiddleTmp,pTwiddle,#8 // W^2 + st1 {dX1s}[1],[pOut1], #4 + ADD argTwiddle1,pTwiddle,twStep // W^1 + + + BLT decrementScale\name + BEQ lastElement\name + + + // Z(k) = 1/2[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)] + // Note: W^k is stored as negative values in the table and also + // need to conjugate the values from the table. + // + // Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1) + // since both of them require F(1),F(2) and F(N/2-2),F(N/2-1) + + + SUB step,step,#24 +evenOddButterflyLoop\name : + + + ld1 {dW0r},[argTwiddle1],step1 + ld1 {dW1r},[argTwiddle1], #8 + + ld2 {dX0r,dX0i},[pSrc],step + SUB argTwiddle1,argTwiddle1,step1 + ld2 {dX1r,dX1i},[pSrc], #16 + + SUB step1,step1,#8 // (N/4-2)*8 bytes + ld1 {dW0i},[pTwiddleTmp],step1 + ld1 {dW1i},[pTwiddleTmp], #8 + SUB pSrc,pSrc,step + + SUB pTwiddleTmp,pTwiddleTmp,step1 + rev64 dX1r,dX1r + rev64 dX1i,dX1i + SUBS size,size,#4 + + + fsub dT2,dX0r,dX1r // a-c + fadd dT3,dX0i,dX1i // b+d + fadd dT0,dX0r,dX1r // a+c + fsub dT1,dX0i,dX1i // b-d + SUB step1,step1,#8 + + fmul dT2, dT2, half[0] + fmul dT3, dT3, half[0] + + fmul dT0, dT0, half[0] + fmul dT1, dT1, half[0] + + // VZIP dW1r,dW1i + // VZIP dW0r,dW0i + zip1 dZip, dW1r,dW1i + zip2 dW1i,dW1r,dW1i + mov dW1r8b, dZip8b + zip1 dZip,dW0r,dW0i + zip2 dW0i,dW0r,dW0i + mov dW0r8b, dZip8b + + fmul dX1r,dW1r,dT2 + fmul dX1i,dW1r,dT3 + fmul dX0r,dW0r,dT2 + fmul dX0i,dW0r,dT3 + + fmls dX1r,dW1i,dT3 + fmla dX1i,dW1i,dT2 + + fmla dX0r,dW0i,dT3 + fmls dX0i,dW0i,dT2 + + + fadd dY1r,dT0,dX1i // F(N/2 -1) + fsub dY1i,dX1r,dT1 + + rev64 dY1r,dY1r + rev64 dY1i,dY1i + + + fadd dY0r,dT0,dX0i // F(1) + fsub dY0i,dT1,dX0r + + + st2 {dY0r,dY0i},[pOut1],step + st2 {dY1r,dY1i},[pOut1], #16 + SUB pOut1,pOut1,step + SUB step,step,#32 // (N/2-4)*8 bytes + + + BGT evenOddButterflyLoop\name + + + // set both the ptrs to the last element + SUB pSrc,pSrc,#8 + SUB pOut1,pOut1,#8 + + // Last element can be expanded as follows + // 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (since W^k is stored as + // -ve) + // 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)] + // 1/2[2a+j0] - j (c-jd) [0+j2b] + // (a+bc, -bd) + // Since (c,d) = (0,1) for the last element, result is just (a,-b) + +lastElement\name : + ld1 {dX0r},[pSrc] + + st1 {dX0rs}[0],[pOut1], #4 + fneg dX0r,dX0r + st1 {dX0rs}[1],[pOut1] + + + +decrementScale\name : + + .endm + + M_START armSP_FFTInv_CCSToR_F32_preTwiddleRadix2,,d15 + FFTSTAGE "FALSE","TRUE",Inv + M_END + + .end diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_fs_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_fs_s.S new file mode 100644 index 0000000..b22912d --- /dev/null +++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_fs_s.S @@ -0,0 +1,136 @@ +// +// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the LICENSE file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// +// This is a modification of armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S +// to support float instead of SC32. +// + +// +// Description: +// Compute the first stage of a Radix 2 DIT in-order out-of-place FFT +// stage for a N point complex signal. +// +// + + +// Include standard headers + +#include "dl/api/arm/arm64COMM_s.h" +#include "dl/api/arm/omxtypes_s.h" + + +// Import symbols required from other files +// (For example tables) + + + + +// Set debugging level +//DEBUG_ON SETL {TRUE} + + + +// Guarding implementation by the processor name + + + +// Guarding implementation by the processor name + + +//Input Registers + +#define pSrc x0 +#define pDst x1 +#define pTwiddle x2 +#define pSubFFTNum x3 +#define pSubFFTSize x4 + + +//Output Registers + + +//Local Scratch Registers + +#define subFFTNum x5 +#define subFFTSize x6 +#define pointStep x7 +#define outPointStep x7 +#define grpSize x8 +#define setCount x8 +#define step x9 +#define dstStep x9 + +// Neon Registers +#define dX0 v0.2s +#define dX1 v1.2s +#define dY0 v2.2s +#define dY1 v3.2s + + .MACRO FFTSTAGE scaled, inverse, name + + // Define stack arguments + + // Move args values into our work registers + ldr subFFTNum, [pSubFFTNum] + ldr subFFTSize, [pSubFFTSize] + + // update subFFTSize and subFFTNum into RN6 and RN7 for the next stage + + + MOV subFFTSize,#2 + LSR grpSize,subFFTNum,#1 + MOV subFFTNum,grpSize + + + // pT0+1 increments pT0 by 8 bytes + // pT0+pointStep = increment of 8*pointStep bytes = 4*grpSize bytes + // Note: outPointStep = pointStep for firststage + // Note: setCount = grpSize/2 (reuse the updated grpSize for setCount) + + lsl pointStep, grpSize, #3 + rsb step, pointStep, #8 + + // Loop on the sets for grp zero + +grpZeroSetLoop\name : + + LD1 {dX0},[pSrc],pointStep + LD1 {dX1},[pSrc],step // step = -pointStep + 8 + + SUBS setCount,setCount,#1 + + fadd dY0,dX0,dX1 + fsub dY1,dX0,dX1 + + ST1 {dY0},[pDst],outPointStep + // dstStep = step = -pointStep + 8 + ST1 {dY1},[pDst],dstStep + + BGT grpZeroSetLoop\name + + + // Save subFFTNum and subFFTSize for next stage + str subFFTNum, [pSubFFTNum] + str subFFTSize, [pSubFFTSize] + + .endm + + + + M_START armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace + FFTSTAGE "FALSE","FALSE",fwd + M_END + + + + M_START armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace + FFTSTAGE "FALSE","TRUE",inv + M_END + + .end diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_ls_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_ls_s.S new file mode 100644 index 0000000..e7de11e --- /dev/null +++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_ls_s.S @@ -0,0 +1,149 @@ +// +// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the LICENSE file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// +// This is a modification of armSP_FFT_CToC_SC32_Radix2_ls_unsafe_s.S +// to support float instead of SC32. +// + +// +// Description: +// Compute the last stage of a Radix 2 DIT in-order out-of-place FFT +// stage for a N point complex signal. +// +// + + +// Include standard headers + +#include "dl/api/arm/arm64COMM_s.h" +#include "dl/api/arm/omxtypes_s.h" + + +// Import symbols required from other files +// (For example tables) + + + + +// Set debugging level +//DEBUG_ON SETL {TRUE} + + +// Guarding implementation by the processor name + + +//Input Registers + +#define pSrc x0 +#define pDst x1 +#define pTwiddle x2 +#define pSubFFTNum x3 +#define pSubFFTSize x4 + + +//Output Registers + + +//Local Scratch Registers + + +#define subFFTNum x5 +#define subFFTSize x6 +#define outPointStep x8 +#define grpCount x9 +#define dstStep x10 + +// Neon Registers + +#define dWr v0.2s +#define dWi v1.2s +#define dXr0 v2.2s +#define dXi0 v3.2s +#define dXr1 v4.2s +#define dXi1 v5.2s +#define dYr0 v6.2s +#define dYi0 v7.2s +#define dYr1 v8.2s +#define dYi1 v9.2s +#define qT0 v10.2s +#define qT1 v12.2s + + .MACRO FFTSTAGE scaled, inverse, name + + // Move parameters into our work registers + ldr subFFTSize, [pSubFFTSize] + + lsl outPointStep, subFFTSize, #3 + + // Update grpCount and grpSize rightaway + + MOV subFFTNum,#1 //after the last stage + LSL grpCount,subFFTSize,#1 + + // update subFFTSize for the next stage + MOV subFFTSize,grpCount + + rsb dstStep,outPointStep,#16 + + // Loop on 2 grps at a time for the last stage + +radix2lsGrpLoop\name : + // dWr = [pTwiddle[0].Re, pTwiddle[1].Re] + // dWi = [pTwiddle[0].Im, pTwiddle[1].Im] + ld2 {dWr,dWi},[pTwiddle], #16 + + // dXr0 = [pSrc[0].Re, pSrc[2].Re] + // dXi0 = [pSrc[0].Im, pSrc[2].Im] + // dXr1 = [pSrc[1].Re, pSrc[3].Re] + // dXi1 = [pSrc[1].Im, pSrc[3].Im] + ld4 {dXr0,dXi0,dXr1,dXi1}, [pSrc], #32 + + SUBS grpCount,grpCount,#4 // grpCount is multiplied by 2 + + .ifeqs "\inverse", "TRUE" + fmul qT0,dWr,dXr1 + fmla qT0,dWi,dXi1 // real part + fmul qT1,dWr,dXi1 + fmls qT1,dWi,dXr1 // imag part + + .else + + fmul qT0,dWr,dXr1 + fmls qT0,dWi,dXi1 // real part + fmul qT1,dWr,dXi1 + fmla qT1,dWi,dXr1 // imag part + + .endif + + fsub dYr0,dXr0,qT0 + fsub dYi0,dXi0,qT1 + fadd dYr1,dXr0,qT0 + fadd dYi1,dXi0,qT1 + + st2 {dYr0,dYi0},[pDst],outPointStep + st2 {dYr1,dYi1},[pDst],dstStep // dstStep = step = -outPointStep + 16 + + BGT radix2lsGrpLoop\name + + + .endm + + + + M_START armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace,,d12 + FFTSTAGE "FALSE","FALSE",fwd + M_END + + + + M_START armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace,,d12 + FFTSTAGE "FALSE","TRUE",inv + M_END + + .end diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_s.S new file mode 100644 index 0000000..530a815 --- /dev/null +++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_s.S @@ -0,0 +1,185 @@ +// +// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the LICENSE file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// +// This is a modification of armSP_FFT_CToC_SC32_Radix2_unsafe_s.s +// to support float instead of SC32. +// + +// Description: +// Compute a Radix 2 DIT in-order out-of-place FFT stage for an N point +// complex signal. This handles the general stage, not the first or last +// stage. +// +// + + +// Include standard headers + +#include "dl/api/arm/arm64COMM_s.h" +#include "dl/api/arm/omxtypes_s.h" + + +// Import symbols required from other files +// (For example tables) + + + +// Set debugging level +//DEBUG_ON SETL {TRUE} + + + +// Guarding implementation by the processor name + + + + +// Guarding implementation by the processor name + +//Input Registers + +#define pSrc x0 +#define pDst x1 +#define pTwiddle x2 +#define pSubFFTNum x3 +#define pSubFFTSize x4 + + +//Output Registers + + +//Local Scratch Registers + +#define subFFTNum x5 +#define subFFTSize x6 +#define outPointStep x8 +#define pointStep x9 +#define pointStep32 w9 +#define grpCount x10 +#define grpCount32 w10 +#define setCount x13 +#define step x15 +#define dstStep x11 + +// Neon Registers + +#define dW v0.2s +#define dX0 v2.2s +#define dX1 v3.2s +#define dX2 v4.2s +#define dX3 v5.2s +#define dY0 v6.2s +#define dY1 v7.2s +#define dY2 v8.2s +#define dY3 v9.2s +#define qT0 v10.2s +#define qT1 v11.2s + + .MACRO FFTSTAGE scaled, inverse, name + + // Define stack arguments + + // Move args values into our work registers + ldr subFFTNum, [pSubFFTNum] + ldr subFFTSize, [pSubFFTSize] + + // Update grpCount and grpSize rightaway inorder to reuse pGrpCount + // and pGrpSize regs + + LSR subFFTNum,subFFTNum,#1 //grpSize + LSL grpCount,subFFTSize,#1 + + + // pT0+1 increments pT0 by 8 bytes + // pT0+pointStep = increment of 8*pointStep bytes = 4*grpSize bytes + lsl pointStep, subFFTNum, #2 + + // update subFFTSize for the next stage + MOV subFFTSize,grpCount + + // pOut0+1 increments pOut0 by 8 bytes + // pOut0+outPointStep == increment of 8*outPointStep bytes = + // 4*size bytes + smull outPointStep, grpCount32, pointStep32 + + LSL pointStep,pointStep,#1 + + + rsb step,pointStep,#16 + rsb dstStep,outPointStep,#16 + + // Loop on the groups + +radix2GrpLoop\name : + lsr setCount, pointStep, #3 + LD1 {dW},[pTwiddle],pointStep //[wi | wr] + + + // Loop on the sets + + +radix2SetLoop\name : + + + // point0: dX0-real part dX1-img part + LD2 {dX0,dX1},[pSrc],pointStep + // point1: dX2-real part dX3-img part + LD2 {dX2,dX3},[pSrc],step + + SUBS setCount,setCount,#2 + + .ifeqs "\inverse", "TRUE" + fmul qT0,dX2,dW[0] + fmla qT0,dX3,dW[1] // real part + fmul qT1,dX3,dW[0] + fmls qT1,dX2,dW[1] // imag part + + .else + + fmul qT0,dX2,dW[0] + fmls qT0,dX3,dW[1] // real part + fmul qT1,dX3,dW[0] + fmla qT1,dX2,dW[1] // imag part + + .endif + + fsub dY0,dX0,qT0 + fsub dY1,dX1,qT1 + fadd dY2,dX0,qT0 + fadd dY3,dX1,qT1 + + st2 {dY0,dY1},[pDst],outPointStep + // dstStep = -outPointStep + 16 + st2 {dY2,dY3},[pDst],dstStep + + BGT radix2SetLoop\name + + SUBS grpCount,grpCount,#2 + ADD pSrc,pSrc,pointStep + BGT radix2GrpLoop\name + + + str subFFTNum, [pSubFFTNum] + str subFFTSize, [pSubFFTSize] + .endm + + + + M_START armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace,,d11 + FFTSTAGE "FALSE","FALSE",FWD + M_END + + + + M_START armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace,,d11 + FFTSTAGE "FALSE","TRUE",INV + M_END + + + .end diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_fs_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_fs_s.S new file mode 100644 index 0000000..624ef3e --- /dev/null +++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_fs_s.S @@ -0,0 +1,266 @@ +// +// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the LICENSE file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// +// +// This is a modification of armSP_FFT_CToC_SC32_Radix4_fs_unsafe_s.s +// to support float instead of SC32. +// + +// +// Description: +// Compute a first stage Radix 4 FFT stage for a N point complex signal +// +// + + +// Include standard headers + +#include "dl/api/arm/arm64COMM_s.h" +#include "dl/api/arm/omxtypes_s.h" + +// Import symbols required from other files +// (For example tables) + + + + +// Set debugging level +//DEBUG_ON SETL {TRUE} + + + +// Guarding implementation by the processor name + + + +// Guarding implementation by the processor name + +//Input Registers + +#define pSrc x0 +#define pDst x1 +#define pTwiddle x2 +#define pSubFFTNum x3 +#define pSubFFTSize x4 + + +//Output Registers + + +//Local Scratch Registers + +#define subFFTNum x5 +#define subFFTSize x6 +#define grpSize x7 +// Reuse grpSize as setCount +#define setCount x7 +#define pointStep x8 +#define outPointStep x8 +#define setStep x9 +#define step1 x10 +#define step3 x11 + +// Neon Registers + +#define dXr0 v0.2s +#define dXi0 v1.2s +#define dXr1 v2.2s +#define dXi1 v3.2s +#define dXr2 v4.2s +#define dXi2 v5.2s +#define dXr3 v6.2s +#define dXi3 v7.2s +#define dYr0 v8.2s +#define dYi0 v9.2s +#define dYr1 v10.2s +#define dYi1 v11.2s +#define dYr2 v12.2s +#define dYi2 v13.2s +#define dYr3 v14.2s +#define dYi3 v15.2s +#define dZr0 v16.2s +#define dZi0 v17.2s +#define dZr1 v18.2s +#define dZi1 v19.2s +#define dZr2 v20.2s +#define dZi2 v21.2s +#define dZr3 v22.2s +#define dZi3 v23.2s + + + .MACRO FFTSTAGE scaled, inverse, name + + // Define stack arguments + + // Move args values into our work registers + ldr subFFTNum, [pSubFFTNum] + ldr subFFTSize, [pSubFFTSize] + + // pT0+1 increments pT0 by 8 bytes + // pT0+pointStep = increment of 8*pointStep bytes = 2*grpSize bytes + // Note: outPointStep = pointStep for firststage + + lsl pointStep, subFFTNum, #1 + + // Update pSubFFTSize and pSubFFTNum regs + ld2 {dXr0,dXi0}, [pSrc], pointStep // data[0] + + // subFFTSize = 1 for the first stage + MOV subFFTSize,#4 + + // Note: setCount = subFFTNum/4 (reuse the grpSize reg for setCount) + LSR grpSize,subFFTNum,#2 + ld2 {dXr1,dXi1}, [pSrc], pointStep // data[1] + MOV subFFTNum,grpSize + + + // Calculate the step of input data for the next set + //MOV setStep,pointStep,LSL #1 + lsl setStep, grpSize, #4 + ld2 {dXr2,dXi2}, [pSrc], pointStep // data[2] + + // setStep = 3*pointStep + ADD setStep,setStep,pointStep + // setStep = - 3*pointStep+16 + + rsb setStep,setStep,#16 + // data[3] & update pSrc for the next set + ld2 {dXr3,dXi3}, [pSrc], setStep + + // step1 = 2*pointStep + lsl step1, pointStep, #1 + + // fadd qY0, qX0, qX2 + fadd dYr0, dXr0, dXr2 + fadd dYi0, dXi0, dXi2 + // step3 = -pointStep + neg step3, pointStep + + // grp = 0 a special case since all the twiddle factors are 1 + // Loop on the sets : 2 sets at a time + +radix4fsGrpZeroSetLoop\name : + + + + // Decrement setcount + SUBS setCount,setCount,#2 + + + // finish first stage of 4 point FFT + + + // fsub qy2,qx0,qx2 + fsub dYr2, dXr0, dXr2 + fsub dYi2, dXi0, dXi2 + + ld2 {dXr0,dXi0}, [pSrc], step1 // data[0] + // fadd qy1,qx1,qx3 + fadd dYr1, dXr1, dXr3 + fadd dYi1, dXi1, dXi3 + ld2 {dXr2,dXi2}, [pSrc], step3 // data[2] + // fsub qy3,qx1,qx3 + fsub dYr3, dXr1, dXr3 + fsub dYi3, dXi1, dXi3 + + + // finish second stage of 4 point FFT + + .ifeqs "\inverse", "TRUE" + + ld2 {dXr1,dXi1}, [pSrc], step1 // data[1] + // fadd qz0,qy0,qy1 + fadd dZr0, dYr0, dYr1 + fadd dZi0, dYi0, dYi1 + + // data[3] & update pSrc for the next set, but not if it's the + // last iteration so that we don't read past the end of the + // input array. + BEQ radix4SkipLastUpdateInv\name + ld2 {dXr3,dXi3}, [pSrc], setStep + +radix4SkipLastUpdateInv\name: + FSUB dZr3,dYr2,dYi3 + + st2 {dZr0,dZi0},[pDst],outPointStep + FADD dZi3,dYi2,dYr3 + + // fsub qZ1,qY0,qY1 + FSUB dZr1, dYr0, dYr1 + FSUB dZi1, dYi0, dYi1 + st2 {dZr3,dZi3},[pDst],outPointStep + + FADD dZr2,dYr2,dYi3 + st2 {dZr1,dZi1},[pDst],outPointStep + FSUB dZi2,dYi2,dYr3 + + // fadd qY0, qX0, qX2 + FADD dYr0, dXr0, dXr2 // u0 for next iteration + FADD dYi0, dXi0, dXi2 + st2 {dZr2,dZi2},[pDst],setStep + + + .else + + ld2 {dXr1,dXi1}, [pSrc], step1 // data[1] + // fadd qZ0,qY0,qY1 + fadd dZr0, dYr0, dYr1 + fadd dZi0, dYi0, dYi1 + + // data[3] & update pSrc for the next set, but not if it's the + // last iteration so that we don't read past the end of the + // input array. + BEQ radix4SkipLastUpdateFwd\name + ld2 {dXr3,dXi3}, [pSrc], setStep + +radix4SkipLastUpdateFwd\name: + FADD dZr2,dYr2,dYi3 + + st2 {dZr0,dZi0},[pDst],outPointStep + FSUB dZi2,dYi2,dYr3 + + // fsub qz1,qy0,qy1 + fsub dZr1, dYr0, dYr1 + fsub dZi1, dYi0, dYi1 + st2 {dZr2,dZi2},[pDst],outPointStep + + FSUB dZr3,dYr2,dYi3 + st2 {dZr1,dZi1},[pDst],outPointStep + FADD dZi3,dYi2,dYr3 + + // fadd qy0,qx0,qx2 + fadd dYr0, dXr0, dXr2 // u0 for next iteration + fadd dYi0, dXi0, dXi2 + + st2 {dZr3,dZi3},[pDst],setStep + + .endif + + BGT radix4fsGrpZeroSetLoop\name + + // Save subFFTNum and subFFTSize for next stage + str subFFTNum, [pSubFFTNum] + str subFFTSize, [pSubFFTSize] + + .endm + + + + M_START armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace,,d15 + FFTSTAGE "FALSE","FALSE",fwd + M_END + + + + M_START armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace,,d15 + FFTSTAGE "FALSE","TRUE",inv + M_END + + + .end diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_ls_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_ls_s.S new file mode 100644 index 0000000..2fc2e60 --- /dev/null +++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_ls_s.S @@ -0,0 +1,371 @@ +// +// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the LICENSE file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// +// This is a modification of armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.s +// to support float instead of SC32. +// + +// +// Description: +// Compute a Radix 4 FFT stage for a N point complex signal +// +// + + +// Include standard headers + +#include "dl/api/arm/arm64COMM_s.h" +#include "dl/api/arm/omxtypes_s.h" + +// Import symbols required from other files +// (For example tables) + + + + +// Set debugging level +//DEBUG_ON SETL {TRUE} + + +// Guarding implementation by the processor name + + +// Import symbols required from other files +// (For example tables) + //IMPORT armAAC_constTable + +//Input Registers + +#define pSrc x0 +#define pDst x1 +#define pTwiddle x2 +#define pSubFFTNum x3 +#define pSubFFTSize x4 + + + +//Output Registers + + +//Local Scratch Registers + +#define subFFTNum x5 +#define subFFTSize x6 +#define outPointStep x8 +#define grpCount x9 +#define dstStep x10 +#define grpTwStep x13 +#define stepTwiddle x14 +#define twStep x15 +#define step16 x11 +#define step24 x12 + + +// Neon Registers + +#define dButterfly1Real02 v0.2s +#define dButterfly1Real028b v0.8b +#define dButterfly1Imag02 v1.2s +#define dButterfly1Imag028b v1.8b +#define dButterfly1Real13 v2.2s +#define dButterfly1Real138b v2.8b +#define dButterfly1Imag13 v3.2s +#define dButterfly1Imag138b v3.8b +#define dButterfly2Real02 v4.2s +#define dButterfly2Imag02 v5.2s +#define dButterfly2Real13 v6.2s +#define dButterfly2Imag13 v7.2s +#define dXr0 v0.2s +#define dXi0 v1.2s +#define dXr08b v0.8b +#define dXi08b v1.8b +#define dXr1 v2.2s +#define dXi1 v3.2s +#define dXr2 v4.2s +#define dXi2 v5.2s +#define dXr3 v6.2s +#define dXi3 v7.2s + +#define dYr0 v16.2s +#define dYi0 v17.2s +#define dYr1 v18.2s +#define dYi1 v19.2s +#define dYr2 v20.2s +#define dYi2 v21.2s +#define dYr3 v22.2s +#define dYi3 v23.2s + +#define dW1r v8.2s +#define dW1i v9.2s +#define dW2r v10.2s +#define dW2r8b v10.8b +#define dW2i v11.2s +#define dW3r v12.2s +#define dW3r8b v12.8b +#define dW3i v13.2s + +#define dZr0 v14.2s +#define dZi0 v15.2s +#define dZr08b v14.8b +#define dZi08b v15.8b +#define dZr1 v26.2s +#define dZi1 v27.2s +#define dZr2 v28.2s +#define dZi2 v29.2s +#define dZr3 v30.2s +#define dZi3 v31.2s + +#define dZip v24.2s +#define dZip8b v24.8b + + .MACRO FFTSTAGE scaled, inverse , name + + // Define stack arguments + + // Move args values into our work registers + ldr subFFTNum, [pSubFFTNum] + ldr subFFTSize, [pSubFFTSize] + + // pOut0+1 increments pOut0 by 8 bytes + // pOut0+outPointStep == increment of 8*outPointStep bytes + lsl outPointStep,subFFTSize, #3 + + // Update grpCount and grpSize rightaway + + ld2 {dW1r,dW1i},[pTwiddle] // [wi|wr] + MOV step16,#16 + LSL grpCount,subFFTSize,#2 + + ld1 {dW2r},[pTwiddle] // [wi|wr] + MOV subFFTNum,#1 //after the last stage + + ld1 {dW3r},[pTwiddle],step16 // [wi|wr] + MOV stepTwiddle,#0 + + ld1 {dW2i},[pTwiddle],#8 // [wi|wr] + SUB grpTwStep,stepTwiddle,#8 // grpTwStep = -8 to start with + + // update subFFTSize for the next stage + MOV subFFTSize,grpCount + ld1 {dW3i},[pTwiddle],grpTwStep // [wi|wr] + lsl dstStep,outPointStep, #1 + + // AC.r AC.i BD.r BD.i + ld4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc], #32 + ADD dstStep,dstStep,outPointStep // dstStep = 3*outPointStep + + rsb dstStep,dstStep,#16 // dstStep = - 3*outPointStep+16 + MOV step24,#24 + + // AC.r AC.i BD.r BD.i + ld4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc], #32 + + + // Process two groups at a time + +radix4lsGrpLoop\name : + + // VZIP dW2r,dW2i + zip1 dZip, dW2r, dW2i + zip2 dW2i, dW2r, dW2i + mov dW2r8b, dZip8b + + ADD stepTwiddle,stepTwiddle,#16 + + // VZIP dW3r,dW3i + zip1 dZip, dW3r,dW3i + zip2 dW3i, dW3r, dW3i + mov dW3r8b, dZip8b + ADD grpTwStep,stepTwiddle,#4 + + // VUZP dButterfly1Real13, dButterfly2Real13 // B.r D.r + uzp1 dZip, dButterfly1Real13, dButterfly2Real13 // B.r D.r + uzp2 dButterfly2Real13, dButterfly1Real13, dButterfly2Real13 // B.r D.r + mov dButterfly1Real138b, dZip8b + + SUB twStep,stepTwiddle,#16 // -16+stepTwiddle + + // VUZP dButterfly1Imag13, dButterfly2Imag13 // B.i D.i + uzp1 dZip, dButterfly1Imag13, dButterfly2Imag13 // B.i D.i + uzp2 dButterfly2Imag13, dButterfly1Imag13, dButterfly2Imag13 // B.i D.i + mov dButterfly1Imag138b, dZip8b + lsl grpTwStep,grpTwStep,#1 + + // VUZP dButterfly1Real02, dButterfly2Real02 // A.r C.r + uzp1 dZip, dButterfly1Real02, dButterfly2Real02 // A.r C.r + uzp2 dButterfly2Real02, dButterfly1Real02, dButterfly2Real02 // A.r C.r + mov dButterfly1Real028b, dZip8b + rsb grpTwStep,grpTwStep,#0 // -8-2*stepTwiddle + + // VUZP dButterfly1Imag02, dButterfly2Imag02 // A.i C.i + uzp1 dZip, dButterfly1Imag02, dButterfly2Imag02 // A.i C.i + uzp2 dButterfly2Imag02, dButterfly1Imag02, dButterfly2Imag02 // A.i C.i + mov dButterfly1Imag028b, dZip8b + + + // grpCount is multiplied by 4 + SUBS grpCount,grpCount,#8 + + .ifeqs "\inverse", "TRUE" + fmul dZr1,dW1r,dXr1 + fmla dZr1,dW1i,dXi1 // real part + fmul dZi1,dW1r,dXi1 + fmls dZi1,dW1i,dXr1 // imag part + + .else + + fmul dZr1,dW1r,dXr1 + fmls dZr1,dW1i,dXi1 // real part + fmul dZi1,dW1r,dXi1 + fmla dZi1,dW1i,dXr1 // imag part + + .endif + + ld2 {dW1r,dW1i},[pTwiddle],stepTwiddle // [wi|wr] + + .ifeqs "\inverse", "TRUE" + fmul dZr2,dW2r,dXr2 + fmla dZr2,dW2i,dXi2 // real part + fmul dZi2,dW2r,dXi2 + ld1 {dW2r},[pTwiddle],step16 // [wi|wr] + fmls dZi2,dW2i,dXr2 // imag part + + .else + + fmul dZr2,dW2r,dXr2 + fmls dZr2,dW2i,dXi2 // real part + fmul dZi2,dW2r,dXi2 + ld1 {dW2r},[pTwiddle],step16 // [wi|wr] + fmla dZi2,dW2i,dXr2 // imag part + + .endif + + + ld1 {dW2i},[pTwiddle],twStep // [wi|wr] + + // move qX0 so as to load for the next iteration + // MOV qZ0,qX0 + mov dZr08b, dXr08b + mov dZi08b, dXi08b + + .ifeqs "\inverse", "TRUE" + fmul dZr3,dW3r,dXr3 + fmla dZr3,dW3i,dXi3 // real part + fmul dZi3,dW3r,dXi3 + ld1 {dW3r},[pTwiddle],step24 + fmls dZi3,dW3i,dXr3 // imag part + + .else + + fmul dZr3,dW3r,dXr3 + fmls dZr3,dW3i,dXi3 // real part + fmul dZi3,dW3r,dXi3 + ld1 {dW3r},[pTwiddle],step24 + fmla dZi3,dW3i,dXr3 // imag part + + .endif + + ld1 {dW3i},[pTwiddle],grpTwStep // [wi|wr] + + // Don't do the load on the last iteration so we don't read past the end + // of pSrc. + bne skipIncrement\name + add pSrc, pSrc, #64 +skipIncrement\name: + beq radix4lsSkipRead\name + // AC.r AC.i BD.r BD.i + ld4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc], #32 + + // AC.r AC.i BD.r BD.i + ld4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc], #32 +radix4lsSkipRead\name: + + // finish first stage of 4 point FFT + + // fadd qY0,qZ0,qZ2 + fadd dYr0,dZr0,dZr2 + fadd dYi0,dZi0,dZi2 + // fsub qY2,qZ0,qZ2 + fsub dYr2,dZr0,dZr2 + fsub dYi2,dZi0,dZi2 + // fadd qY1,qZ1,qZ3 + fadd dYr1,dZr1,dZr3 + fadd dYi1,dZi1,dZi3 + // fsub qY3,qZ1,qZ3 + fsub dYr3,dZr1,dZr3 + fsub dYi3,dZi1,dZi3 + + + // finish second stage of 4 point FFT + + .ifeqs "\inverse", "TRUE" + + // fsub qZ0,qY2,qY1 + fsub dZr0,dYr2,dYr1 + fsub dZi0,dYi2,dYi1 + fadd dZr3,dYr0,dYi3 + st2 {dZr0,dZi0},[pDst],outPointStep + fsub dZi3,dYi0,dYr3 + + // fadd qZ2,qY2,qY1 + fadd dZr2,dYr2,dYr1 + fadd dZi2,dYi2,dYi1 + + st2 {dZr3,dZi3},[pDst],outPointStep + + fsub dZr1,dYr0,dYi3 + st2 {dZr2,dZi2},[pDst],outPointStep + fadd dZi1,dYi0,dYr3 + + // dstStep = -outPointStep + 16 + st2 {dZr1,dZi1},[pDst],dstStep + + + .else + + // fsub qZ0,qY2,qY1 + fsub dZr0,dYr2,dYr1 + fsub dZi0,dYi2,dYi1 + + fsub dZr1,dYr0,dYi3 + st2 {dZr0,dZi0},[pDst],outPointStep + fadd dZi1,dYi0,dYr3 + + // fadd qZ2,qY2,qY1 + fadd dZr2,dYr2,dYr1 + fadd dZi2,dYi2,dYi1 + + st2 {dZr1,dZi1},[pDst],outPointStep + + fadd dZr3,dYr0,dYi3 + st2 {dZr2,dZi2},[pDst],outPointStep + fsub dZi3,dYi0,dYr3 + + // dstStep = -outPointStep + 16 + st2 {dZr3,dZi3},[pDst],dstStep + + + .endif + + BGT radix4lsGrpLoop\name + + .endm + + + M_START armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace,,d15 + FFTSTAGE "FALSE","FALSE",fwd + M_END + + + M_START armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace,,d15 + FFTSTAGE "FALSE","TRUE",inv + M_END + + + .end diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_s.S new file mode 100644 index 0000000..830fd16 --- /dev/null +++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_s.S @@ -0,0 +1,339 @@ +// +// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the LICENSE file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// +// +// This is a modification of armSP_FFT_CToC_SC32_Radix4_unsafe_s.s +// to support float instead of SC32. +// + +// +// Description: +// Compute a Radix 4 FFT stage for a N point complex signal +// +// + + +// Include standard headers + +#include "dl/api/arm/arm64COMM_s.h" +#include "dl/api/arm/omxtypes_s.h" + + +// Import symbols required from other files +// (For example tables) + + + + +// Set debugging level +//DEBUG_ON SETL {TRUE} + + + +// Guarding implementation by the processor name + + + + +// Guarding implementation by the processor name + + +// Import symbols required from other files +// (For example tables) + + +//Input Registers + +#define pSrc x0 +#define pDst x1 +#define pTwiddle x2 +#define pSubFFTNum x3 +#define pSubFFTSize x4 + + + +//Output Registers + + +//Local Scratch Registers + +#define subFFTNum x5 +#define subFFTSize x6 +#define grpCount x7 +#define grpCount32 w7 +#define pointStep x8 +#define pointStep32 w8 +#define outPointStep x9 +#define stepTwiddle x10 +#define setCount x11 +#define srcStep x12 +#define setStep x13 +#define dstStep x14 +#define twStep x15 + +// Neon Registers + +#define dW1 v0.2s +#define dW2 v1.2s +#define dW3 v2.2s + +#define dXr0 v4.2s +#define dXi0 v5.2s +#define dXr1 v6.2s +#define dXi1 v7.2s +#define dXr2 v8.2s +#define dXi2 v9.2s +#define dXr3 v10.2s +#define dXi3 v11.2s +#define dYr0 v12.2s +#define dYi0 v13.2s +#define dYr1 v14.2s +#define dYi1 v15.2s +#define dYr2 v16.2s +#define dYi2 v17.2s +#define dYr3 v18.2s +#define dYi3 v19.2s +#define dZr0 v20.2s +#define dZi0 v21.2s +#define dZr1 v22.2s +#define dZi1 v23.2s +#define dZr2 v24.2s +#define dZi2 v25.2s +#define dZr3 v26.2s +#define dZi3 v27.2s + + .MACRO FFTSTAGE scaled, inverse , name + + // Define stack arguments + + // Move args values into our work registers + ldr subFFTNum, [pSubFFTNum] + ldr subFFTSize, [pSubFFTSize] + + // Update grpCount and grpSize rightaway inorder to reuse + // pGrpCount and pGrpSize regs + + LSL grpCount,subFFTSize,#2 + LSR subFFTNum,subFFTNum,#2 + MOV subFFTSize,grpCount + + ld1 {dW1},[pTwiddle] //[wi | wr] + // pT0+1 increments pT0 by 8 bytes + // pT0+pointStep = increment of 8*pointStep bytes = 2*grpSize bytes + lsl pointStep,subFFTNum, #1 + + // pOut0+1 increments pOut0 by 8 bytes + // pOut0+outPointStep == increment of 8*outPointStep bytes + // = 2*size bytes + + MOV stepTwiddle,#0 + ld1 {dW2},[pTwiddle] //[wi | wr] + smull outPointStep,grpCount32,pointStep32 + + LSL pointStep,pointStep,#2 // 2*grpSize + + ld1 {dW3},[pTwiddle] //[wi | wr] + lsl srcStep,pointStep, #1 // srcStep = 2*pointStep + + ADD setStep,srcStep,pointStep // setStep = 3*pointStep + + rsb setStep,setStep,#0 // setStep = - 3*pointStep + SUB srcStep,srcStep,#16 // srcStep = 2*pointStep-16 + + lsl dstStep,outPointStep, #1 + + ADD dstStep,dstStep,outPointStep // dstStep = 3*outPointStep + // dstStep = - 3*outPointStep+16 + rsb dstStep,dstStep,#16 + + +radix4GrpLoop\name : + + ld2 {dXr0,dXi0},[pSrc],pointStep // data[0] + ADD stepTwiddle,stepTwiddle,pointStep + ld2 {dXr1,dXi1},[pSrc],pointStep // data[1] + // set pTwiddle to the first point + ADD pTwiddle,pTwiddle,stepTwiddle + ld2 {dXr2,dXi2},[pSrc],pointStep // data[2] + lsl twStep,stepTwiddle, #2 + + // data[3] & update pSrc for the next set + ld2 {dXr3,dXi3},[pSrc],setStep + SUB twStep,stepTwiddle,twStep // twStep = -3*stepTwiddle + + lsr setCount,pointStep, #3 + + // set pSrc to data[0] of the next set + ADD pSrc,pSrc,#16 + // increment to data[1] of the next set + ADD pSrc,pSrc,pointStep + + + // Loop on the sets + +radix4SetLoop\name : + + + + .ifeqs "\inverse", "TRUE" + fmul dZr1,dXr1,dW1[0] + fmul dZi1,dXi1,dW1[0] + fmul dZr2,dXr2,dW2[0] + fmul dZi2,dXi2,dW2[0] + fmul dZr3,dXr3,dW3[0] + fmul dZi3,dXi3,dW3[0] + + fmla dZr1,dXi1,dW1[1] // real part + fmls dZi1,dXr1,dW1[1] // imag part + + // data[1] for next iteration + ld2 {dXr1,dXi1},[pSrc],pointStep + + fmla dZr2,dXi2,dW2[1] // real part + fmls dZi2,dXr2,dW2[1] // imag part + + // data[2] for next iteration + ld2 {dXr2,dXi2},[pSrc],pointStep + + fmla dZr3,dXi3,dW3[1] // real part + fmls dZi3,dXr3,dW3[1] // imag part + .else + fmul dZr1,dXr1,dW1[0] + fmul dZi1,dXi1,dW1[0] + fmul dZr2,dXr2,dW2[0] + fmul dZi2,dXi2,dW2[0] + fmul dZr3,dXr3,dW3[0] + fmul dZi3,dXi3,dW3[0] + + fmls dZr1,dXi1,dW1[1] // real part + fmla dZi1,dXr1,dW1[1] // imag part + + // data[1] for next iteration + ld2 {dXr1,dXi1},[pSrc],pointStep + + fmls dZr2,dXi2,dW2[1] // real part + fmla dZi2,dXr2,dW2[1] // imag part + + // data[2] for next iteration + ld2 {dXr2,dXi2},[pSrc],pointStep + + fmls dZr3,dXi3,dW3[1] // real part + fmla dZi3,dXr3,dW3[1] // imag part + .endif + + // data[3] & update pSrc to data[0] + // But don't read on the very last iteration because that reads past + // the end of pSrc. The last iteration is grpCount = 4, setCount = 2. + cmp grpCount, #4 + + b.ne skipUpdate\name + cmp setCount, #2 + b.ne skipUpdate\name + add pSrc, pSrc, setStep + beq radix4SkipRead\name +skipUpdate\name: + ld2 {dXr3,dXi3},[pSrc],setStep +radix4SkipRead\name: + + SUBS setCount,setCount,#2 + + // finish first stage of 4 point FFT + // fadd qY0,qX0,qZ2 + // fsub qY2,qX0,qZ2 + fadd dYr0,dXr0,dZr2 + fsub dYr2,dXr0,dZr2 + fadd dYi0,dXi0,dZi2 + fsub dYi2,dXi0,dZi2 + + // data[0] for next iteration + ld2 {dXr0,dXi0},[pSrc], #16 + // fadd qY1,qZ1,qZ3 + // fsub qY3,qZ1,qZ3 + fadd dYr1,dZr1,dZr3 + fsub dYr3,dZr1,dZr3 + fadd dYi1,dZi1,dZi3 + fsub dYi3,dZi1,dZi3 + + // finish second stage of 4 point FFT + + // fsub qZ0,qY2,qY1 + fsub dZr0,dYr2,dYr1 + fsub dZi0,dYi2,dYi1 + + .ifeqs "\inverse", "TRUE" + + fadd dZr3,dYr0,dYi3 + st2 {dZr0,dZi0},[pDst],outPointStep + fsub dZi3,dYi0,dYr3 + + // fadd qZ2,qY2,qY1 + fadd dZr2,dYr2,dYr1 + fadd dZi2,dYi2,dYi1 + + st2 {dZr3,dZi3},[pDst],outPointStep + + fsub dZr1,dYr0,dYi3 + st2 {dZr2,dZi2},[pDst],outPointStep + fadd dZi1,dYi0,dYr3 + + st2 {dZr1,dZi1},[pDst],dstStep + + + .else + + fsub dZr1,dYr0,dYi3 + st2 {dZr0,dZi0},[pDst],outPointStep + fadd dZi1,dYi0,dYr3 + + // fadd qZ2,qY2,qY1 + fadd dZr2,dYr2,dYr1 + fadd dZi2,dYi2,dYi1 + + st2 {dZr1,dZi1},[pDst],outPointStep + + fadd dZr3,dYr0,dYi3 + st2 {dZr2,dZi2},[pDst],outPointStep + fsub dZi3,dYi0,dYr3 + + st2 {dZr3,dZi3},[pDst],dstStep + + + .endif + + // increment to data[1] of the next set + ADD pSrc,pSrc,pointStep + BGT radix4SetLoop\name + + + ld1 {dW1},[pTwiddle],stepTwiddle //[wi | wr] + // subtract 4 since grpCount multiplied by 4 + SUBS grpCount,grpCount,#4 + ld1 {dW2},[pTwiddle],stepTwiddle //[wi | wr] + // increment pSrc for the next grp + ADD pSrc,pSrc,srcStep + ld1 {dW3},[pTwiddle],twStep //[wi | wr] + BGT radix4GrpLoop\name + + str subFFTNum, [pSubFFTNum] + str subFFTSize, [pSubFFTSize] + + .endm + + + M_START armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace,,d15 + FFTSTAGE "FALSE","FALSE",FWD + M_END + + + M_START armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace,,d15 + FFTSTAGE "FALSE","TRUE",INV + M_END + + + .end diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S new file mode 100644 index 0000000..f348e6a --- /dev/null +++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S @@ -0,0 +1,473 @@ +// +// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the LICENSE file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// +// This is a modification of armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.s +// to support float instead of SC32. +// + +// +// Description: +// Compute a first stage Radix 8 FFT stage for a N point complex signal +// +// + + +// Include standard headers + +#include "dl/api/arm/arm64COMM_s.h" +#include "dl/api/arm/omxtypes_s.h" + +// Import symbols required from other files +// (For example tables) + + +// Set debugging level +//DEBUG_ON SETL {TRUE} + + + +// Guarding implementation by the processor name + + + + +// Guarding implementation by the processor name + +//Input Registers + +#define pSrc x0 +#define pDst x1 +#define pTwiddle x2 +#define pSubFFTNum x3 +#define pSubFFTSize x4 + + +//Output Registers + + +//Local Scratch Registers + +#define subFFTNum x5 +#define subFFTSize x6 +#define grpSize x7 +// Reuse grpSize as setCount +#define setCount x7 +#define pointStep x8 +#define outPointStep x8 +#define setStep x9 +#define step1 x10 +#define step2 x11 +#define t0 w12 + + +// Neon Registers + +#define dXr0 v0.2s +#define dXi0 v1.2s +#define dXr1 v2.2s +#define dXi1 v3.2s +#define dXr2 v4.2s +#define dXi2 v5.2s +#define dXr3 v6.2s +#define dXi3 v7.2s +#define dXr4 v8.2s +#define dXi4 v9.2s +#define dXr5 v10.2s +#define dXi5 v11.2s +#define dXr6 v12.2s +#define dXi6 v13.2s +#define dXr7 v14.2s +#define dXi7 v15.2s +#define qX0 v0.4s +#define qX1 v1.4s +#define qX2 v2.4s +#define qX3 v3.4s +#define qX4 v4.4s +#define qX5 v5.4s +#define qX6 v6.4s +#define qX7 v7.4s + +#define dUr0 v16.2s +#define dUi0 v17.2s +#define dUr2 v18.2s +#define dUi2 v19.2s +#define dUr4 v20.2s +#define dUi4 v21.2s +#define dUr6 v22.2s +#define dUi6 v23.2s +#define dUr1 v24.2s +#define dUi1 v25.2s +#define dUr3 v26.2s +#define dUi3 v27.2s +#define dUr5 v28.2s +#define dUi5 v29.2s +// reuse dXr7 and dXi7 +#define dUr7 v30.2s +#define dUi7 v31.2s +#define qU0 v8.4s +#define qU1 v12.4s +#define qU2 v9.4s +#define qU3 v13.4s +#define qU4 v10.4s +#define qU5 v14.4s +#define qU6 v11.4s +#define qU7 v15.4s + + +#define dVr0 v24.2s +#define dVi0 v25.2s +#define dVr2 v26.2s +#define dVi2 v27.2s +#define dVr4 v28.2s +#define dVi4 v29.2s +#define dVr6 v30.2s +#define dVi6 v31.2s +#define dVr1 v16.2s +#define dVi1 v17.2s +#define dVr3 v18.2s +#define dVi3 v19.2s +#define dVr5 v20.2s +#define dVi5 v21.2s +#define dVr7 v22.2s +#define dVi7 v23.2s +#define qV0 v12.4s +#define qV1 v8.4s +#define qV2 v13.4s +#define qV3 v9.4s +#define qV4 v14.4s +#define qV5 v10.4s +#define qV6 v15.4s +#define qV7 v11.4s + +#define dYr0 v16.2s +#define dYi0 v17.2s +#define dYr2 v18.2s +#define dYi2 v19.2s +#define dYr4 v20.2s +#define dYi4 v21.2s +#define dYr6 v22.2s +#define dYi6 v23.2s +#define dYr1 v24.2s +#define dYi1 v25.2s +#define dYr3 v26.2s +#define dYi3 v27.2s +#define dYr5 v28.2s +#define dYi5 v29.2s +#define dYr7 v30.2s +#define dYi7 v31.2s +#define qY0 v8.4s +#define qY1 v12.4s +#define qY2 v9.4s +#define qY3 v13.4s +#define qY4 v10.4s +#define qY5 v14.4s +#define qY6 v11.4s +#define qY7 v15.4s + +#define dT0 v14.2s +#define dT0s v14.s +#define dT1 v15.2s + + .MACRO FFTSTAGE scaled, inverse, name + + // Define stack arguments + + // Move args values into our work registers + ldr subFFTNum, [pSubFFTNum] + ldr subFFTSize, [pSubFFTSize] + + // Update pSubFFTSize and pSubFFTNum regs + // subFFTSize = 1 for the first stage + + movz t0, 0x3f35, lsl #16 // High half word of sqrt(1/2). + movk t0, 0x04f3 // Low half word of sqrt(1/2). + MOV subFFTSize,#8 + + // Note: setCount = subFFTNum/8 (reuse the grpSize reg for setCount) + LSR grpSize,subFFTNum,#3 + MOV subFFTNum,grpSize + + + // pT0+1 increments pT0 by 8 bytes + // pT0+pointStep = increment of 8*pointStep bytes = grpSize bytes + // Note: outPointStep = pointStep for firststage + + lsl pointStep,grpSize, #3 + + + // Calculate the step of input data for the next set + //MOV step1,pointStep,LSL #1 // step1 = 2*pointStep + ld2 {dXr0,dXi0},[pSrc],pointStep // data[0] + lsl step1,grpSize, #4 + lsl step2,pointStep, #3 + + ld2 {dXr1,dXi1},[pSrc],pointStep // data[1] + SUB step2,step2,pointStep // step2 = 7*pointStep + // setStep = - 7*pointStep+16 + rsb setStep,step2,#16 + + ld2 {dXr2,dXi2},[pSrc],pointStep // data[2] + ld2 {dXr3,dXi3},[pSrc],pointStep // data[3] + ld2 {dXr4,dXi4},[pSrc],pointStep // data[4] + ld2 {dXr5,dXi5},[pSrc],pointStep // data[5] + ld2 {dXr6,dXi6},[pSrc],pointStep // data[6] + // data[7] & update pSrc for the next set + // setStep = -7*pointStep + 16 + ld2 {dXr7,dXi7},[pSrc],setStep + // grp = 0 a special case since all the twiddle factors are 1 + // Loop on the sets + +radix8fsGrpZeroSetLoop\name : + + // Decrement setcount + SUBS setCount,setCount,#2 + + + // finish first stage of 8 point FFT + + // fadd qU0,qX0,qX4 + // fadd qU2,qX1,qX5 + // fadd qU4,qX2,qX6 + // fadd qU6,qX3,qX7 + fadd dUr0,dXr0,dXr4 + fadd dUr2,dXr1,dXr5 + fadd dUr4,dXr2,dXr6 + fadd dUr6,dXr3,dXr7 + fadd dUi0,dXi0,dXi4 + fadd dUi2,dXi1,dXi5 + fadd dUi4,dXi2,dXi6 + fadd dUi6,dXi3,dXi7 + + // finish second stage of 8 point FFT + + // fadd qV0,qU0,qU4 + // fsub qV2,qU0,qU4 + // fadd qV4,qU2,qU6 + // fsub qV6,qU2,qU6 + fadd dVr0,dUr0,dUr4 + fsub dVr2,dUr0,dUr4 + fadd dVr4,dUr2,dUr6 + fsub dVr6,dUr2,dUr6 + fadd dVi0,dUi0,dUi4 + fsub dVi2,dUi0,dUi4 + fadd dVi4,dUi2,dUi6 + fsub dVi6,dUi2,dUi6 + + // finish third stage of 8 point FFT + + // fadd qY0,qV0,qV4 + // fsub qY4,qV0,qV4 + fadd dYr0,dVr0,dVr4 + fsub dYr4,dVr0,dVr4 + fadd dYi0,dVi0,dVi4 + fsub dYi4,dVi0,dVi4 + + st2 {dYr0,dYi0},[pDst],step1 // store y0 + + .ifeqs "\inverse", "TRUE" + + fsub dYr2,dVr2,dVi6 + fadd dYi2,dVi2,dVr6 + + fadd dYr6,dVr2,dVi6 + st2 {dYr2,dYi2},[pDst],step1 // store y2 + fsub dYi6,dVi2,dVr6 + + // fsub qU1,qX0,qX4 + fsub dUr1,dXr0,dXr4 + fsub dUi1,dXi0,dXi4 + + st2 {dYr4,dYi4},[pDst],step1 // store y4 + + // fsub qU3,qX1,qX5 + // fsub qU5,qX2,qX6 + fsub dUr3,dXr1,dXr5 + fsub dUr5,dXr2,dXr6 + fsub dUi3,dXi1,dXi5 + fsub dUi5,dXi2,dXi6 + + st2 {dYr6,dYi6},[pDst],step1 // store y6 + + .ELSE + + fadd dYr6,dVr2,dVi6 + fsub dYi6,dVi2,dVr6 + + fsub dYr2,dVr2,dVi6 + st2 {dYr6,dYi6},[pDst],step1 // store y2 + fadd dYi2,dVi2,dVr6 + + + // fsub qU1,qX0,qX4 + fsub dUr1,dXr0,dXr4 + fsub dUi1,dXi0,dXi4 + + st2 {dYr4,dYi4},[pDst],step1 // store y4 + + // fsub qU3,qX1,qX5 + // fsub qU5,qX2,qX6 + fsub dUr3,dXr1,dXr5 + fsub dUr5,dXr2,dXr6 + fsub dUi3,dXi1,dXi5 + fsub dUi5,dXi2,dXi6 + + st2 {dYr2,dYi2},[pDst],step1 // store y6 + + + .ENDIF + + // finish first stage of 8 point FFT + + // fsub qU7,qX3,qX7 + fsub dUr7,dXr3,dXr7 + fsub dUi7,dXi3,dXi7 + + mov dT0s[0], t0 + + // finish second stage of 8 point FFT + + fsub dVr1,dUr1,dUi5 + // data[0] for next iteration + ld2 {dXr0,dXi0},[pSrc],pointStep + fadd dVi1,dUi1,dUr5 + fadd dVr3,dUr1,dUi5 + ld2 {dXr1,dXi1},[pSrc],pointStep // data[1] + fsub dVi3,dUi1,dUr5 + + fsub dVr5,dUr3,dUi7 + ld2 {dXr2,dXi2},[pSrc],pointStep // data[2] + fadd dVi5,dUi3,dUr7 + fadd dVr7,dUr3,dUi7 + ld2 {dXr3,dXi3},[pSrc],pointStep // data[3] + fsub dVi7,dUi3,dUr7 + + // finish third stage of 8 point FFT + + .ifeqs "\inverse", "TRUE" + + // calculate a*v5 + fmul dT1,dVr5,dT0[0] // use dVi0 for dT1 + + ld2 {dXr4,dXi4},[pSrc],pointStep // data[4] + fmul dVi5,dVi5,dT0[0] + + ld2 {dXr5,dXi5},[pSrc],pointStep // data[5] + fsub dVr5,dT1,dVi5 // a * V5 + fadd dVi5,dT1,dVi5 + + ld2 {dXr6,dXi6},[pSrc],pointStep // data[6] + + // calculate b*v7 + fmul dT1,dVr7,dT0[0] + fmul dVi7,dVi7,dT0[0] + + // fadd qY1,qV1,qV5 + // fsub qY5,qV1,qV5 + fadd dYr1,dVr1,dVr5 + fsub dYr5,dVr1,dVr5 + fadd dYi1,dVi1,dVi5 + fsub dYi5,dVi1,dVi5 + + fadd dVr7,dT1,dVi7 // b * V7 + fsub dVi7,dVi7,dT1 + SUB pDst, pDst, step2 // set pDst to y1 + + // On the last iteration, this will read past the end of pSrc, + // so skip this read. + BEQ radix8SkipLastUpdateInv\name + ld2 {dXr7,dXi7},[pSrc],setStep // data[7] +radix8SkipLastUpdateInv\name: + + fsub dYr3,dVr3,dVr7 + fsub dYi3,dVi3,dVi7 + st2 {dYr1,dYi1},[pDst],step1 // store y1 + fadd dYr7,dVr3,dVr7 + fadd dYi7,dVi3,dVi7 + + + st2 {dYr3,dYi3},[pDst],step1 // store y3 + st2 {dYr5,dYi5},[pDst],step1 // store y5 + st2 {dYr7,dYi7},[pDst] // store y7 + ADD pDst, pDst, #16 + + .ELSE + + // calculate b*v7 + fmul dT1,dVr7,dT0[0] + ld2 {dXr4,dXi4},[pSrc],pointStep // data[4] + fmul dVi7,dVi7,dT0[0] + + ld2 {dXr5,dXi5},[pSrc],pointStep // data[5] + fadd dVr7,dT1,dVi7 // b * V7 + fsub dVi7,dVi7,dT1 + + ld2 {dXr6,dXi6},[pSrc],pointStep // data[6] + + // calculate a*v5 + fmul dT1,dVr5,dT0[0] // use dVi0 for dT1 + fmul dVi5,dVi5,dT0[0] + + fadd dYr7,dVr3,dVr7 + fadd dYi7,dVi3,dVi7 + SUB pDst, pDst, step2 // set pDst to y1 + + fsub dVr5,dT1,dVi5 // a * V5 + fadd dVi5,dT1,dVi5 + + // On the last iteration, this will read past the end of pSrc, + // so skip this read. + BEQ radix8SkipLastUpdateFwd\name + ld2 {dXr7,dXi7},[pSrc],setStep // data[7] +radix8SkipLastUpdateFwd\name: + + // fsub qY5,qV1,qV5 + fsub dYr5,dVr1,dVr5 + fsub dYi5,dVi1,dVi5 + + fsub dYr3,dVr3,dVr7 + st2 {dYr7,dYi7},[pDst],step1 // store y1 + fsub dYi3,dVi3,dVi7 + + // fadd qY1,qV1,qV5 + fadd dYr1,dVr1,dVr5 + fadd dYi1,dVi1,dVi5 + + st2 {dYr5,dYi5},[pDst],step1 // store y3 + st2 {dYr3,dYi3},[pDst],step1 // store y5 + st2 {dYr1,dYi1},[pDst],#16 // store y7 + + .ENDIF + + + // update pDst for the next set + SUB pDst, pDst, step2 + BGT radix8fsGrpZeroSetLoop\name + + // Save subFFTNum and subFFTSize for next stage + str subFFTNum, [pSubFFTNum] + str subFFTSize, [pSubFFTSize] + + .endm + + + // Allocate stack memory required by the function + + + M_START armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace,,d15 + FFTSTAGE "FALSE","FALSE",FWD + M_END + + + M_START armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace,,d15 + FFTSTAGE "FALSE","TRUE",INV + M_END + + + + .end diff --git a/dl/sp/src/arm/arm64/omxSP_FFTFwd_CToC_FC32.c b/dl/sp/src/arm/arm64/omxSP_FFTFwd_CToC_FC32.c new file mode 100644 index 0000000..f29796b --- /dev/null +++ b/dl/sp/src/arm/arm64/omxSP_FFTFwd_CToC_FC32.c @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "dl/api/omxtypes.h" +#include "dl/sp/api/armSP.h" +#include "dl/sp/api/omxSP.h" + +extern void armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace( + const OMX_FC32* pSrc, + OMX_FC32* pDst, + OMX_FC32* pTwiddle, + long* subFFTNum, + long* subFFTSize); + +extern void armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace( + const OMX_FC32* pSrc, + OMX_FC32* pDst, + OMX_FC32* pTwiddle, + long* subFFTNum, + long* subFFTSize); + +extern void armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace( + const OMX_FC32* pSrc, + OMX_FC32* pDst, + OMX_FC32* pTwiddle, + long* subFFTNum, + long* subFFTSize); + +extern void armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace( + const OMX_FC32* pSrc, + OMX_FC32* pDst, + OMX_FC32* pTwiddle, + long* subFFTNum, + long* subFFTSize); + +extern void armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace( + const OMX_FC32* pSrc, + OMX_FC32* pDst, + OMX_FC32* pTwiddle, + long* subFFTNum, + long* subFFTSize); + +extern void armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace( + const OMX_FC32* pSrc, + OMX_FC32* pDst, + OMX_FC32* pTwiddle, + long* subFFTNum, + long* subFFTSize); + +extern void armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace( + const OMX_FC32* pSrc, + OMX_FC32* pDst, + OMX_FC32* pTwiddle, + long* subFFTNum, + long* subFFTSize); + +/** + * Function: omxSP_FFTFwd_CToC_FC32_Sfs (2.2.4.2.2) + * + * Description: + * Compute an FFT for a complex signal of length of 2^order, + * where 0 <= order <= 15. + * Transform length is determined by the specification structure, which + * must be initialized prior to calling the FFT function using the appropriate + * helper, i.e., <FFTInit_C_sc32> or <FFTInit_C_SC16>. The relationship + * between the input and output sequences can be expressed in terms of the + * DFT, i.e., + * + * X[k] = SUM[n=0...N-1]x[n].e^(-jnk.2.pi/N) + * k = 0,1,2,..., N-1 + * N = 2^order + * + * Input Arguments: + * pSrc - pointer to the input signal, a complex-valued vector of length + * 2^order; must be aligned on a 32 byte boundary. + * pFFTSpec - pointer to the preallocated and initialized specification + * structure + * + * Output Arguments: + * pDst - pointer to the complex-valued output vector, of length 2^order; + * must be aligned on an 32-byte boundary. + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - returned if one or more of the following conditions + * is true: + * - one or more of the following pointers is NULL: pSrc, pDst, or + * pFFTSpec. + * - pSrc or pDst is not 32-byte aligned + * + */ + +OMXResult omxSP_FFTFwd_CToC_FC32_Sfs(const OMX_FC32* pSrc, + OMX_FC32* pDst, + const OMXFFTSpec_C_FC32* pFFTSpec) { + ARMsFFTSpec_FC32* spec = (ARMsFFTSpec_FC32*)pFFTSpec; + int order; + long subFFTSize; + long subFFTNum; + OMX_FC32* pTwiddle; + OMX_FC32* pOut; + + /* + * Check args are not NULL and the source and destination pointers + * are properly aligned. + */ + if (!validateParametersFC32(pSrc, pDst, spec)) + return OMX_Sts_BadArgErr; + + order = fastlog2(spec->N); + + subFFTSize = 1; + subFFTNum = spec->N; + pTwiddle = spec->pTwiddle; + pOut = spec->pBuf; + + if (order > 3) { + OMX_FC32* argDst; + + /* + * Set up argDst and pOut appropriately so that pOut = pDst for + * the very last FFT stage. + */ + if ((order & 2) == 0) { + argDst = pOut; + pOut = pDst; + } else { + argDst = pDst; + } + + /* + * Odd order uses a radix 8 first stage; even order, a radix 4 + * first stage. + */ + if (order & 1) { + armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace( + pSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize); + } else { + armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace( + pSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize); + } + + /* + * Now use radix 4 stages to finish rest of the FFT + */ + if (subFFTNum >= 4) { + while (subFFTNum > 4) { + OMX_FC32* tmp; + + armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace( + argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize); + /* + * Swap argDst and pOut + */ + tmp = pOut; + pOut = argDst; + argDst = tmp; + } + + armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace( + argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize); + } + } else if (order == 3) { + armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace( + pSrc, pDst, pTwiddle, &subFFTNum, &subFFTSize); + armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace( + pDst, pOut, pTwiddle, &subFFTNum, &subFFTSize); + armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace( + pOut, pDst, pTwiddle, &subFFTNum, &subFFTSize); + } else if (order == 2) { + armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace( + pSrc, pOut, pTwiddle, &subFFTNum, &subFFTSize); + armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace( + pOut, pDst, pTwiddle, &subFFTNum, &subFFTSize); + } else { + /* Order = 1 */ + armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace( + pSrc, pDst, pTwiddle, &subFFTNum, &subFFTSize); + } + + return OMX_Sts_NoErr; +} diff --git a/dl/sp/src/arm/arm64/omxSP_FFTFwd_RToCCS_F32.c b/dl/sp/src/arm/arm64/omxSP_FFTFwd_RToCCS_F32.c new file mode 100644 index 0000000..f1e503e --- /dev/null +++ b/dl/sp/src/arm/arm64/omxSP_FFTFwd_RToCCS_F32.c @@ -0,0 +1,213 @@ +/* + * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "dl/api/omxtypes.h" +#include "dl/sp/api/armSP.h" +#include "dl/sp/api/omxSP.h" + +extern void armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace( + const OMX_FC32* pSrc, + OMX_FC32* pDst, + OMX_FC32* pTwiddle, + long* subFFTNum, + long* subFFTSize); + +extern void armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace( + const OMX_FC32* pSrc, + OMX_FC32* pDst, + OMX_FC32* pTwiddle, + long* subFFTNum, + long* subFFTSize); + +extern void armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace( + const OMX_FC32* pSrc, + OMX_FC32* pDst, + OMX_FC32* pTwiddle, + long* subFFTNum, + long* subFFTSize); + +extern void armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace( + const OMX_FC32* pSrc, + OMX_FC32* pDst, + OMX_FC32* pTwiddle, + long* subFFTNum, + long* subFFTSize); + +extern void armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace( + const OMX_FC32* pSrc, + OMX_FC32* pDst, + OMX_FC32* pTwiddle, + long* subFFTNum, + long* subFFTSize); + +extern void armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace( + const OMX_FC32* pSrc, + OMX_FC32* pDst, + OMX_FC32* pTwiddle, + long* subFFTNum, + long* subFFTSize); + +extern void armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace( + const OMX_FC32* pSrc, + OMX_FC32* pDst, + OMX_FC32* pTwiddle, + long* subFFTNum, + long* subFFTSize); + +extern void ComplexToRealFixup(OMX_FC32* pSrc, + OMX_F32* pDst, + const OMX_FC32* pTwiddle, + OMX_F32* pBuf, + long N); + +/** + * Function: omxSP_FFTFwd_CToC_FC32_Sfs (2.2.4.2.2) + * + * Description: + * Compute an FFT for a complex signal of length of 2^order, + * where 0 <= order <= 15. + * Transform length is determined by the specification structure, which + * must be initialized prior to calling the FFT function using the appropriate + * helper, i.e., <FFTInit_C_sc32> or <FFTInit_C_SC16>. The relationship + * between the input and output sequences can be expressed in terms of the + * DFT, i.e., + * + * X[k] = SUM[n=0...N-1]x[n].e^(-jnk.2.pi/N) + * k = 0,1,2,..., N-1 + * N = 2^order + * + * Input Arguments: + * pSrc - pointer to the input signal, a complex-valued vector of length + * 2^order; must be aligned on a 32 byte boundary. + * pFFTSpec - pointer to the preallocated and initialized specification + * structure + * + * Output Arguments: + * pDst - pointer to the complex-valued output vector, of length 2^order; + * must be aligned on an 32-byte boundary. + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - returned if one or more of the following conditions + * is true: + * - one or more of the following pointers is NULL: pSrc, pDst, or + * pFFTSpec. + * - pSrc or pDst is not 32-byte aligned + * + */ + +OMXResult omxSP_FFTFwd_RToCCS_F32_Sfs(const OMX_F32* pSrc, + OMX_F32* pDst, + const OMXFFTSpec_R_F32* pFFTSpec) { + ARMsFFTSpec_R_FC32* spec = (ARMsFFTSpec_R_FC32*)pFFTSpec; + int order; + long subFFTSize; + long subFFTNum; + OMX_FC32* pTwiddle; + OMX_FC32* pOut; + OMX_FC32* pComplexSrc = (OMX_FC32*) pSrc; + OMX_FC32* pComplexDst = (OMX_FC32*) pDst; + + /* + * Check args are not NULL and the source and destination pointers + * are properly aligned. + */ + if (!validateParametersF32(pSrc, pDst, spec)) + return OMX_Sts_BadArgErr; + + /* + * Compute the RFFT using a complex FFT of one less order, so set + * order to be the order of the complex FFT. + */ + order = fastlog2(spec->N) - 1; + + subFFTSize = 1; + subFFTNum = spec->N >> 1; + pTwiddle = spec->pTwiddle; + pOut = (OMX_FC32*) spec->pBuf; + + if (order > 3) { + OMX_FC32* argDst; + OMX_FC32* pComplexDst = (OMX_FC32*) pDst; + + /* + * Set up argDst and pOut appropriately so that pOut = pDst for + * ComplexToRealFixup. + */ + if ((order & 2) != 0) { + argDst = pOut; + pOut = pComplexDst; + } else { + argDst = pComplexDst; + } + + /* + * Odd order uses a radix 8 first stage; even order, a radix 4 + * first stage. + */ + if (order & 1) { + armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace( + pComplexSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize); + } else { + armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace( + pComplexSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize); + } + + /* + * Now use radix 4 stages to finish rest of the FFT + */ + if (subFFTNum >= 4) { + while (subFFTNum > 4) { + OMX_FC32* tmp; + + armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace( + argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize); + /* + * Swap argDst and pOut + */ + tmp = pOut; + pOut = argDst; + argDst = tmp; + } + + armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace( + argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize); + } + } else if (order == 3) { + armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace( + pComplexSrc, pOut, pTwiddle, &subFFTNum, &subFFTSize); + armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace( + pOut, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize); + armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace( + pComplexDst, pOut, pTwiddle, &subFFTNum, &subFFTSize); + } else if (order == 2) { + armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace( + pComplexSrc, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize); + armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace( + pComplexDst, pOut, pTwiddle, &subFFTNum, &subFFTSize); + } else if (order == 1) { + armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace( + pComplexSrc, pOut, pTwiddle, &subFFTNum, &subFFTSize); + } else { + /* Handle complex order 0 specially */ + pOut->Re = pSrc[0]; + pOut->Im = pSrc[1]; + } + + /* + * Complex FFT done. Fix up the complex result to give the correct + * RFFT. + */ + + ComplexToRealFixup(pOut, pDst, pTwiddle, spec->pBuf, spec->N); + + return OMX_Sts_NoErr; +} diff --git a/dl/sp/src/arm/arm64/omxSP_FFTInv_CCSToR_F32.c b/dl/sp/src/arm/arm64/omxSP_FFTInv_CCSToR_F32.c new file mode 100644 index 0000000..84de9cf --- /dev/null +++ b/dl/sp/src/arm/arm64/omxSP_FFTInv_CCSToR_F32.c @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "dl/api/omxtypes.h" +#include "dl/sp/api/armSP.h" +#include "dl/sp/api/omxSP.h" + +extern void armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace( + const OMX_FC32* pSrc, + OMX_FC32* pDst, + OMX_FC32* pTwiddle, + long* subFFTNum, + long* subFFTSize); + +extern void armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace( + const OMX_FC32* pSrc, + OMX_FC32* pDst, + OMX_FC32* pTwiddle, + long* subFFTNum, + long* subFFTSize); + +extern void armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace( + const OMX_FC32* pSrc, + OMX_FC32* pDst, + OMX_FC32* pTwiddle, + long* subFFTNum, + long* subFFTSize); + +extern void armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace( + const OMX_FC32* pSrc, + OMX_FC32* pDst, + OMX_FC32* pTwiddle, + long* subFFTNum, + long* subFFTSize); + +extern void armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace( + const OMX_FC32* pSrc, + OMX_FC32* pDst, + OMX_FC32* pTwiddle, + long* subFFTNum, + long* subFFTSize); + +extern void armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace( + const OMX_FC32* pSrc, + OMX_FC32* pDst, + OMX_FC32* pTwiddle, + long* subFFTNum, + long* subFFTSize); + +extern void armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace( + const OMX_FC32* pSrc, + OMX_FC32* pDst, + OMX_FC32* pTwiddle, + long* subFFTNum, + long* subFFTSize); + +extern void armSP_FFTInv_CCSToR_F32_preTwiddleRadix2( + const OMX_F32* pSrc, + const OMX_FC32* pTwiddle, + OMX_F32* pBuf, + long N); + +/* + * Scale FFT data by 1/|length|. |length| must be a power of two + */ +static inline ScaleRFFTData(OMX_F32* fftData, unsigned length) { + float32_t* data = (float32_t*)fftData; + float32_t scale = 2.0f / length; + + if (length >= 4) { + /* + * Do 4 float elements at a time because |length| is always a + * multiple of 4 when |length| >= 4. + * + * TODO(rtoy): Figure out how to process 8 elements at a time + * using intrinsics or replace this with inline assembly. + */ + do { + float32x4_t x = vld1q_f32(data); + + length -= 4; + x = vmulq_n_f32(x, scale); + vst1q_f32(data, x); + data += 4; + } while (length > 0); + } else if (length == 2) { + float32x2_t x = vld1_f32(data); + x = vmul_n_f32(x, scale); + vst1_f32(data, x); + } else { + fftData[0] *= scale; + } +} + +/** + * Function: omxSP_FFTInv_CCSToR_F32_Sfs + * + * Description: + * These functions compute the inverse FFT for a conjugate-symmetric input + * sequence. Transform length is determined by the specification structure, + * which must be initialized prior to calling the FFT function using + * <FFTInit_R_F32>. For a transform of length M, the input sequence is + * represented using a packed CCS vector of length M+2, and is organized + * as follows: + * + * Index: 0 1 2 3 4 5 . . . M-2 M-1 M M+1 + * Comp: R[0] 0 R[1] I[1] R[2] I[2] . . . R[M/2-1] I[M/2-1] R[M/2] 0 + * + * where R[n] and I[n], respectively, denote the real and imaginary + * components for FFT bin n. Bins are numbered from 0 to M/2, where M + * is the FFT length. Bin index 0 corresponds to the DC component, + * and bin index M/2 corresponds to the foldover frequency. + * + * Input Arguments: + * pSrc - pointer to the complex-valued input sequence represented + * using CCS format, of length (2^order) + 2; must be aligned on a + * 32-byte boundary. + * pFFTSpec - pointer to the preallocated and initialized + * specification structure + * + * Output Arguments: + * pDst - pointer to the real-valued output sequence, of length + * 2^order ; must be aligned on a 32-byte boundary. + * + * Return Value: + * + * OMX_Sts_NoErr - no error + + * OMX_Sts_BadArgErr - bad arguments if one or more of the + * following is true: + * - pSrc, pDst, or pFFTSpec is NULL + * - pSrc or pDst is not aligned on a 32-byte boundary + * + */ +OMXResult omxSP_FFTInv_CCSToR_F32_Sfs( + const OMX_F32* pSrc, + OMX_F32* pDst, + const OMXFFTSpec_R_F32* pFFTSpec) { + ARMsFFTSpec_R_FC32* spec = (ARMsFFTSpec_R_FC32*)pFFTSpec; + int order; + long subFFTSize; + long subFFTNum; + OMX_FC32* pTwiddle; + OMX_FC32* pOut; + OMX_FC32* pComplexSrc; + OMX_FC32* pComplexDst = (OMX_FC32*) pDst; + + /* + * Check args are not NULL and the source and destination pointers + * are properly aligned. + */ + if (!validateParametersF32(pSrc, pDst, spec)) + return OMX_Sts_BadArgErr; + + /* + * Preprocess the input before calling the complex inverse FFT. The + * result is actually stored in the second half of the temp buffer + * in pFFTSpec. + */ + if (spec->N > 1) + armSP_FFTInv_CCSToR_F32_preTwiddleRadix2( + pSrc, spec->pTwiddle, spec->pBuf, spec->N); + + /* + * Do a complex inverse FFT of half size. + */ + order = fastlog2(spec->N) - 1; + + subFFTSize = 1; + subFFTNum = spec->N >> 1; + pTwiddle = spec->pTwiddle; + /* + * The pBuf is split in half. The first half is the temp buffer. The + * second half holds the source data that was placed there by + * armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe. + */ + pOut = (OMX_FC32*) spec->pBuf; + pComplexSrc = pOut + (1 << order); + + + if (order > 3) { + OMX_FC32* argDst; + + /* + * Set up argDst and pOut appropriately so that pOut = pDst for + * the very last FFT stage. + */ + if ((order & 2) == 0) { + argDst = pOut; + pOut = pComplexDst; + } else { + argDst = pComplexDst; + } + + /* + * Odd order uses a radix 8 first stage; even order, a radix 4 + * first stage. + */ + if (order & 1) { + armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace( + pComplexSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize); + } else { + armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace( + pComplexSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize); + } + + /* + * Now use radix 4 stages to finish rest of the FFT + */ + if (subFFTNum >= 4) { + while (subFFTNum > 4) { + OMX_FC32* tmp; + + armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace( + argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize); + /* + * Swap argDst and pOut + */ + tmp = pOut; + pOut = argDst; + argDst = tmp; + } + + armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace( + argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize); + } + } else if (order == 3) { + armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace( + pComplexSrc, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize); + armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace( + pComplexDst, pOut, pTwiddle, &subFFTNum, &subFFTSize); + armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace( + pOut, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize); + } else if (order == 2) { + armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace( + pComplexSrc, pOut, pTwiddle, &subFFTNum, &subFFTSize); + armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace( + pOut, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize); + } else if (order == 1) { + armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace( + pComplexSrc, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize); + } else { + /* Order = 0 */ + *pComplexDst = *pComplexSrc; + } + + ScaleRFFTData(pDst, spec->N); + return OMX_Sts_NoErr; +} + diff --git a/dl/sp/src/arm/arm64/omxSP_FFTInv_CToC_FC32.c b/dl/sp/src/arm/arm64/omxSP_FFTInv_CToC_FC32.c new file mode 100644 index 0000000..eec05e9 --- /dev/null +++ b/dl/sp/src/arm/arm64/omxSP_FFTInv_CToC_FC32.c @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "dl/api/omxtypes.h" +#include "dl/sp/api/armSP.h" +#include "dl/sp/api/omxSP.h" + +extern void armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace( + const OMX_FC32* pSrc, + OMX_FC32* pDst, + OMX_FC32* pTwiddle, + long* subFFTNum, + long* subFFTSize); + +extern void armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace( + const OMX_FC32* pSrc, + OMX_FC32* pDst, + OMX_FC32* pTwiddle, + long* subFFTNum, + long* subFFTSize); + +extern void armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace( + const OMX_FC32* pSrc, + OMX_FC32* pDst, + OMX_FC32* pTwiddle, + long* subFFTNum, + long* subFFTSize); + +extern void armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace( + const OMX_FC32* pSrc, + OMX_FC32* pDst, + OMX_FC32* pTwiddle, + long* subFFTNum, + long* subFFTSize); + +extern void armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace( + const OMX_FC32* pSrc, + OMX_FC32* pDst, + OMX_FC32* pTwiddle, + long* subFFTNum, + long* subFFTSize); + +extern void armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace( + const OMX_FC32* pSrc, + OMX_FC32* pDst, + OMX_FC32* pTwiddle, + long* subFFTNum, + long* subFFTSize); + +extern void armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace( + const OMX_FC32* pSrc, + OMX_FC32* pDst, + OMX_FC32* pTwiddle, + long* subFFTNum, + long* subFFTSize); + +/* + * Scale FFT data by 1/|length|. |length| must be a power of two + */ +static inline ScaleFFTData(OMX_FC32* fftData, unsigned length) { + float32_t* data = (float32_t*)fftData; + float32_t scale = 1.0f / length; + + /* + * Do two complex elements at a time because |length| is always + * greater than or equal to 2 (order >= 1) + */ + do { + float32x4_t x = vld1q_f32(data); + + length -= 2; + x = vmulq_n_f32(x, scale); + vst1q_f32(data, x); + data += 4; + } while (length > 0); +} + +/** + * Function: omxSP_FFTInv_CToC_FC32 + * + * Description: + * These functions compute an inverse FFT for a complex signal of + * length of 2^order, where 0 <= order <= 15. Transform length is + * determined by the specification structure, which must be + * initialized prior to calling the FFT function using the appropriate + * helper, i.e., <FFTInit_C_FC32>. The relationship between the input + * and output sequences can be expressed in terms of the IDFT, i.e.: + * + * x[n] = SUM[k=0,...,N-1] X[k].e^(jnk.2.pi/N) + * n=0,1,2,...N-1 + * N=2^order. + * + * Input Arguments: + * pSrc - pointer to the complex-valued input signal, of length 2^order ; + * must be aligned on a 32-byte boundary. + * pFFTSpec - pointer to the preallocated and initialized specification + * structure + * + * Output Arguments: + * order + * pDst - pointer to the complex-valued output signal, of length 2^order; + * must be aligned on a 32-byte boundary. + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - returned if one or more of the following conditions + * is true: + * - one or more of the following pointers is NULL: pSrc, pDst, or + * pFFTSpec. + * - pSrc or pDst is not 32-byte aligned + * + */ + +OMXResult omxSP_FFTInv_CToC_FC32_Sfs(const OMX_FC32* pSrc, + OMX_FC32* pDst, + const OMXFFTSpec_C_FC32* pFFTSpec) { + ARMsFFTSpec_FC32* spec = (ARMsFFTSpec_FC32*)pFFTSpec; + int order; + long subFFTSize; + long subFFTNum; + OMX_FC32* pTwiddle; + OMX_FC32* pOut; + + /* + * Check args are not NULL and the source and destination pointers + * are properly aligned. + */ + if (!validateParametersFC32(pSrc, pDst, spec)) + return OMX_Sts_BadArgErr; + + order = fastlog2(spec->N); + + subFFTSize = 1; + subFFTNum = spec->N; + pTwiddle = spec->pTwiddle; + pOut = spec->pBuf; + + if (order > 3) { + OMX_FC32* argDst; + + /* + * Set up argDst and pOut appropriately so that pOut = pDst for + * the very last FFT stage. + */ + if ((order & 2) == 0) { + argDst = pOut; + pOut = pDst; + } else { + argDst = pDst; + } + + /* + * Odd order uses a radix 8 first stage; even order, a radix 4 + * first stage. + */ + if (order & 1) { + armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace( + pSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize); + } else { + armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace( + pSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize); + } + + /* + * Now use radix 4 stages to finish rest of the FFT + */ + if (subFFTNum >= 4) { + while (subFFTNum > 4) { + OMX_FC32* tmp; + + armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace( + argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize); + /* + * Swap argDst and pOut + */ + tmp = pOut; + pOut = argDst; + argDst = tmp; + } + + armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace( + argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize); + } + } else if (order == 3) { + armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace( + pSrc, pDst, pTwiddle, &subFFTNum, &subFFTSize); + armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace( + pDst, pOut, pTwiddle, &subFFTNum, &subFFTSize); + armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace( + pOut, pDst, pTwiddle, &subFFTNum, &subFFTSize); + } else if (order == 2) { + armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace( + pSrc, pOut, pTwiddle, &subFFTNum, &subFFTSize); + armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace( + pOut, pDst, pTwiddle, &subFFTNum, &subFFTSize); + } else { + /* Order = 1 */ + armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace( + pSrc, pDst, pTwiddle, &subFFTNum, &subFFTSize); + } + + ScaleFFTData(pDst, spec->N); + return OMX_Sts_NoErr; +} |