summaryrefslogtreecommitdiff
path: root/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_fs_s.S
diff options
context:
space:
mode:
Diffstat (limited to 'dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_fs_s.S')
-rw-r--r--dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_fs_s.S266
1 files changed, 266 insertions, 0 deletions
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_fs_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_fs_s.S
new file mode 100644
index 0000000..624ef3e
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_fs_s.S
@@ -0,0 +1,266 @@
+//
+// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+//
+//
+// This is a modification of armSP_FFT_CToC_SC32_Radix4_fs_unsafe_s.s
+// to support float instead of SC32.
+//
+
+//
+// Description:
+// Compute a first stage Radix 4 FFT stage for a N point complex signal
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+// Import symbols required from other files
+// (For example tables)
+
+
+
+
+// Set debugging level
+//DEBUG_ON SETL {TRUE}
+
+
+
+// Guarding implementation by the processor name
+
+
+
+// Guarding implementation by the processor name
+
+//Input Registers
+
+#define pSrc x0
+#define pDst x1
+#define pTwiddle x2
+#define pSubFFTNum x3
+#define pSubFFTSize x4
+
+
+//Output Registers
+
+
+//Local Scratch Registers
+
+#define subFFTNum x5
+#define subFFTSize x6
+#define grpSize x7
+// Reuse grpSize as setCount
+#define setCount x7
+#define pointStep x8
+#define outPointStep x8
+#define setStep x9
+#define step1 x10
+#define step3 x11
+
+// Neon Registers
+
+#define dXr0 v0.2s
+#define dXi0 v1.2s
+#define dXr1 v2.2s
+#define dXi1 v3.2s
+#define dXr2 v4.2s
+#define dXi2 v5.2s
+#define dXr3 v6.2s
+#define dXi3 v7.2s
+#define dYr0 v8.2s
+#define dYi0 v9.2s
+#define dYr1 v10.2s
+#define dYi1 v11.2s
+#define dYr2 v12.2s
+#define dYi2 v13.2s
+#define dYr3 v14.2s
+#define dYi3 v15.2s
+#define dZr0 v16.2s
+#define dZi0 v17.2s
+#define dZr1 v18.2s
+#define dZi1 v19.2s
+#define dZr2 v20.2s
+#define dZi2 v21.2s
+#define dZr3 v22.2s
+#define dZi3 v23.2s
+
+
+ .MACRO FFTSTAGE scaled, inverse, name
+
+ // Define stack arguments
+
+ // Move args values into our work registers
+ ldr subFFTNum, [pSubFFTNum]
+ ldr subFFTSize, [pSubFFTSize]
+
+ // pT0+1 increments pT0 by 8 bytes
+ // pT0+pointStep = increment of 8*pointStep bytes = 2*grpSize bytes
+ // Note: outPointStep = pointStep for firststage
+
+ lsl pointStep, subFFTNum, #1
+
+ // Update pSubFFTSize and pSubFFTNum regs
+ ld2 {dXr0,dXi0}, [pSrc], pointStep // data[0]
+
+ // subFFTSize = 1 for the first stage
+ MOV subFFTSize,#4
+
+ // Note: setCount = subFFTNum/4 (reuse the grpSize reg for setCount)
+ LSR grpSize,subFFTNum,#2
+ ld2 {dXr1,dXi1}, [pSrc], pointStep // data[1]
+ MOV subFFTNum,grpSize
+
+
+ // Calculate the step of input data for the next set
+ //MOV setStep,pointStep,LSL #1
+ lsl setStep, grpSize, #4
+ ld2 {dXr2,dXi2}, [pSrc], pointStep // data[2]
+
+ // setStep = 3*pointStep
+ ADD setStep,setStep,pointStep
+ // setStep = - 3*pointStep+16
+
+ rsb setStep,setStep,#16
+ // data[3] & update pSrc for the next set
+ ld2 {dXr3,dXi3}, [pSrc], setStep
+
+ // step1 = 2*pointStep
+ lsl step1, pointStep, #1
+
+ // fadd qY0, qX0, qX2
+ fadd dYr0, dXr0, dXr2
+ fadd dYi0, dXi0, dXi2
+ // step3 = -pointStep
+ neg step3, pointStep
+
+ // grp = 0 a special case since all the twiddle factors are 1
+ // Loop on the sets : 2 sets at a time
+
+radix4fsGrpZeroSetLoop\name :
+
+
+
+ // Decrement setcount
+ SUBS setCount,setCount,#2
+
+
+ // finish first stage of 4 point FFT
+
+
+ // fsub qy2,qx0,qx2
+ fsub dYr2, dXr0, dXr2
+ fsub dYi2, dXi0, dXi2
+
+ ld2 {dXr0,dXi0}, [pSrc], step1 // data[0]
+ // fadd qy1,qx1,qx3
+ fadd dYr1, dXr1, dXr3
+ fadd dYi1, dXi1, dXi3
+ ld2 {dXr2,dXi2}, [pSrc], step3 // data[2]
+ // fsub qy3,qx1,qx3
+ fsub dYr3, dXr1, dXr3
+ fsub dYi3, dXi1, dXi3
+
+
+ // finish second stage of 4 point FFT
+
+ .ifeqs "\inverse", "TRUE"
+
+ ld2 {dXr1,dXi1}, [pSrc], step1 // data[1]
+ // fadd qz0,qy0,qy1
+ fadd dZr0, dYr0, dYr1
+ fadd dZi0, dYi0, dYi1
+
+ // data[3] & update pSrc for the next set, but not if it's the
+ // last iteration so that we don't read past the end of the
+ // input array.
+ BEQ radix4SkipLastUpdateInv\name
+ ld2 {dXr3,dXi3}, [pSrc], setStep
+
+radix4SkipLastUpdateInv\name:
+ FSUB dZr3,dYr2,dYi3
+
+ st2 {dZr0,dZi0},[pDst],outPointStep
+ FADD dZi3,dYi2,dYr3
+
+ // fsub qZ1,qY0,qY1
+ FSUB dZr1, dYr0, dYr1
+ FSUB dZi1, dYi0, dYi1
+ st2 {dZr3,dZi3},[pDst],outPointStep
+
+ FADD dZr2,dYr2,dYi3
+ st2 {dZr1,dZi1},[pDst],outPointStep
+ FSUB dZi2,dYi2,dYr3
+
+ // fadd qY0, qX0, qX2
+ FADD dYr0, dXr0, dXr2 // u0 for next iteration
+ FADD dYi0, dXi0, dXi2
+ st2 {dZr2,dZi2},[pDst],setStep
+
+
+ .else
+
+ ld2 {dXr1,dXi1}, [pSrc], step1 // data[1]
+ // fadd qZ0,qY0,qY1
+ fadd dZr0, dYr0, dYr1
+ fadd dZi0, dYi0, dYi1
+
+ // data[3] & update pSrc for the next set, but not if it's the
+ // last iteration so that we don't read past the end of the
+ // input array.
+ BEQ radix4SkipLastUpdateFwd\name
+ ld2 {dXr3,dXi3}, [pSrc], setStep
+
+radix4SkipLastUpdateFwd\name:
+ FADD dZr2,dYr2,dYi3
+
+ st2 {dZr0,dZi0},[pDst],outPointStep
+ FSUB dZi2,dYi2,dYr3
+
+ // fsub qz1,qy0,qy1
+ fsub dZr1, dYr0, dYr1
+ fsub dZi1, dYi0, dYi1
+ st2 {dZr2,dZi2},[pDst],outPointStep
+
+ FSUB dZr3,dYr2,dYi3
+ st2 {dZr1,dZi1},[pDst],outPointStep
+ FADD dZi3,dYi2,dYr3
+
+ // fadd qy0,qx0,qx2
+ fadd dYr0, dXr0, dXr2 // u0 for next iteration
+ fadd dYi0, dXi0, dXi2
+
+ st2 {dZr3,dZi3},[pDst],setStep
+
+ .endif
+
+ BGT radix4fsGrpZeroSetLoop\name
+
+ // Save subFFTNum and subFFTSize for next stage
+ str subFFTNum, [pSubFFTNum]
+ str subFFTSize, [pSubFFTSize]
+
+ .endm
+
+
+
+ M_START armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace,,d15
+ FFTSTAGE "FALSE","FALSE",fwd
+ M_END
+
+
+
+ M_START armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace,,d15
+ FFTSTAGE "FALSE","TRUE",inv
+ M_END
+
+
+ .end