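//
// ixheaacd_cos_sin_mod_loop2 (AArch64/NEON)
//
// Second loop of the cos/sin modulation: rotates subband samples by
// interleaved 16-bit cos/sin twiddle factors, walking one pointer pair
// forward from the start of the buffer and one backward from its end.
//
// Assumed C prototype, inferred from the inline comments below (not
// stated in this file):
//
//   void ixheaacd_cos_sin_mod_loop2(WORD32 *subband,         /* x0 */
//                                   const WORD16 *p_twiddle, /* x1 */
//                                   WORD32 M);               /* x2 */
//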
// Spill q8-q15 and x8-x21, x29/x30. This is stricter than AAPCS64 requires
// (only d8-d15 and x19-x28 are callee-saved), but keeps the routine safe
// regardless of what the surrounding code expects.
.macro push_v_regs
stp q8, q9, [sp, #-32]!
stp q10, q11, [sp, #-32]!
stp q12, q13, [sp, #-32]!
stp q14, q15, [sp, #-32]!
stp x8, x9, [sp, #-16]!
stp x10, x11, [sp, #-16]!
stp x12, x13, [sp, #-16]!
stp x14, x15, [sp, #-16]!
stp x16, x17, [sp, #-16]!
stp x18, x19, [sp, #-16]!
stp x20, x21, [sp, #-16]!
stp x29, x30, [sp, #-16]!
.endm
// Restore in the reverse order of push_v_regs.
.macro pop_v_regs
ldp x29, x30, [sp], #16
ldp x20, x21, [sp], #16
ldp x18, x19, [sp], #16
ldp x16, x17, [sp], #16
ldp x14, x15, [sp], #16
ldp x12, x13, [sp], #16
ldp x10, x11, [sp], #16
ldp x8, x9, [sp], #16
ldp q14, q15, [sp], #32
ldp q12, q13, [sp], #32
ldp q10, q11, [sp], #32
ldp q8, q9, [sp], #32
.endm
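// Register roles throughout the routine (pointer names follow the inline
// comments; psubband2 is inferred, not stated in this file):
//   x0  = psubband   (forward),   x3  = psubband1  (backward)
//   x10 = psubband2  (forward),   x11 = psubband12 (backward)
//   x4  = loop counter (M/2 - 1), x8  = -4 byte stride for backward stores
//   v0/v1 = current twiddle pair broadcast to both lanes (the table is
//           assumed to interleave the two 16-bit factors per entry)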
.text
.p2align 2
.global ixheaacd_cos_sin_mod_loop2
ixheaacd_cos_sin_mod_loop2:
// STMFD sp!, {x4-x12, x14}
push_v_regs
//stp x19, x20,[sp,#-16]!
//VPUSH {D8-D15}
//generating load addresses
ADD x3, x0, x2, LSL #3 //psubband1 = &subband[2 * M - 1];
SUB x3, x3, #4
ADD x10, x0, #256 //psubband2 = &subband[64];
ADD x11, x10, x2, LSL #3 //psubband12 = &psubband2[2 * M - 1];
SUB x11, x11, #4
MOV x8, #-4 //byte stride for the backward (post-decrement) stores
MOVI v0.4s, #0 //clear v0/v1; the LD2 below fills only lane 0
MOVI v1.4s, #0
LDR w6, [x0]
sxtw x6, w6
ASR x4, x2, #1 //M_2 = ixheaacd_shr32(M, 1);
SUB x4, x4, #1
ASR x6, x6, #1 //*psubband = *psubband >> 1;
LD1 {v2.s}[0], [x3] //re = *psubband1; (value before the store below)
STR w6, [x0], #4 //psubband++;
LDR w7, [x0]
sxtw x7, w7
ASR x7, x7, #1
NEG x6, x7 //-(*psubband >> 1)
STR w6, [x3], #-4 //*psubband1-- = -(*psubband >> 1);
LD1 {v3.s}[0], [x3] // im = *psubband1;
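// Load one interleaved twiddle pair: LD2 de-interleaves the two 16-bit
// halfwords into lane 0 of v0 and v1, SXTL widens them to 32 bits, and
// DUP broadcasts each factor across both lanes for the paired multiplies.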
LD2 {v0.h, v1.h}[0], [x1], #4
sxtl v0.4s, v0.4h
sxtl v1.4s, v1.4h
dup v0.2s, v0.s[0]
dup v1.2s, v1.s[0]
LD1 {v2.s}[1], [x11] //re = *psubband12;
// LDR w6, [x10]
// sxtw x6,w6
// ASR x7, x6, #1
// MOV x9, #0
// QSUB x7, x9, x7
LD1 {v4.s}[0], [x10]
SSHR v4.2s, v4.2s, #1 //t = *psubband2 >> 1;
MOVI v6.2s, #0
SQSUB v4.2s, v6.2s, v4.2s //saturating negate: 0 - t
ST1 {v4.s}[0], [x11] //*psubband12 = -t;
// str X7, [X11]
SUB x11, x11, #4
// sxtw x7,w7
LDR w6, [x10, #4]
sxtw x6, w6
ASR x6, x6, #1
STR w6, [x10], #4 //*psubband2 = psubband2[1] >> 1; psubband2++;
LD1 {v3.s}[1], [x11] //im = *psubband12;
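// Fixed-point complex rotation: each SMULL forms a 32x32->64 product of a
// sample with a broadcast twiddle factor, SSHR #16 rescales it, and the
// ADD/SQSUB combinations below assemble the rotated real/imaginary parts
// (SQSUB saturates the differences).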
SMULL v4.2d, v0.2s, v2.2s //tw0 * re
SSHR v4.2d, v4.2d, #16
SMULL v6.2d, v0.2s, v3.2s //tw0 * im
SSHR v6.2d, v6.2d, #16
SMULL v8.2d, v1.2s, v2.2s //tw1 * re
SSHR v8.2d, v8.2d, #16
SMULL v10.2d, v1.2s, v3.2s //tw1 * im
SSHR v10.2d, v10.2d, #16
ADD v12.2d, v8.2d, v6.2d //tw1*re + tw0*im
SQSUB v14.2d, v10.2d, v4.2d //tw1*im - tw0*re
SQSUB v16.2d, v4.2d, v10.2d //tw0*re - tw1*im
//shrn v12.2s, v12.2d,#32
//shrn v14.2s, v14.2d,#32
//shrn v16.2s, v16.2d,#32
ST1 {v12.s}[0], [x3], x8 //*psubband1-- (x8 = -4)
ST1 {v14.s}[0], [x0], #4 //*psubband++
SQNEG v12.4s, v12.4s //negate lane 2 for the mirrored buffer
ST1 {v12.s}[2], [x10], #4 //*psubband2++
ST1 {v16.s}[2], [x11], x8 //*psubband12--
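// Main loop: runs M/2 - 1 times. Each iteration rotates two sample pairs,
// one gathered from the forward pointers (psubband/psubband2) and one from
// the backward pointers (psubband1/psubband12), consuming two twiddle
// pairs per pass.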
LOOP1:
LD1 {v2.2s}, [x0]
LD1 {v3.2s}, [x10]
LDR w5, [x3] //RE2
sxtw x5, w5
LDR w6, [x11] //RE3
sxtw x6, w6
//VTRN.32 D2, D3
TRN1 v4.2s, v2.2s, v3.2s //v4 = {psubband[0], psubband2[0]} (re)
TRN2 v3.2s, v2.2s, v3.2s //v3 = {psubband[1], psubband2[1]} (im)
MOV v2.8b, v4.8b
SMULL v4.2d, v0.2s, v2.2s //tw0 * re
SSHR v4.2d, v4.2d, #16
SMULL v6.2d, v0.2s, v3.2s //tw0 * im
SSHR v6.2d, v6.2d, #16
SMULL v8.2d, v1.2s, v2.2s //tw1 * re
SSHR v8.2d, v8.2d, #16
SMULL v10.2d, v1.2s, v3.2s //tw1 * im
SSHR v10.2d, v10.2d, #16
ADD v12.2d, v8.2d, v6.2d //tw1*re + tw0*im
SQSUB v14.2d, v4.2d, v10.2d //tw0*re - tw1*im
SQSUB v16.2d, v10.2d, v4.2d //tw1*im - tw0*re
//shrn v12.2s, v12.2d,#32
//shrn v14.2s, v14.2d,#32
//shrn v16.2s, v16.2d,#32
ST1 {v12.s}[0], [x0], #4 //*psubband++
ST1 {v14.s}[0], [x3], x8 //*psubband1--
SQNEG v12.4s, v12.4s //negate lane 2 for the mirrored buffer
ST1 {v12.s}[2], [x11], x8 //*psubband12--
ST1 {v16.s}[2], [x10], #4 //*psubband2++
MOVI v0.4s, #0 //clear v0/v1 again before the next LD2
MOVI v1.4s, #0
// second part: rotate the samples read from the backward pointers with a
// fresh twiddle pair
LD2 {v0.h, v1.h}[0], [x1], #4
sxtl v0.4s, v0.4h
sxtl v1.4s, v1.4h
dup v0.2s, v0.s[0]
dup v1.2s, v1.s[0]
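// w5/w6, captured at the top of the loop before the pointers moved, still
// hold the samples read from psubband1 and psubband12; pack them into v3
// as the companion inputs for this half.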
mov v3.s[0], w5 //sample saved from *psubband1
mov v3.s[1], w6 //sample saved from *psubband12
LD1 {v2.s}[0], [x3] //a = *psubband1;
LD1 {v2.s}[1], [x11] //a = *psubband12;
SMULL v4.2d, v0.2s, v2.2s //tw0 * a (freshly loaded sample)
SSHR v4.2d, v4.2d, #16
SMULL v6.2d, v0.2s, v3.2s //tw0 * b (sample saved at loop top)
SSHR v6.2d, v6.2d, #16
SMULL v8.2d, v1.2s, v2.2s //tw1 * a
SSHR v8.2d, v8.2d, #16
SMULL v10.2d, v1.2s, v3.2s //tw1 * b
SSHR v10.2d, v10.2d, #16
ADD v12.2d, v4.2d, v10.2d //tw0*a + tw1*b
SQSUB v14.2d, v8.2d, v6.2d //tw1*a - tw0*b
SQSUB v16.2d, v6.2d, v8.2d //tw0*b - tw1*a
//shrn v12.2s, v12.2d,#32
//shrn v14.2s, v14.2d,#32
//shrn v16.2s, v16.2d,#32
ST1 {v12.s}[0], [x3], x8 //*psubband1--
ST1 {v14.s}[0], [x0], #4 //*psubband++
SQNEG v12.4s, v12.4s //negate lane 2 for the mirrored buffer
subs x4, x4, #1 //decrement loop counter (scheduled between the stores)
ST1 {v12.s}[2], [x10], #4 //*psubband2++
ST1 {v16.s}[2], [x11], x8 //*psubband12--
BGT LOOP1
//VPOP {D8-D15}
// LDMFD sp!, {x4-x12, x15}
//ldp x19, x20,[sp],#16
pop_v_regs
ret
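// For reference, a hedged C sketch of the core rotation above (variable
// names and the saturation helper are illustrative, not from this file):
//
//   WORD64 p0 = ((WORD64)tw0 * a) >> 16;   /* SMULL + SSHR #16 */
//   WORD64 p1 = ((WORD64)tw0 * b) >> 16;
//   WORD64 p2 = ((WORD64)tw1 * a) >> 16;
//   WORD64 p3 = ((WORD64)tw1 * b) >> 16;
//   out0 = (WORD32)(p2 + p1);              /* ADD   */
//   out1 = (WORD32)sat64(p0 - p3);         /* SQSUB */
//   out2 = (WORD32)sat64(p3 - p0);         /* SQSUB */
//   /* one of the sums is additionally negated with SQNEG before the
//      store to the mirrored buffer */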