@
@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS.  All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@

@ Contains the function WebRtcIsacfix_CalculateResidualEnergyNeon() of the
@ iSAC codec, optimized for the ARM Neon platform. Reference code is in
@ lpc_masking_model.c.

.arch armv7-a
.fpu neon
.global WebRtcIsacfix_CalculateResidualEnergyNeon
.align  2

@ int32_t WebRtcIsacfix_CalculateResidualEnergyNeon(int lpc_order,
@                                                   int32_t q_val_corr,
@                                                   int q_val_polynomial,
@                                                   int16_t* a_polynomial,
@                                                   int32_t* corr_coeffs,
@                                                   int* q_val_residual_energy);
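@
@ A rough C sketch of the computation (a hedged reading of the reference
@ in lpc_masking_model.c, not its verbatim code):
@
@   int64_t sum64 = 0;
@   int shift_internal = 0;  /* <= 0; tracks rescaling after overflows. */
@   for (int i = 0; i <= lpc_order; i++) {
@     int64_t sum64_tmp = 0;
@     for (int j = i; j <= lpc_order; j++) {
@       /* Product is in Q(2 * q_val_polynomial + q_val_corr). */
@       int64_t term =
@           (int64_t)(a_polynomial[j] * a_polynomial[j - i]) * corr_coeffs[i];
@       term >>= -shift_internal;
@       if (i != 0) term <<= 1;  /* Off-diagonal terms are counted twice. */
@       /* If the add would overflow, halve both addends and decrement
@          shift_internal first. */
@       sum64_tmp += term;
@     }
@     sum64 += sum64_tmp;        /* Same overflow handling. */
@   }
@   /* Normalize sum64 into 32 bits (shift_norm) and set
@      *q_val_residual_energy accordingly; return the normalized energy. */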

WebRtcIsacfix_CalculateResidualEnergyNeon:
.fnstart
.save {r4-r11}
  push {r4-r11}

  sub r13, r13, #16           @ Reserve scratch space on the stack.
  str r1, [r13, #8]           @ Save q_val_corr.
  str r2, [r13, #12]          @ Save q_val_polynomial.

  mov r4, #1
  vmov.s64 q11, #0            @ Initialize shift_internal.
  vmov.s64 q13, #0            @ Initialize sum64.
  vmov.s64 q10, #0
  vmov.u8 d20[0], r4          @ Set q10 to 1.
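  @ q10 holds the constant 1 so shift_internal (q11) can be decremented
  @ with a single vsub.s64 in the overflow paths below.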

  cmp r0, #0                  @ lpc_order < 0?
  blt POST_LOOP_I

  add r9, r3, r0, asl #1      @ &a_polynomial[lpc_order]
  mov r6, #0                  @ Loop counter i.
  ldr r11, [r13, #48]         @ corr_coeffs (5th argument, on the stack).
  sub r10, r0, #1             @ lpc_order - 1.
  mov r7, r3                  @ &a_polynomial[0]
  str r9, [r13, #4]           @ Save &a_polynomial[lpc_order].

LOOP_I:
  ldr r2, [r11], #4           @ corr_coeffs[i]
  vmov.s64 q15, #0            @ Initialize the sum64_tmp.
  vdup.s32 d25, r2

  cmp r0, r6                  @ Compare lpc_order to i.
  movle r2, r6                @ i == lpc_order: flag the single remaining sample.
  ble POST_LOOP_J

  mov r1, r6                  @ j = i;
  mov r12, r7                 @ &a_polynomial[i]
  mov r4, r3                  @ &a_polynomial[j - i]

LOOP_J:
  ldr r8, [r12], #4           @ Load a_polynomial[j], a_polynomial[j+1].
  ldr r5, [r4], #4            @ Load a_polynomial[j-i], a_polynomial[j-i+1].
  vmov.u32 d0[0], r8
  vmov.u32 d1[0], r5
  vmull.s16 q0, d0, d1        @ Two products a_polynomial[j] * a_polynomial[j-i].
  vmull.s32 q0, d0, d25       @ Multiply both products by corr_coeffs[i].
  cmp r6, #0                  @ i == 0?
  vshl.s64 q0, q11            @ Apply shift_internal.
  beq SUM1
  vshl.s64 q0, #1             @ Double the off-diagonal terms (i != 0).
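
  @ On overflow of the saturating add below, both addends are halved, the
  @ sum is redone with a plain vadd, and shift_internal is decremented so
  @ the final Q value compensates for the rescaling.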

SUM1:
  vqadd.s64 q14, q0, q15      @ Sum and test overflow.
  add r1, r1, #2              @ j += 2; two terms per iteration.
  bvc MOV1                    @ Skip the shift if there's no overflow.
  vshr.s64 q0, #1
  vshr.s64 q15, #1
  vadd.s64 q14, q0, q15
  vsub.s64 q11, q10

MOV1:
  cmp r0, r1                  @ Compare lpc_order to j.
  vmov.s64 q15, q14
  bgt LOOP_J

  bic r1, r10, #1             @ (lpc_order - 1 - i) & ~1.
  add r2, r6, #2
  add r2, r1, r2              @ First index not covered by the pairwise loop.

POST_LOOP_J:
  vqadd.s64 q0, q13, q15      @ Sum and test overflow.
  bvc MOV2                    @ Skip the shift if there's no overflow.
  vshr.s64 q13, #1
  vshr.s64 q15, #1
  vadd.s64 q0, q13, q15
  vsub.s64 q11, q10

MOV2:
  vmov.s64 q13, q0            @ Update sum64.
  cmp r2, r0                  @ Does one sample remain (r2 == lpc_order)?
  bne CHECK_LOOP_CONDITION

  @ Last sample in the inner loop.
  ldr r4, [r13, #4]           @ &a_polynomial[lpc_order]
  ldrsh r8, [r4]              @ a_polynomial[lpc_order]
  ldrsh r12, [r9]             @ a_polynomial[lpc_order - i]
  mul r8, r8, r12
  vmov.s32 d0[0], r8
  vmull.s32 q0, d0, d25       @ Multiply by corr_coeffs[i].
  cmp r6, #0                  @ i == 0?
  vshl.s64 q0, q11            @ Apply shift_internal.
  beq SUM2
  vshl.s64 q0, #1             @ Double the off-diagonal term (i != 0).

SUM2:
  vqadd.s64 d1, d0, d26       @ Sum and test overflow.
  bvc MOV3                    @ Skip the shift if there's no overflow.
  vshr.s64 q13, #1
  vshr.s64 d0, #1
  vadd.s64 d1, d0, d26
  vsub.s64 q11, q10

MOV3:
  vmov.s64 d26, d1            @ Update sum64.

CHECK_LOOP_CONDITION:
  add r6, r6, #1              @ i++.
  sub r9, r9, #2              @ r9 = &a_polynomial[lpc_order - i].
  cmp r0, r6                  @ Compare lpc_order to i; loop while i <= lpc_order.
  sub r10, r10, #1            @ r10 = lpc_order - 1 - i.
  add r7, r7, #2              @ r7 = &a_polynomial[i].
  bge LOOP_I

POST_LOOP_I:
  mov r3, #0
  vqadd.s64 d0, d26, d27      @ Sum and test overflow.
  bvc GET_SHIFT_NORM          @ Skip the shift if there's no overflow.
  vshr.s64 q13, #1
  vadd.s64 d0, d26, d27
  vsub.s64 q11, q10
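
  @ Normalize the 64-bit sum into its upper 32 bits in two passes: each
  @ pass counts the redundant sign bits of the high word (vcls only sees
  @ 32 bits) and shifts the whole value left by that amount. The two
  @ counts (r2 and r1) make up shift_norm.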

GET_SHIFT_NORM:
  vcls.s32 d1, d0             @ Count leading extra sign bits.
  vmov.32 r2, d1[1]           @ Store # of sign bits of only the 32 MSBs.
  vmovl.s32 q1, d1
  vshl.s64 d0, d3             @ d3 contains # of sign bits of the 32 MSBs.

  vcls.s32 d1, d0             @ Count again the leading extra sign bits.
  vmov.s32 r1, d1[1]          @ Store # of sign bits of only the 32 MSBs.
  vmovl.s32 q1, d1
  vshl.s64 d0, d3             @ d3 contains # of sign bits of the 32 MSBs.

  vmov.s32 r0, d0[1]          @ residual_energy
  vmov.s32 r3, d22[0]         @ shift_internal

  @ Calculate the value for q_val_residual_energy.
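  @ *q_val_residual_energy =
  @     q_val_corr - 32 + 2 * q_val_polynomial + shift_norm + shift_internal.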
  ldr r4, [r13, #8]            @ q_val_corr
  ldr r5, [r13, #12]           @ q_val_polynomial
  sub r12, r4, #32
  add r12, r12, r5, asl #1
  add r1, r12, r1              @ Add the 2nd part of shift_norm.
  add r12, r1, r2              @ Add the 1st part of shift_norm.
  ldr r2, [r13, #52]           @ q_val_residual_energy (6th argument).
  add r3, r12, r3              @ value for q_val_residual_energy.
  str r3, [r2, #0]

  add r13, r13, #16
  pop {r4-r11}
  bx  r14

.fnend