@ modules/audio_coding/codecs/isac/fix/source/filters_neon.S

@
@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS.  All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@
@ Reference code in filters.c. Output is bit-exact.

#include "webrtc/system_wrappers/interface/asm_defines.h"

GLOBAL_FUNCTION WebRtcIsacfix_AutocorrNeon
.align  2

@ int WebRtcIsacfix_AutocorrNeon(
@     int32_t* __restrict r,
@     const int16_t* __restrict x,
@     int16_t N,
@     int16_t order,
@     int16_t* __restrict scale);

@ WebRtcIsacfix_AutocorrNeon
@
@ Computes r[0..order], the autocorrelation of x[0..N-1], with every value
@ shifted right by a common 'scale' chosen so that r[0] fits in a positive
@ 32-bit word.
@
@ In:    r0      = r      (receives order + 1 int32_t values)
@        r1      = x      (int16_t samples)
@        r2      = N      (number of samples)
@        r3      = order
@        [sp]    = scale  (int16_t*, fifth argument, passed on the stack)
@ Out:   r0      = order + 1
@        *scale  = number of bits every r[i] was shifted right
@ Uses:  r4 - r12 (saved and restored here), NEON d0-d3, d5, d16-d27, d29-d31.
@
@ Preconditions visible from the code below:
@  - The r[0] loop consumes four samples per pass and always runs at least
@    once, so N has to be a positive multiple of 4; otherwise extra samples
@    past x[N-1] are read and included in r[0].
@  - The lag loops (i = 1..order) handle any N - i, including N - i < 8 and
@    N - i <= 0 (the latter simply produces r[i] = 0).
DEFINE_FUNCTION WebRtcIsacfix_AutocorrNeon
  push       {r3 - r12}            @ 10 words = 40 bytes; see the load of &scale.

  @ Constant initializations
  mov        r4, #33
  vmov.i32   d0, #0
  vmov.i32   q8, #0                @ 64-bit accumulators for r[0].
  vmov.i32   d29, #0               @ Initialize (-scale).
  vmov.u8    d30, #255             @ Initialize d30 as -1.
  vmov.i32   d0[0], r4             @ d0: 00000033 (low), 00000000 (high)
  vmov.i32   d25, #32

  mov        r5, r1                @ x
  mov        r6, r2                @ N

@ Generate the first coefficient r0: sum of x[j] * x[j], four samples per pass.
LOOP_R0:
  vld1.16    {d18}, [r5]!          @ x[]
  subs       r6, r6, #4
  vmull.s16  q9, d18, d18          @ Four 32-bit squares.
  vpadal.s32 q8, q9                @ Pairwise add into two 64-bit sums.
  bgt        LOOP_R0

  vadd.i64   d16, d16, d17         @ d16 = full 64-bit sum of squares.

  @ Calculate scaling (the value of shifting).
  vmov       d17, d16              @ d17 = candidate r[0] (no scaling case).

  @ Check overflow and determine the value for 'scale'.
  @ vclz cannot deal with a 64-bit, so we have to do vclz on both the upper and
  @ lower 32-bit words. Note that we do not care about the value of the upper
  @ word in d17.

  @ Check the case of 1 bit overflow. If it occurs store the results for
  @ scale and r[0] in d17 and d29.

  vshr.u64   d3, d16, #1
  vclt.s32   d1, d16, #0           @ < 0 ? (per 32-bit word; low word: bit 31 set)
  vbit       d17, d3, d1           @ For r[0]
  vbit       d29, d30, d1          @ -scale = -1

  @ For the case of more than 1 bit overflow. If it occurs overwrite the
  @ results for scale and r[0] in d17 and d29.
  vclz.s32   d5, d16               @ Leading zeros of the two 32 bit words.
  vshr.s64   d26, d5, #32          @ Keep only the upper 32 bits.
  vsub.i64   d31, d26, d0          @ zeros - 33
  vshl.i64   d27, d26, #32
  vorr       d27, d26              @ Duplicate the high word with its low one.
  vshl.u64   d2, d16, d31          @ Shift by (-scale).
  vclt.s32   d1, d27, d25          @ < 32 ? (i.e. upper word of the sum is non-zero)
  vbit       d17, d2, d1           @ For r[0]
  vbit       d29, d31, d1          @ -scale

  vst1.32    d17[0], [r0]!         @ r[0]
  mov        r5, #1                @ outer loop counter

@ Generate rest of the coefficients.
@ From here on q12/q13 (d24-d27) are reused as scratch; only d29 (-scale)
@ has to stay live across iterations.
LOOP_R:
  vmov.i32   q8, #0                @ Initialize the accumulation result.
  vmov.i32   q9, #0                @ Initialize the accumulation result.
  mov        r7, r1                @ &x[0]
  add        r6, r7, r5, lsl #1    @ x[i]
  sub        r12, r2, r5           @ N - i

  @ The 8-sample loop below always executes its body at least once, so it must
  @ be bypassed when fewer than 8 products remain. The compare is signed, which
  @ also covers N - i <= 0. r12 then already holds the leftover count.
  cmp        r12, #8
  blt        TAIL_SAMPLES

  lsr        r8, r12, #3           @ inner loop counter (>= 1 here)
  sub        r12, r8, lsl #3       @ Leftover samples to be processed

LOOP_8X_SAMPLES:                   @ Multiple of 8 samples
  vld1.16    {d20, d21}, [r7]!     @ x[0, ...]
  vld1.16    {d22, d23}, [r6]!     @ x[i, ...]
  vmull.s16  q12, d20, d22
  vmull.s16  q13, d21, d23
  subs       r8, #1
  vpadal.s32 q8, q12
  vpadal.s32 q9, q13
  bgt        LOOP_8X_SAMPLES

TAIL_SAMPLES:                      @ 0..7 products left in r12 (or negative).
  cmp r12, #4
  blt REST_SAMPLES

Four_SAMPLES:                      @ One group of four products.
  vld1.16    d20, [r7]!
  vld1.16    d22, [r6]!
  vmull.s16  q12, d20, d22
  vpadal.s32 q8, q12
  sub r12, #4

REST_SAMPLES:                      @ Up to three products, done in core registers.
  mov        r8, #0                @ Initialize lower word of the accumulation.
  mov        r4, #0                @ Initialize upper word of the accumulation.
  cmp r12, #0
  ble SUMUP

LOOP_REST_SAMPLES:
  ldrh       r9, [r7], #2          @ x[0, ...]
  ldrh       r10, [r6], #2         @ x[i, ...]
  smulbb     r11, r9, r10          @ Signed 16 x 16 -> 32 of the low halfwords.
  adds       r8, r8, r11           @ lower word of the accumulation.
  adc        r4, r4, r11, asr #31  @ upper word: sign extension of r11 plus carry.
  subs       r12, #1
  bgt        LOOP_REST_SAMPLES

@ Add the multiplication results together and do a shift.
SUMUP:
  vadd.i64   d16, d17
  vadd.i64   d18, d19
  vadd.i64   d18, d16
  vmov       d17, r8, r4           @ 64-bit scalar partial sum (low, high).
  vadd.i64   d18, d17
  vshl.s64   d18, d29              @ Shift left by (-scale).
  vst1.32    d18[0], [r0]!         @ r[i]

  add        r5, #1
  cmp        r5, r3
  ble        LOOP_R                @ Lags 1..order inclusive.

  vneg.s32   d29, d29              @ Get value for 'scale'.
  ldr        r2, [sp, #40]         @ &scale: stack argument, above the 40 bytes pushed.
  add        r0, r3, #1            @ return (order + 1)
  vst1.s16   d29[0], [r2]          @ Store 'scale'

  pop        {r3 - r12}
  bx         lr