webrtc/common_audio/signal_processing/cross_correlation_neon.S


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159

@
@ Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS.  All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@

@ cross_correlation_neon.s
@ This file contains the function WebRtcSpl_CrossCorrelationNeon(),
@ optimized for ARM Neon platform.
@
@ Reference C code is at the end of this file.
@ Output is bit-exact with the reference C code, but not with the generic
@ C code in file cross_correlation.c, due to reduction of shift operations
@ from using Neon registers.

@ Register usage:
@
@ r0: *cross_correlation (function argument)
@ r1: *seq1 (function argument)
@ r2: *seq2 (function argument)
@ r3: dim_seq (function argument); then, total iterations of LOOP_DIM_SEQ
@ r4: counter for LOOP_DIM_CROSS_CORRELATION
@ r5: seq2_ptr
@ r6: seq1_ptr
@ r7: Total iterations of LOOP_DIM_SEQ_RESIDUAL
@ r8, r9, r10, r11, r12: scratch

#include "webrtc/system_wrappers/interface/asm_defines.h"

GLOBAL_FUNCTION WebRtcSpl_CrossCorrelationNeon
.align  2
@ For each output index i in [0, dim_cross_correlation), sums the products
@ seq1[j] * seq2[i * step_seq2 + j] for j in [0, dim_seq) in 64 bits, shifts
@ the sum by right_shifts (vshl.s64 with the negated count) and stores the
@ low 32 bits of the result to cross_correlation[i].
@
@ Arguments (stack offsets are relative to sp on entry, before the push):
@   r0        cross_correlation (output pointer, advanced after each store)
@   r1        seq1
@   r2        seq2
@   r3        dim_seq
@   [sp, #0]  dim_cross_correlation (16-bit)
@   [sp, #4]  right_shifts          (16-bit)
@   [sp, #8]  step_seq2             (16-bit)
@
@ Saved and restored: r4-r11.
@ Overwritten: r0, r2, r3, r12, flags, and Neon registers d16-d29.
@ Returns immediately, without touching memory, if dim_cross_correlation <= 0.
DEFINE_FUNCTION WebRtcSpl_CrossCorrelationNeon
  push {r4-r11}               @ 8 registers: stack arguments now start at sp + 32

  @ The outer loop below tests its counter only at the bottom, so an empty
  @ request has to be rejected here; otherwise one output value would still be
  @ computed and stored.
  ldrsh r4, [sp, #32]         @ dim_cross_correlation
  cmp r4, #0
  ble EXIT_CROSS_CORRELATION

  @ Put the shift value (-right_shifts) into a Neon register.
  ldrsh r10, [sp, #36]
  rsb r10, r10, #0
  mov r8, r10, asr #31        @ High word: sign extension of the shift count.
  vmov d16, r10, r8

  @ Initialize loop counters.
  and r7, r3, #7              @ inner_loop_len2 = dim_seq % 8;
  asr r3, r3, #3              @ inner_loop_len1 = dim_seq / 8;

LOOP_DIM_CROSS_CORRELATION:
  vmov.i32 q9, #0             @ 64-bit accumulators for products 0-3 ...
  vmov.i32 q14, #0            @ ... and products 4-7 of each group of 8.
  movs r8, r3                 @ inner_loop_len1
  mov r6, r1                  @ seq1_ptr
  mov r5, r2                  @ seq2_ptr
  ble POST_LOOP_DIM_SEQ       @ Fewer than 8 samples: residual loop only.

LOOP_DIM_SEQ:
  vld1.16 {d20, d21}, [r6]!   @ seq1_ptr
  vld1.16 {d22, d23}, [r5]!   @ seq2_ptr
  subs r8, r8, #1
  vmull.s16 q12, d20, d22     @ 4 signed 16x16 -> 32-bit products
  vmull.s16 q13, d21, d23
  vpadal.s32 q9, q12          @ Add product pairs into the 64-bit lanes.
  vpadal.s32 q14, q13
  bgt LOOP_DIM_SEQ

POST_LOOP_DIM_SEQ:
  movs r10, r7                @ Loop counter
  mov r12, #0                 @ r12:r8 = 64-bit sum of the residual products
  mov r8, #0
  ble POST_LOOP_DIM_SEQ_RESIDUAL

LOOP_DIM_SEQ_RESIDUAL:
  ldrh r11, [r6], #2
  ldrh r9, [r5], #2
  smulbb r11, r11, r9         @ Signed multiply of the two bottom halfwords.
  adds r8, r8, r11
  adc r12, r12, r11, asr #31  @ Add sign extension of the product plus carry.
  subs r10, #1
  bgt LOOP_DIM_SEQ_RESIDUAL

POST_LOOP_DIM_SEQ_RESIDUAL:   @ Sum the results up and do the shift.
  vadd.i64 d18, d19
  vadd.i64 d28, d29
  vadd.i64 d18, d28
  vmov.32 d17[0], r8
  vmov.32 d17[1], r12
  vadd.i64 d17, d18
  vshl.s64 d17, d16           @ d16 holds -right_shifts.
  vst1.32 d17[0], [r0]!       @ Store the output

  @ step_seq2 is read as a signed halfword, the same way as the other two
  @ 16-bit stack arguments.
  ldrsh r8, [sp, #40]         @ step_seq2
  add r2, r8, lsl #1          @ prepare for seq2_ptr(r5) in the next loop.

  subs r4, #1
  bgt LOOP_DIM_CROSS_CORRELATION

EXIT_CROSS_CORRELATION:
  pop {r4-r11}
  bx  lr

@ TODO(kma): Place this piece of reference code into a C code file.
@ void WebRtcSpl_CrossCorrelationNeon(WebRtc_Word32* cross_correlation,
@                                     WebRtc_Word16* seq1,
@                                     WebRtc_Word16* seq2,
@                                     WebRtc_Word16 dim_seq,
@                                     WebRtc_Word16 dim_cross_correlation,
@                                     WebRtc_Word16 right_shifts,
@                                     WebRtc_Word16 step_seq2) {
@   int i = 0;
@   int j = 0;
@   int inner_loop_len1 = dim_seq >> 3;
@   int inner_loop_len2 = dim_seq - (inner_loop_len1 << 3);
@
@   assert(dim_cross_correlation > 0);
@   assert(dim_seq > 0);
@
@   for (i = 0; i < dim_cross_correlation; i++) {
@     int16_t *seq1_ptr = seq1;
@     int16_t *seq2_ptr = seq2 + (step_seq2 * i);
@     int64_t sum = 0;
@
@     for (j = inner_loop_len1; j > 0; j -= 1) {
@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
@       seq1_ptr++;
@       seq2_ptr++;
@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
@       seq1_ptr++;
@       seq2_ptr++;
@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
@       seq1_ptr++;
@       seq2_ptr++;
@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
@       seq1_ptr++;
@       seq2_ptr++;
@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
@       seq1_ptr++;
@       seq2_ptr++;
@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
@       seq1_ptr++;
@       seq2_ptr++;
@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
@       seq1_ptr++;
@       seq2_ptr++;
@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
@       seq1_ptr++;
@       seq2_ptr++;
@     }
@
@     // Calculate the rest of the samples.
@     for (j = inner_loop_len2; j > 0; j -= 1) {
@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
@       seq1_ptr++;
@       seq2_ptr++;
@     }
@
@     *cross_correlation++ = (int32_t)(sum >> right_shifts);
@   }
@ }