1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
|
// Copyright 2022 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
#include <xnnpack/assembly.h>
.syntax unified
// void xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2(
//     size_t rows,                          r0
//     const uint32_t* input,                r1
//     const uint8_t* weight_widths,         r2
//     const uint16_t* weights,              r3
//     uint64_t* output)                     sp -> r12
//
// For every row, one byte is read from weight_widths and that many 32-bit
// inputs and that many pairs of 16-bit weights are consumed. Each input is
// multiplied by both halves of its weight pair: the product with the first
// half is added to d0 and the product with the second half is added to d1
// (both 64-bit). At the end of a row d0 is stored to output, d1 becomes the
// starting value of d0 for the next row, and d1 restarts from zero.
//
// Preconditions (not checked here):
//   rows != 0             - the row counter is only tested after a row is done.
//   every width byte >= 1 - a width of 0 would be processed as a width of 1.

// d8-d15, r4-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved.

// Register usage
// input          r1  d2
// weights        r3  d3 d4 d5
// output         r12 d0 d1
// weight_widths  r2  r4

BEGIN_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2
        .arm
#ifndef __APPLE__
        .arch   armv7-a
        .fpu    neon
#endif
        LDR     r12, [sp]                   // output (read before PUSH moves sp)
        PUSH    {r4}                        // push 4 bytes
        VMOV.U8 q0, #0                      // weight_accumulator (d0) and carry-over accumulator (d1)

        // Row loop
0:
        LDRB    r4, [r2], #1                // weight_widths
        SUBS    r4, r4, #1                  // r4 = width - 1
        BLS     2f                          // less than 2 weights?

        // Main loop: 2 inputs and 2 weight pairs per iteration.
        // The flags set by SUBS are consumed by BHI/BLO below; the NEON
        // instructions in between do not modify them.
1:
        VLD1.16 {d3}, [r3]!                 // weights: 2 pairs of 16-bit values
        VLD1.32 {d2}, [r1]!                 // input: 2 x 32-bit
        SUBS    r4, r4, #2
        VMOVL.U16 q2, d3                    // widen to 32 bits: d4 = pair 0, d5 = pair 1
        VMLAL.U32 q0, d4, d2[0]             // q0 += pair 0 * input[0]
        VMLAL.U32 q0, d5, d2[1]             // q0 += pair 1 * input[1]
        BHI     1b                          // at least 2 more elements

        BLO     3f                          // is there a remainder?

        // Remainder of 1 input and 1 weight pair
2:
        VLD1.32 {d3[]}, [r3]!               // weights: 1 pair, replicated to both lanes
        VLD1.32 {d2[]}, [r1]!               // input: 1 value, replicated to both lanes
        VMOVL.U16 q2, d3                    // d4 = widened pair
        VMLAL.U32 q0, d4, d2                // q0 += pair * input

        // End of row: emit d0, hand d1 over to d0 and restart d1 from zero.
3:
        VST1.64 {d0}, [r12]!
        SUBS    r0, r0, #1
        VMOV    d0, d1
        VMOV.U8 d1, #0                      // reset, otherwise sums of earlier rows leak into later outputs
        BNE     0b                          // VMOV leaves the SUBS flags intact

        POP     {r4}
        BX      lr

END_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
|