// Source path: src/u32-filterbank-accumulate/aarch32-neon-x2.S
// Blob: 840d7c9f3b59feaaa74a135562b2522e30c623c9
// Copyright 2022 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

.syntax unified

// void xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2(
//     size_t rows,                          r0
//     const uint32_t* input,                r1
//     const uint8_t* weight_widths,         r2
//     const uint16_t* weights,              r3
//     uint64_t* output)                     sp -> r12

// d8-d15, r4-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved.

// Register usage
// input   r1  d2
// weights r3  d3 d4 d5
// output  r12 d0 d1

// weight_widths r2 r4

// Row-by-row filterbank accumulation, two inputs per main-loop iteration.
//
// For each row, one byte n is read from weight_widths; the next n 32-bit
// input words are consumed together with n pairs of 16-bit weights (32 bits
// of weights per input word).  Each 16-bit weight is widened to 32 bits and
// multiplied by the input word; lane 0 of every pair is accumulated as a
// 64-bit sum into d0 (weight_accumulator) and lane 1 into d1
// (unweight_accumulator).  At the end of the row d0 is stored to output and
// d1 becomes the starting d0 of the next row.  The d1 left over after the
// last row is never stored.
//
// Edge cases visible in the code below:
//   - rows == 0 is not handled: the row counter is decremented and tested
//     only at the bottom of the loop, so a zero count wraps around.
//   - a width byte of 0 behaves like a width of 1, because the
//     single-element path at label 2 is taken.
//
// Only r4 and lr are saved and restored; the NEON registers used are d0-d5.
BEGIN_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2
        .arm
#ifndef __APPLE__
        .arch   armv7-a
        .fpu    neon
#endif
        // The fifth argument is read from the stack before PUSH moves sp.
        LDR     r12, [sp]               // r12 = output
        PUSH    {r4,lr}                 // push 8 bytes: r4 is the width counter, lr is popped into pc on return
        VMOV.U8 d0, #0                  // weight_accumulator = 0 (first row starts from zero)

// Row loop: one iteration per output element.
0:
        LDRB    r4, [r2], #1            // r4 = *weight_widths++ (inputs in this row)
        SUBS    r4, r4, #1              // r4 = width - 1; flags are consumed by BLS below
        VMOV.U8 d1, #0                  // unweight_accumulator = 0 (NEON op, leaves the flags untouched)
        BLS     2f                      // width <= 1: skip the pairwise loop

// Pairwise loop.  On entry r4 = (inputs remaining in this row) - 1, with at
// least two inputs remaining.
1:
        VLD1.16 {d3}, [r3]!             // 4 x u16 weights: the pairs for two inputs
        VLD1.32 {d2}, [r1]!             // 2 x u32 inputs
        SUBS    r4, r4, #2              // account for the two inputs; flags are consumed by BHI/BLO below
        VMOVL.U16 q2, d3                // widen to u32: d4 = pair for input 0, d5 = pair for input 1
        VMLAL.U32 q0, d4, d2[0]         // d0 += d4[0] * input0, d1 += d4[1] * input0
        VMLAL.U32 q0, d5, d2[1]         // d0 += d5[0] * input1, d1 += d5[1] * input1
        BHI     1b                      // at least two more inputs remain in this row

        BLO     3f                      // nothing left in this row; fall through when exactly one input remains

// Single-element path: handles the odd last input of a row, or a row of
// width <= 1.
2:
        VLD1.32 {d3[]}, [r3]!           // one 32-bit weight pair, replicated to both halves of d3
        VLD1.32 {d2[]}, [r1]!           // one input word, replicated to both lanes of d2
        VMOVL.U16 q2, d3                // widen to u32: d4 = the weight pair (d5 is unused here)
        VMLAL.U32 q0, d4, d2            // d0 += d4[0] * input, d1 += d4[1] * input

// End of row.
3:
        VST1.64 {d0}, [r12]!            // *output++ = weight_accumulator
        SUBS    r0, r0, #1              // rows--
        VMOV    d0, d1                  // the unweight sum of this row seeds the next row (NEON op, flags kept)
        BNE     0b                      // loop until the row counter reaches zero

        POP     {r4,pc}                 // restore r4 and return

END_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif