aboutsummaryrefslogtreecommitdiff
path: root/src/u32-filterbank-accumulate/gen/neon-x2.c
blob: 743244a83457c7e888db5c8310a61a04273db638 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
// Auto-generated file. Do not edit!
//   Template: src/u32-filterbank-accumulate/neon.c.in
//   Generator: tools/xngen
//
// Copyright 2022 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include <arm_neon.h>

#include <xnnpack/math.h>
#include <xnnpack/filterbank.h>


// Filterbank accumulation micro-kernel (NEON, two input samples per main-loop
// iteration).
//
// For each of `rows` rows, one byte is read from `weight_widths` giving the
// number of input samples that belong to the row. Every sample consumes one
// 32-bit input and two consecutive 16-bit entries of `weights`. The products
// (input * first entry) are summed in 64-bit lane 0 of the running accumulator
// and the products (input * second entry) in lane 1. At the end of a row,
// lane 0 is stored to `output`, lane 1 is moved into lane 0 so that it seeds
// the next row, and lane 1 is reset to zero.
//
// Arguments:
//   rows          - number of rows to process; asserted non-zero.
//   input         - 32-bit samples, consumed sequentially across all rows.
//   weight_widths - per-row sample counts; each is asserted non-zero.
//   weights       - 16-bit entries, two per input sample.
//   output        - receives one 64-bit sum per row.
void xnn_u32_filterbank_accumulate_ukernel__neon_x2(
    size_t rows,
    const uint32_t* input,
    const uint8_t* weight_widths,
    const uint16_t* weights,
    uint64_t* output) {

  assert(rows != 0);
  assert(input != NULL);
  assert(weight_widths != NULL);
  assert(weights != NULL);
  assert(output != NULL);

  // Lane 0: sum written out for the current row.
  // Lane 1: sum carried over into the next row.
  uint64x2_t vacc = vdupq_n_u64(0);

  do {
    size_t remaining = (size_t) *weight_widths;
    weight_widths += 1;
    assert(remaining != 0);

    // Main loop: two samples (and their four 16-bit weight entries) per pass.
    while (remaining >= 2) {
      const uint32x2_t vsamples = vld1_u32(input);
      input += 2;
      // Widen the four 16-bit entries to 32 bits: low half belongs to the
      // first sample, high half to the second.
      const uint32x4_t vpairs = vmovl_u16(vld1_u16(weights));
      weights += 4;

      vacc = vmlal_lane_u32(vacc, vget_low_u32(vpairs), vsamples, 0);
      vacc = vmlal_lane_u32(vacc, vget_high_u32(vpairs), vsamples, 1);

      remaining -= 2;
    }

    // Tail: at most one sample is left for this row.
    if XNN_UNPREDICTABLE(remaining != 0) {
      // Broadcast the single sample to both lanes.
      const uint32x2_t vsample = vld1_dup_u32(input);
      input += 1;
      // Fetch exactly one pair of 16-bit entries with a single 32-bit load so
      // that nothing past the pair is read.
      const uint16x4_t vpair16 = vreinterpret_u16_u32(vld1_dup_u32((const void*) weights));
      weights += 2;
      const uint32x2_t vpair = vget_low_u32(vmovl_u16(vpair16));

      vacc = vmlal_u32(vacc, vpair, vsample);
    }

    // Emit lane 0 for this row, then shift lane 1 down and clear lane 1.
    vst1_u64(output, vget_low_u64(vacc));
    output += 1;
    vacc = vcombine_u64(vget_high_u64(vacc), vdup_n_u64(0));

    rows -= 1;
  } while (rows != 0);
}