aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Frank Barchard <fbarchard@google.com> 2022-08-23 14:03:22 -0700
committer XNNPACK Team <xnnpack-github-robot@google.com> 2022-08-23 14:04:25 -0700
commit 317950c01f4db6676986ec1837caacef6c845e70 (patch)
tree 508b0f38c4b93a3c8e9ea9da1b5b60daec24507a
parent 0cc320718a9940dce5aba58130290c3760a840b4 (diff)
download XNNPACK-317950c01f4db6676986ec1837caacef6c845e70.tar.gz
filterbank accumulate neon use for loop instead of do/while
PiperOrigin-RevId: 469551042
-rw-r--r-- src/u32-filterbank-accumulate/gen/neon-x2.c 15
-rw-r--r-- src/u32-filterbank-accumulate/neon.c.in 17
2 files changed, 13 insertions, 19 deletions
diff --git a/src/u32-filterbank-accumulate/gen/neon-x2.c b/src/u32-filterbank-accumulate/gen/neon-x2.c
index 155516885..ad7aab503 100644
--- a/src/u32-filterbank-accumulate/gen/neon-x2.c
+++ b/src/u32-filterbank-accumulate/gen/neon-x2.c
@@ -36,16 +36,13 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x2(
size_t n = (size_t) *weight_widths++;
assert(n != 0);
- if (n >= 2) {
- do {
- const uint32x2_t vi = vld1_u32(input); input += 2;
- const uint16x4_t vw = vld1_u16(weights); weights += 4;
- const uint32x4_t vw32 = vmovl_u16(vw);
+ for (;n >= 2; n -= 2) {
+ const uint32x2_t vi = vld1_u32(input); input += 2;
+ const uint16x4_t vw = vld1_u16(weights); weights += 4;
+ const uint32x4_t vw32 = vmovl_u16(vw);
- weight_accumulator = vmlal_lane_u32(weight_accumulator, vget_low_u32(vw32), vi, 0);
- weight_accumulator = vmlal_lane_u32(weight_accumulator, vget_high_u32(vw32), vi, 1);
- n -= 2;
- } while (n >= 2);
+ weight_accumulator = vmlal_lane_u32(weight_accumulator, vget_low_u32(vw32), vi, 0);
+ weight_accumulator = vmlal_lane_u32(weight_accumulator, vget_high_u32(vw32), vi, 1);
}
if (n != 0) {
do {
diff --git a/src/u32-filterbank-accumulate/neon.c.in b/src/u32-filterbank-accumulate/neon.c.in
index b51ac5a65..4038107f8 100644
--- a/src/u32-filterbank-accumulate/neon.c.in
+++ b/src/u32-filterbank-accumulate/neon.c.in
@@ -33,16 +33,13 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x${BATCH_TILE}(
assert(n != 0);
$if BATCH_TILE == 2:
- if (n >= 2) {
- do {
- const uint32x2_t vi = vld1_u32(input); input += 2;
- const uint16x4_t vw = vld1_u16(weights); weights += 4;
- const uint32x4_t vw32 = vmovl_u16(vw);
-
- weight_accumulator = vmlal_lane_u32(weight_accumulator, vget_low_u32(vw32), vi, 0);
- weight_accumulator = vmlal_lane_u32(weight_accumulator, vget_high_u32(vw32), vi, 1);
- n -= 2;
- } while (n >= 2);
+ for (;n >= 2; n -= 2) {
+ const uint32x2_t vi = vld1_u32(input); input += 2;
+ const uint16x4_t vw = vld1_u16(weights); weights += 4;
+ const uint32x4_t vw32 = vmovl_u16(vw);
+
+ weight_accumulator = vmlal_lane_u32(weight_accumulator, vget_low_u32(vw32), vi, 0);
+ weight_accumulator = vmlal_lane_u32(weight_accumulator, vget_high_u32(vw32), vi, 1);
}
if (n != 0) {
do {