diff options
author | Frank Barchard <fbarchard@google.com> | 2022-08-23 14:03:22 -0700 |
---|---|---|
committer | XNNPACK Team <xnnpack-github-robot@google.com> | 2022-08-23 14:04:25 -0700 |
commit | 317950c01f4db6676986ec1837caacef6c845e70 (patch) | |
tree | 508b0f38c4b93a3c8e9ea9da1b5b60daec24507a | |
parent | 0cc320718a9940dce5aba58130290c3760a840b4 (diff) | |
download | XNNPACK-317950c01f4db6676986ec1837caacef6c845e70.tar.gz |
Filterbank accumulate NEON: use a for loop instead of do/while
PiperOrigin-RevId: 469551042
-rw-r--r-- | src/u32-filterbank-accumulate/gen/neon-x2.c | 15 | ||||
-rw-r--r-- | src/u32-filterbank-accumulate/neon.c.in | 17 |
2 files changed, 13 insertions, 19 deletions
diff --git a/src/u32-filterbank-accumulate/gen/neon-x2.c b/src/u32-filterbank-accumulate/gen/neon-x2.c
index 155516885..ad7aab503 100644
--- a/src/u32-filterbank-accumulate/gen/neon-x2.c
+++ b/src/u32-filterbank-accumulate/gen/neon-x2.c
@@ -36,16 +36,13 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x2(
     size_t n = (size_t) *weight_widths++;
     assert(n != 0);
 
-    if (n >= 2) {
-      do {
-        const uint32x2_t vi = vld1_u32(input); input += 2;
-        const uint16x4_t vw = vld1_u16(weights); weights += 4;
-        const uint32x4_t vw32 = vmovl_u16(vw);
+    for (;n >= 2; n -= 2) {
+      const uint32x2_t vi = vld1_u32(input); input += 2;
+      const uint16x4_t vw = vld1_u16(weights); weights += 4;
+      const uint32x4_t vw32 = vmovl_u16(vw);
 
-        weight_accumulator = vmlal_lane_u32(weight_accumulator, vget_low_u32(vw32), vi, 0);
-        weight_accumulator = vmlal_lane_u32(weight_accumulator, vget_high_u32(vw32), vi, 1);
-        n -= 2;
-      } while (n >= 2);
+      weight_accumulator = vmlal_lane_u32(weight_accumulator, vget_low_u32(vw32), vi, 0);
+      weight_accumulator = vmlal_lane_u32(weight_accumulator, vget_high_u32(vw32), vi, 1);
     }
     if (n != 0) {
       do {
diff --git a/src/u32-filterbank-accumulate/neon.c.in b/src/u32-filterbank-accumulate/neon.c.in
index b51ac5a65..4038107f8 100644
--- a/src/u32-filterbank-accumulate/neon.c.in
+++ b/src/u32-filterbank-accumulate/neon.c.in
@@ -33,16 +33,13 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x${BATCH_TILE}(
     assert(n != 0);
 
     $if BATCH_TILE == 2:
-      if (n >= 2) {
-        do {
-          const uint32x2_t vi = vld1_u32(input); input += 2;
-          const uint16x4_t vw = vld1_u16(weights); weights += 4;
-          const uint32x4_t vw32 = vmovl_u16(vw);
-
-          weight_accumulator = vmlal_lane_u32(weight_accumulator, vget_low_u32(vw32), vi, 0);
-          weight_accumulator = vmlal_lane_u32(weight_accumulator, vget_high_u32(vw32), vi, 1);
-          n -= 2;
-        } while (n >= 2);
+      for (;n >= 2; n -= 2) {
+        const uint32x2_t vi = vld1_u32(input); input += 2;
+        const uint16x4_t vw = vld1_u16(weights); weights += 4;
+        const uint32x4_t vw32 = vmovl_u16(vw);
+
+        weight_accumulator = vmlal_lane_u32(weight_accumulator, vget_low_u32(vw32), vi, 0);
+        weight_accumulator = vmlal_lane_u32(weight_accumulator, vget_high_u32(vw32), vi, 1);
       }
       if (n != 0) {
         do {