diff options
author | Marat Dukhan <maratek@google.com> | 2022-08-31 16:43:46 -0700 |
---|---|---|
committer | XNNPACK Team <xnnpack-github-robot@google.com> | 2022-08-31 16:44:44 -0700 |
commit | 7497ff1ee99c4296c3bcb1d3765fa7e9851dfe88 (patch) | |
tree | 75d61524a93611f0cfea59bb1a991e797f3e9c33 | |
parent | d04252a13ed3cef5bd204d1e327fadae5020de43 (diff) | |
download | XNNPACK-7497ff1ee99c4296c3bcb1d3765fa7e9851dfe88.tar.gz |
Minor optimization in NEON U32 FILTERBANK-ACCUMULATE
PiperOrigin-RevId: 471388090
-rw-r--r-- | src/u32-filterbank-accumulate/gen/neon-x1.c | 10 | ||||
-rw-r--r-- | src/u32-filterbank-accumulate/gen/neon-x2.c | 12 | ||||
-rw-r--r-- | src/u32-filterbank-accumulate/neon.c.in | 15 |
3 files changed, 13 insertions, 24 deletions
diff --git a/src/u32-filterbank-accumulate/gen/neon-x1.c b/src/u32-filterbank-accumulate/gen/neon-x1.c
index 1fa5ba7a1..306bc191e 100644
--- a/src/u32-filterbank-accumulate/gen/neon-x1.c
+++ b/src/u32-filterbank-accumulate/gen/neon-x1.c
@@ -30,12 +30,10 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x1(
 assert(weights != NULL); assert(output != NULL); - uint64x2_t weight_accumulator = vdupq_n_u64(0); - - // Compute unweight as initial weight size_t n = (size_t) *weight_widths++; assert(n != 0); + uint64x2_t weight_accumulator = vdupq_n_u64(0); do { const uint32x2_t vi = vld1_dup_u32(input); input += 1;
@@ -45,11 +43,10 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x1(
 weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi); } while (--n != 0); - weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0)); - do { - n = (size_t) *weight_widths++; + size_t n = (size_t) *weight_widths++; assert(n != 0); + weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0)); do { const uint32x2_t vi = vld1_dup_u32(input); input += 1;
@@ -60,7 +57,6 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x1(
 } while (--n != 0); vst1_u64(output, vget_low_u64(weight_accumulator)); output += 1; - weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0)); } while (--rows != 0); }
diff --git a/src/u32-filterbank-accumulate/gen/neon-x2.c b/src/u32-filterbank-accumulate/gen/neon-x2.c
index a88a1cdd4..b0804d592 100644
--- a/src/u32-filterbank-accumulate/gen/neon-x2.c
+++ b/src/u32-filterbank-accumulate/gen/neon-x2.c
@@ -30,12 +30,10 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x2(
 assert(weights != NULL); assert(output != NULL); - uint64x2_t weight_accumulator = vdupq_n_u64(0); - - // Compute unweight as initial weight size_t n = (size_t) *weight_widths++; assert(n != 0); + uint64x2_t weight_accumulator = vdupq_n_u64(0); do { const uint32x2_t vi = vld1_dup_u32(input); input += 1;
@@ -45,13 +43,12 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x2(
 weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi); } while (--n != 0); - weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0)); - do { - n = (size_t) *weight_widths++; + size_t n = (size_t) *weight_widths++; assert(n != 0); + weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0)); - for (;n >= 2; n -= 2) { + for (; n >= 2; n -= 2) { const uint32x2_t vi = vld1_u32(input); input += 2; const uint16x4_t vw = vld1_u16(weights); weights += 4; const uint32x4_t vw32 = vmovl_u16(vw);
@@ -69,7 +66,6 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x2(
 } vst1_u64(output, vget_low_u64(weight_accumulator)); output += 1; - weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0)); } while (--rows != 0); }
diff --git a/src/u32-filterbank-accumulate/neon.c.in b/src/u32-filterbank-accumulate/neon.c.in
index c45572f5a..c4e95eba6 100644
--- a/src/u32-filterbank-accumulate/neon.c.in
+++ b/src/u32-filterbank-accumulate/neon.c.in
@@ -3,6 +3,7 @@
 // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree.
+$assert BATCH_TILE in [1, 2] #include <assert.h> #include <stddef.h> #include <stdint.h>
@@ -26,12 +27,10 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x${BATCH_TILE}(
 assert(weights != NULL); assert(output != NULL); - uint64x2_t weight_accumulator = vdupq_n_u64(0); - - // Compute unweight as initial weight size_t n = (size_t) *weight_widths++; assert(n != 0); + uint64x2_t weight_accumulator = vdupq_n_u64(0); do { const uint32x2_t vi = vld1_dup_u32(input); input += 1;
@@ -41,14 +40,13 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x${BATCH_TILE}(
 weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi); } while (--n != 0); - weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0)); - do { size_t n = (size_t) *weight_widths++; assert(n != 0); + weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0)); - $if BATCH_TILE > 1: - for (;n >= 2; n -= 2) { + $if BATCH_TILE == 2: + for (; n >= 2; n -= 2) { const uint32x2_t vi = vld1_u32(input); input += 2; const uint16x4_t vw = vld1_u16(weights); weights += 4; const uint32x4_t vw32 = vmovl_u16(vw);
@@ -64,7 +62,7 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x${BATCH_TILE}(
 weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi); } - $else: + $elif BATCH_TILE == 1: do { const uint32x2_t vi = vld1_dup_u32(input); input += 1; const uint16x4_t vw = vreinterpret_u16_u32(vld1_dup_u32((const void*) weights)); weights += 2;
@@ -74,7 +72,6 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x${BATCH_TILE}(
 } while (--n != 0); vst1_u64(output, vget_low_u64(weight_accumulator)); output += 1; - weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0)); } while (--rows != 0); } |