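Minor optimization in NEON U32 FILTERBANK-ACCUMULATE

Shift the high accumulator lane into the low lane once, at the top of the
per-row loop, instead of once before the loop and again after every store.
This removes the unused shift after the last row is stored. The generated
x1/x2 kernels now declare `n` inside the row loop, matching the template,
and the template asserts that BATCH_TILE is 1 or 2 and selects each variant
with an explicit comparison.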

PiperOrigin-RevId: 471388090
author: Marat Dukhan <maratek@google.com> 2022-08-31 16:43:46 -0700
committer: XNNPACK Team <xnnpack-github-robot@google.com> 2022-08-31 16:44:44 -0700
commit: 7497ff1ee99c4296c3bcb1d3765fa7e9851dfe88 (patch)
tree: 75d61524a93611f0cfea59bb1a991e797f3e9c33
parent: d04252a13ed3cef5bd204d1e327fadae5020de43 (diff)
download: XNNPACK-7497ff1ee99c4296c3bcb1d3765fa7e9851dfe88.tar.gz
3 files changed, 13 insertions, 24 deletions
diff --git a/src/u32-filterbank-accumulate/gen/neon-x1.c b/src/u32-filterbank-accumulate/gen/neon-x1.c
index 1fa5ba7a1..306bc191e 100644
--- a/src/u32-filterbank-accumulate/gen/neon-x1.c
+++ b/src/u32-filterbank-accumulate/gen/neon-x1.c
@@ -30,12 +30,10 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x1(
   assert(weights != NULL);
   assert(output != NULL);
 
-  uint64x2_t weight_accumulator = vdupq_n_u64(0);
-
-
   // Compute unweight as initial weight
   size_t n = (size_t) *weight_widths++;
   assert(n != 0);
+  uint64x2_t weight_accumulator = vdupq_n_u64(0);
 
   do {
     const uint32x2_t vi = vld1_dup_u32(input); input += 1;
@@ -45,11 +43,10 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x1(
     weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi);
   } while (--n != 0);
 
-  weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0));
-
   do {
-    n = (size_t) *weight_widths++;
+    size_t n = (size_t) *weight_widths++;
     assert(n != 0);
+    weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0));
 
     do {
       const uint32x2_t vi = vld1_dup_u32(input); input += 1;
@@ -60,7 +57,6 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x1(
     } while (--n != 0);
 
     vst1_u64(output, vget_low_u64(weight_accumulator));  output += 1;
-    weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0));
 
   } while (--rows != 0);
 }
diff --git a/src/u32-filterbank-accumulate/gen/neon-x2.c b/src/u32-filterbank-accumulate/gen/neon-x2.c
index a88a1cdd4..b0804d592 100644
--- a/src/u32-filterbank-accumulate/gen/neon-x2.c
+++ b/src/u32-filterbank-accumulate/gen/neon-x2.c
@@ -30,12 +30,10 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x2(
   assert(weights != NULL);
   assert(output != NULL);
 
-  uint64x2_t weight_accumulator = vdupq_n_u64(0);
-
-
   // Compute unweight as initial weight
   size_t n = (size_t) *weight_widths++;
   assert(n != 0);
+  uint64x2_t weight_accumulator = vdupq_n_u64(0);
 
   do {
     const uint32x2_t vi = vld1_dup_u32(input); input += 1;
@@ -45,13 +43,12 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x2(
     weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi);
   } while (--n != 0);
 
-  weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0));
-
   do {
-    n = (size_t) *weight_widths++;
+    size_t n = (size_t) *weight_widths++;
     assert(n != 0);
+    weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0));
 
-    for (;n >= 2; n -= 2) {
+    for (; n >= 2; n -= 2) {
       const uint32x2_t vi = vld1_u32(input); input += 2;
       const uint16x4_t vw = vld1_u16(weights); weights += 4;
       const uint32x4_t vw32 = vmovl_u16(vw);
@@ -69,7 +66,6 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x2(
     }
 
     vst1_u64(output, vget_low_u64(weight_accumulator));  output += 1;
-    weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0));
 
   } while (--rows != 0);
 }
diff --git a/src/u32-filterbank-accumulate/neon.c.in b/src/u32-filterbank-accumulate/neon.c.in
index c45572f5a..c4e95eba6 100644
--- a/src/u32-filterbank-accumulate/neon.c.in
+++ b/src/u32-filterbank-accumulate/neon.c.in
@@ -3,6 +3,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+$assert BATCH_TILE in [1, 2]
 #include <assert.h>
 #include <stddef.h>
 #include <stdint.h>
@@ -26,12 +27,10 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x${BATCH_TILE}(
   assert(weights != NULL);
   assert(output != NULL);
 
-  uint64x2_t weight_accumulator = vdupq_n_u64(0);
-
-
   // Compute unweight as initial weight
   size_t n = (size_t) *weight_widths++;
   assert(n != 0);
+  uint64x2_t weight_accumulator = vdupq_n_u64(0);
 
   do {
     const uint32x2_t vi = vld1_dup_u32(input); input += 1;
@@ -41,14 +40,13 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x${BATCH_TILE}(
     weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi);
   } while (--n != 0);
 
-  weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0));
-
   do {
     size_t n = (size_t) *weight_widths++;
     assert(n != 0);
+    weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0));
 
-    $if BATCH_TILE > 1:
-      for (;n >= 2; n -= 2) {
+    $if BATCH_TILE == 2:
+      for (; n >= 2; n -= 2) {
         const uint32x2_t vi = vld1_u32(input); input += 2;
         const uint16x4_t vw = vld1_u16(weights); weights += 4;
         const uint32x4_t vw32 = vmovl_u16(vw);
@@ -64,7 +62,7 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x${BATCH_TILE}(
 
         weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi);
       }
-    $else:
+    $elif BATCH_TILE == 1:
       do {
         const uint32x2_t vi = vld1_dup_u32(input); input += 1;
         const uint16x4_t vw = vreinterpret_u16_u32(vld1_dup_u32((const void*) weights)); weights += 2;
@@ -74,7 +72,6 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x${BATCH_TILE}(
       } while (--n != 0);
 
     vst1_u64(output, vget_low_u64(weight_accumulator));  output += 1;
-    weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0));
 
   } while (--rows != 0);
 }
author	Marat Dukhan <maratek@google.com>	2022-08-31 16:43:46 -0700
committer	XNNPACK Team <xnnpack-github-robot@google.com>	2022-08-31 16:44:44 -0700
commit	7497ff1ee99c4296c3bcb1d3765fa7e9851dfe88 (patch)
tree	75d61524a93611f0cfea59bb1a991e797f3e9c33
parent	d04252a13ed3cef5bd204d1e327fadae5020de43 (diff)
download	XNNPACK-7497ff1ee99c4296c3bcb1d3765fa7e9851dfe88.tar.gz