Introduce math_clz_nonzero_u32 function

PiperOrigin-RevId: 463376790
author: Marat Dukhan <maratek@google.com> 2022-07-26 10:47:39 -0700
committer: XNNPACK Team <xnnpack-github-robot@google.com> 2022-07-26 10:48:31 -0700
commit: e7a0a811b389d96f353478ef1054dc567e503216 (patch)
tree: 6a5f9ae140c1414ecc06625030b3273159db20f9
parent: 3f6b0d4d65c2f98ef3957b92dd7e6fbd42828572 (diff)
download: XNNPACK-e7a0a811b389d96f353478ef1054dc567e503216.tar.gz
4 files changed, 15 insertions, 4 deletions
diff --git a/src/math/sqrt-u32-scalar-clz-newton.c b/src/math/sqrt-u32-scalar-clz-newton.c
index 9d4da5965..b09448532 100644
--- a/src/math/sqrt-u32-scalar-clz-newton.c
+++ b/src/math/sqrt-u32-scalar-clz-newton.c
@@ -23,8 +23,8 @@ void xnn_math_u32_sqrt__scalar_clz_newton(
     uint32_t vy = vx;
 
     // Based on Hacker's Delight, Figure 11-1.
-    if (vx > 1) {
-      const uint32_t vs = 16 - (math_clz_u32(vx - 1) >> 1);
+    if (vx != 0) {
+      const uint32_t vs = 16 - (math_clz_nonzero_u32(vx - 1) >> 1);
 
       uint32_t vg0 = UINT32_C(1) << vs;
       uint32_t vg1 = (vg0 + (vx >> vs)) >> 1;
diff --git a/src/math/sqrt-u32-scalar-hashemian.c b/src/math/sqrt-u32-scalar-hashemian.c
index fd679ad20..af5f1849d 100644
--- a/src/math/sqrt-u32-scalar-hashemian.c
+++ b/src/math/sqrt-u32-scalar-hashemian.c
@@ -27,7 +27,7 @@ void xnn_math_u32_sqrt__scalar_hashemian(
        * and StackOverflow answer https://stackoverflow.com/a/31149161
       */
 
-      const uint32_t vn = math_clz_u32(vx);
+      const uint32_t vn = math_clz_nonzero_u32(vx);
       const uint32_t vleft_shift = vn & 1;
       const uint32_t vm_minus_1 = 15 - (vn >> 1);
       const uint32_t vm_plus_1 = vm_minus_1 + 2;
diff --git a/src/math/sqrt-u32-scalar-tflm.c b/src/math/sqrt-u32-scalar-tflm.c
index 72cab03e6..9edfe15e9 100644
--- a/src/math/sqrt-u32-scalar-tflm.c
+++ b/src/math/sqrt-u32-scalar-tflm.c
@@ -23,7 +23,7 @@ void xnn_math_u32_sqrt__scalar_tflm(
     // Algorithm adapted from tensorflow/lite/experimental/microfrontend/lib/filterbank.c in TFLite-Micro
     uint32_t vy = 0;
     if (vx != 0) {
-      const uint32_t vn = (math_clz_u32(vx) | 1) ^ 31;
+      const uint32_t vn = (math_clz_nonzero_u32(vx) | 1) ^ 31;
       uint32_t vb = UINT32_C(1) << vn;
       uint32_t iterations = (vn >> 1) + 1;
       while (iterations--) {
diff --git a/src/xnnpack/math.h b/src/xnnpack/math.h
index deefacf6f..7704fd5ba 100644
--- a/src/xnnpack/math.h
+++ b/src/xnnpack/math.h
@@ -193,6 +193,17 @@ XNN_INLINE static uint32_t math_clz_u32(uint32_t x) {
   #endif
 }
 
+XNN_INLINE static uint32_t math_clz_nonzero_u32(uint32_t x) {  // Counts the leading zero bits of x; the caller must pass x != 0.
+  assert(x != 0);  // Neither intrinsic below yields a defined result for a zero input.
+  #ifdef _MSC_VER
+    unsigned long index;  // Receives the bit position of the highest set bit of x.
+    _BitScanReverse(&index, (unsigned long) x);
+    return (uint32_t) index ^ 31;  // For index in [0, 31], index ^ 31 == 31 - index, which is the leading-zero count.
+  #else
+    return (uint32_t) __builtin_clz((unsigned int) x);
+  #endif
+}
+
 XNN_INLINE static uint32_t math_ctz_u32(uint32_t x) {
   #ifdef _MSC_VER
     unsigned long index;
author	Marat Dukhan <maratek@google.com>	2022-07-26 10:47:39 -0700
committer	XNNPACK Team <xnnpack-github-robot@google.com>	2022-07-26 10:48:31 -0700
commit	e7a0a811b389d96f353478ef1054dc567e503216 (patch)
tree	6a5f9ae140c1414ecc06625030b3273159db20f9
parent	3f6b0d4d65c2f98ef3957b92dd7e6fbd42828572 (diff)
download	XNNPACK-e7a0a811b389d96f353478ef1054dc567e503216.tar.gz