U32 SQRT evaluation stub for the Hashemian algorithm

PiperOrigin-RevId: 463282575
author: Marat Dukhan <maratek@google.com> 2022-07-26 02:02:56 -0700
committer: XNNPACK Team <xnnpack-github-robot@google.com> 2022-07-26 02:03:51 -0700
commit: 3f6b0d4d65c2f98ef3957b92dd7e6fbd42828572 (patch)
tree: 8cbbad8fb8b4a2c842e35c18aacf005644d73a19
parent: a68cd3fd7c2a1cd0c4ae4b168fb34c4cea65327b (diff)
download: XNNPACK-3f6b0d4d65c2f98ef3957b92dd7e6fbd42828572.tar.gz
5 files changed, 127 insertions, 0 deletions
diff --git a/BUILD.bazel b/BUILD.bazel
index 936391edf..6df946895 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -1035,6 +1035,7 @@ ALL_SCALAR_MICROKERNEL_SRCS = [
     "src/math/sqrt-u32-scalar-cvti32-sqrt-lrint.c",
     "src/math/sqrt-u32-scalar-cvti64-sqrt-lrint.c",
     "src/math/sqrt-u32-scalar-cvtu32-sqrt-lrint.c",
+    "src/math/sqrt-u32-scalar-hashemian.c",
     "src/math/sqrt-u32-scalar-tflm.c",
     "src/qc8-dwconv/gen/up1x9-minmax-fp32-scalar-fmagic.c",
     "src/qc8-dwconv/gen/up1x9-minmax-fp32-scalar-imagic.c",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2b9eecaee..8f879d425 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1023,6 +1023,7 @@ SET(ALL_SCALAR_MICROKERNEL_SRCS
   src/math/sqrt-u32-scalar-cvti32-sqrt-lrint.c
   src/math/sqrt-u32-scalar-cvti64-sqrt-lrint.c
   src/math/sqrt-u32-scalar-cvtu32-sqrt-lrint.c
+  src/math/sqrt-u32-scalar-hashemian.c
   src/math/sqrt-u32-scalar-tflm.c
   src/qc8-dwconv/gen/up1x9-minmax-fp32-scalar-fmagic.c
   src/qc8-dwconv/gen/up1x9-minmax-fp32-scalar-imagic.c
diff --git a/eval/u32-sqrt.cc b/eval/u32-sqrt.cc
index 8b4571654..7605f8c91 100644
--- a/eval/u32-sqrt.cc
+++ b/eval/u32-sqrt.cc
@@ -284,6 +284,50 @@ TEST(SQRT__SCALAR_CVTU32_SQRT_LRINT, 65536_output) {
 }
 
 
+TEST(SQRT__SCALAR_HASHEMIAN, uint16_output) {
+  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> inputs(kBlockSize);
+  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> outputs(kBlockSize);
+  for (uint32_t n = 0; n <= UINT32_C(4294901760); n += kBlockSize) {
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      inputs[i] = std::min<uint32_t>(n + i, UINT32_C(4294901760));
+    }
+    xnn_math_u32_sqrt__scalar_hashemian(kBlockSize * sizeof(uint32_t), inputs.data(), outputs.data());
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      const uint32_t input = inputs[i];
+      const uint32_t output = outputs[i];
+      const int64_t squared_output = int64_t(uint64_t(output) * uint64_t(output));
+
+      const uint32_t prev_output = output - 1;
+      const int64_t squared_prev_output = int64_t(uint64_t(prev_output) * uint64_t(prev_output));
+      ASSERT_LT(std::abs(squared_output - int64_t(input)), std::abs(squared_prev_output - int64_t(input)))
+        << "input = " << input << ", output = " << output;
+
+      const uint32_t next_output = output + 1;
+      const int64_t squared_next_output = int64_t(uint64_t(next_output) * uint64_t(next_output));
+      ASSERT_LT(std::abs(squared_output - int64_t(input)), std::abs(squared_next_output - int64_t(input)))
+        << "input = " << input << ", output = " << output;
+    }
+  }
+}
+
+TEST(SQRT__SCALAR_HASHEMIAN, 65536_output) {
+  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> inputs(kBlockSize);
+  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> outputs(kBlockSize);
+  for (uint32_t n = UINT32_C(4294901761); n >= UINT32_C(4294901761); n += kBlockSize) {
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      inputs[i] = std::max<uint32_t>(n + i, UINT32_C(4294901761));
+    }
+    xnn_math_u32_sqrt__scalar_hashemian(kBlockSize * sizeof(uint32_t), inputs.data(), outputs.data());
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      const uint32_t input = inputs[i];
+      const uint32_t output = outputs[i];
+      ASSERT_EQ(output, UINT32_C(0x00010000))
+        << "input = " << input << ", output = " << output;
+    }
+  }
+}
+
+
 TEST(SQRT__SCALAR_TFLM, uint16_output) {
   std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> inputs(kBlockSize);
   std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> outputs(kBlockSize);
diff --git a/src/math/sqrt-u32-scalar-hashemian.c b/src/math/sqrt-u32-scalar-hashemian.c
new file mode 100644
index 000000000..fd679ad20
--- /dev/null
+++ b/src/math/sqrt-u32-scalar-hashemian.c
@@ -0,0 +1,80 @@
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+
+#include <xnnpack/math.h>
+#include <xnnpack/math-stubs.h>
+
+
+void xnn_math_u32_sqrt__scalar_hashemian(
+    size_t n,
+    const uint32_t* input,
+    uint32_t* output)
+{
+  assert(n % sizeof(uint32_t) == 0);
+
+  for (; n != 0; n -= sizeof(uint32_t)) {
+    const uint32_t vx = *input++;
+
+    uint32_t vy = vx;
+    if (vx != 0) {
+      /*
+       * Based on "Square Rooting Algorithms for Integer and Floating-Point Numbers" by Reza Hashemian
+       * and StackOverflow answer https://stackoverflow.com/a/31149161
+      */
+
+      const uint32_t vn = math_clz_u32(vx);
+      const uint32_t vleft_shift = vn & 1;
+      const uint32_t vm_minus_1 = 15 - (vn >> 1);
+      const uint32_t vm_plus_1 = vm_minus_1 + 2;
+      const uint32_t vexp2_m_minus_1 = UINT32_C(1) << vm_minus_1;
+      const uint32_t vz = vexp2_m_minus_1 - (vx >> (vm_plus_1 - vleft_shift));
+
+      vy = vz;
+      // Iterate until y[i] == y[i-1]. Alternatively, we can do 7 iterations:
+      //   for (uint32_t i = 0; i < 7; i++) {
+      //     vy = vz + ((vy * vy) >> vm_plus_1);
+      //   }
+      uint32_t vy_prev;
+      do {
+        vy_prev = vy;
+        vy = vz + ((vy * vy) >> vm_plus_1);
+      } while (vy != vy_prev);
+
+      // Reconstruct Y = 2**m - vy
+      vy = (vexp2_m_minus_1 << 1) - vy;
+      if XNN_UNPREDICTABLE(vleft_shift) {
+        // Multiply by sqrt(0.5) by subtracting vy * (1 - sqrt(0.5)), 1 - sqrt(0.5) is represented
+        // as a .16 fixed-point number to guarantee than the product doesn't overflow 32 bits.
+        // Using 1 - sqrt(0.5) under these constraints is 1 bit more accurate than using sqrt(0.5) directly.
+        vy -= (vy * UINT32_C(19195)) >> 16;
+      }
+
+      // When X has an even number of bits, Y can overestimate isqrt(X) by 1 due to truncations in fixed-point
+      // arithmetics. When X has an odd number of bits, Y can overestimate isqrt(X) by an extra 1 (2 total) due to
+      // truncation in the multiplication by sqrt(0.5).
+      // We decrement Y once if X < Y * Y and decrement it once again if Y * Y - X > X - (Y - 1) * (Y - 1).
+      uint32_t vsquared_y = vy * vy;
+      if XNN_UNPREDICTABLE(vsquared_y > vx) {
+        vsquared_y -= 2 * vy - 1;
+        vy -= 1;
+      }
+
+      // Y is within a distance of 1 from properly rounded sqrt(X).
+      // - Increment Y if (Y + 1) * (Y + 1) - X < X - Y * Y.
+      // - Decrement Y if Y * Y - X > X - (Y - 1) * (Y - 1).
+      // The increment + decrement are combined together to re-use the (Y * Y) value.
+      if XNN_UNPREDICTABLE(vsquared_y < vx - vy) {
+        vy += 1;
+      } else if XNN_UNPREDICTABLE(vsquared_y - vy >= vx) {
+        vy -= 1;
+      }
+    }
+
+    *output++ = vy;
+  }
+}
diff --git a/src/xnnpack/math-stubs.h b/src/xnnpack/math-stubs.h
index 695f572dd..e79635384 100644
--- a/src/xnnpack/math-stubs.h
+++ b/src/xnnpack/math-stubs.h
@@ -354,6 +354,7 @@ DECLARE_U32_UNARY_MATH_FUNCTION(xnn_math_u32_sqrt__scalar_clz_newton)
 DECLARE_U32_UNARY_MATH_FUNCTION(xnn_math_u32_sqrt__scalar_cvti32_sqrt_lrint)
 DECLARE_U32_UNARY_MATH_FUNCTION(xnn_math_u32_sqrt__scalar_cvti64_sqrt_lrint)
 DECLARE_U32_UNARY_MATH_FUNCTION(xnn_math_u32_sqrt__scalar_cvtu32_sqrt_lrint)
+DECLARE_U32_UNARY_MATH_FUNCTION(xnn_math_u32_sqrt__scalar_hashemian)
 DECLARE_U32_UNARY_MATH_FUNCTION(xnn_math_u32_sqrt__scalar_tflm)
 
 #ifdef __cplusplus
author	Marat Dukhan <maratek@google.com>	2022-07-26 02:02:56 -0700
committer	XNNPACK Team <xnnpack-github-robot@google.com>	2022-07-26 02:03:51 -0700
commit	3f6b0d4d65c2f98ef3957b92dd7e6fbd42828572 (patch)
tree	8cbbad8fb8b4a2c842e35c18aacf005644d73a19
parent	a68cd3fd7c2a1cd0c4ae4b168fb34c4cea65327b (diff)
download	XNNPACK-3f6b0d4d65c2f98ef3957b92dd7e6fbd42828572.tar.gz