aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMarat Dukhan <maratek@google.com>2022-07-26 02:02:56 -0700
committerXNNPACK Team <xnnpack-github-robot@google.com>2022-07-26 02:03:51 -0700
commit3f6b0d4d65c2f98ef3957b92dd7e6fbd42828572 (patch)
tree8cbbad8fb8b4a2c842e35c18aacf005644d73a19
parenta68cd3fd7c2a1cd0c4ae4b168fb34c4cea65327b (diff)
downloadXNNPACK-3f6b0d4d65c2f98ef3957b92dd7e6fbd42828572.tar.gz
U32 SQRT evaluation stub for the Hashemian algorithm
PiperOrigin-RevId: 463282575
-rw-r--r--BUILD.bazel1
-rwxr-xr-xCMakeLists.txt1
-rw-r--r--eval/u32-sqrt.cc44
-rw-r--r--src/math/sqrt-u32-scalar-hashemian.c80
-rw-r--r--src/xnnpack/math-stubs.h1
5 files changed, 127 insertions, 0 deletions
diff --git a/BUILD.bazel b/BUILD.bazel
index 936391edf..6df946895 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -1035,6 +1035,7 @@ ALL_SCALAR_MICROKERNEL_SRCS = [
"src/math/sqrt-u32-scalar-cvti32-sqrt-lrint.c",
"src/math/sqrt-u32-scalar-cvti64-sqrt-lrint.c",
"src/math/sqrt-u32-scalar-cvtu32-sqrt-lrint.c",
+ "src/math/sqrt-u32-scalar-hashemian.c",
"src/math/sqrt-u32-scalar-tflm.c",
"src/qc8-dwconv/gen/up1x9-minmax-fp32-scalar-fmagic.c",
"src/qc8-dwconv/gen/up1x9-minmax-fp32-scalar-imagic.c",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2b9eecaee..8f879d425 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1023,6 +1023,7 @@ SET(ALL_SCALAR_MICROKERNEL_SRCS
src/math/sqrt-u32-scalar-cvti32-sqrt-lrint.c
src/math/sqrt-u32-scalar-cvti64-sqrt-lrint.c
src/math/sqrt-u32-scalar-cvtu32-sqrt-lrint.c
+ src/math/sqrt-u32-scalar-hashemian.c
src/math/sqrt-u32-scalar-tflm.c
src/qc8-dwconv/gen/up1x9-minmax-fp32-scalar-fmagic.c
src/qc8-dwconv/gen/up1x9-minmax-fp32-scalar-imagic.c
diff --git a/eval/u32-sqrt.cc b/eval/u32-sqrt.cc
index 8b4571654..7605f8c91 100644
--- a/eval/u32-sqrt.cc
+++ b/eval/u32-sqrt.cc
@@ -284,6 +284,50 @@ TEST(SQRT__SCALAR_CVTU32_SQRT_LRINT, 65536_output) {
}
+TEST(SQRT__SCALAR_HASHEMIAN, uint16_output) {
+ std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> inputs(kBlockSize);
+ std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> outputs(kBlockSize);
+ for (uint32_t n = 0; n <= UINT32_C(4294901760); n += kBlockSize) {
+ for (uint32_t i = 0; i < kBlockSize; i++) {
+ inputs[i] = std::min<uint32_t>(n + i, UINT32_C(4294901760));
+ }
+ xnn_math_u32_sqrt__scalar_hashemian(kBlockSize * sizeof(uint32_t), inputs.data(), outputs.data());
+ for (uint32_t i = 0; i < kBlockSize; i++) {
+ const uint32_t input = inputs[i];
+ const uint32_t output = outputs[i];
+ const int64_t squared_output = int64_t(uint64_t(output) * uint64_t(output));
+
+ const uint32_t prev_output = output - 1;
+ const int64_t squared_prev_output = int64_t(uint64_t(prev_output) * uint64_t(prev_output));
+ ASSERT_LT(std::abs(squared_output - int64_t(input)), std::abs(squared_prev_output - int64_t(input)))
+ << "input = " << input << ", output = " << output;
+
+ const uint32_t next_output = output + 1;
+ const int64_t squared_next_output = int64_t(uint64_t(next_output) * uint64_t(next_output));
+ ASSERT_LT(std::abs(squared_output - int64_t(input)), std::abs(squared_next_output - int64_t(input)))
+ << "input = " << input << ", output = " << output;
+ }
+ }
+}
+
+TEST(SQRT__SCALAR_HASHEMIAN, 65536_output) {
+ std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> inputs(kBlockSize);
+ std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> outputs(kBlockSize);
+ for (uint32_t n = UINT32_C(4294901761); n >= UINT32_C(4294901761); n += kBlockSize) {
+ for (uint32_t i = 0; i < kBlockSize; i++) {
+ inputs[i] = std::max<uint32_t>(n + i, UINT32_C(4294901761));
+ }
+ xnn_math_u32_sqrt__scalar_hashemian(kBlockSize * sizeof(uint32_t), inputs.data(), outputs.data());
+ for (uint32_t i = 0; i < kBlockSize; i++) {
+ const uint32_t input = inputs[i];
+ const uint32_t output = outputs[i];
+ ASSERT_EQ(output, UINT32_C(0x00010000))
+ << "input = " << input << ", output = " << output;
+ }
+ }
+}
+
+
TEST(SQRT__SCALAR_TFLM, uint16_output) {
std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> inputs(kBlockSize);
std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> outputs(kBlockSize);
diff --git a/src/math/sqrt-u32-scalar-hashemian.c b/src/math/sqrt-u32-scalar-hashemian.c
new file mode 100644
index 000000000..fd679ad20
--- /dev/null
+++ b/src/math/sqrt-u32-scalar-hashemian.c
@@ -0,0 +1,80 @@
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+
+#include <xnnpack/math.h>
+#include <xnnpack/math-stubs.h>
+
+
+void xnn_math_u32_sqrt__scalar_hashemian(
+ size_t n,
+ const uint32_t* input,
+ uint32_t* output)
+{
+ assert(n % sizeof(uint32_t) == 0);
+
+ for (; n != 0; n -= sizeof(uint32_t)) {
+ const uint32_t vx = *input++;
+
+ uint32_t vy = vx;
+ if (vx != 0) {
+ /*
+ * Based on "Square Rooting Algorithms for Integer and Floating-Point Numbers" by Reza Hashemian
+ * and StackOverflow answer https://stackoverflow.com/a/31149161
+ */
+
+ const uint32_t vn = math_clz_u32(vx);
+ const uint32_t vleft_shift = vn & 1;
+ const uint32_t vm_minus_1 = 15 - (vn >> 1);
+ const uint32_t vm_plus_1 = vm_minus_1 + 2;
+ const uint32_t vexp2_m_minus_1 = UINT32_C(1) << vm_minus_1;
+ const uint32_t vz = vexp2_m_minus_1 - (vx >> (vm_plus_1 - vleft_shift));
+
+ vy = vz;
+ // Iterate until y[i] == y[i-1]. Alternatively, we can do 7 iterations:
+ // for (uint32_t i = 0; i < 7; i++) {
+ // vy = vz + ((vy * vy) >> vm_plus_1);
+ // }
+ uint32_t vy_prev;
+ do {
+ vy_prev = vy;
+ vy = vz + ((vy * vy) >> vm_plus_1);
+ } while (vy != vy_prev);
+
+ // Reconstruct Y = 2**m - vy
+ vy = (vexp2_m_minus_1 << 1) - vy;
+ if XNN_UNPREDICTABLE(vleft_shift) {
+ // Multiply by sqrt(0.5) by subtracting vy * (1 - sqrt(0.5)), 1 - sqrt(0.5) is represented
+ // as a .16 fixed-point number to guarantee than the product doesn't overflow 32 bits.
+ // Using 1 - sqrt(0.5) under these constraints is 1 bit more accurate than using sqrt(0.5) directly.
+ vy -= (vy * UINT32_C(19195)) >> 16;
+ }
+
+ // When X has an even number of bits, Y can overestimate isqrt(X) by 1 due to truncations in fixed-point
+ // arithmetics. When X has an odd number of bits, Y can overestimate isqrt(X) by an extra 1 (2 total) due to
+ // truncation in the multiplication by sqrt(0.5).
+ // We decrement Y once if X < Y * Y and decrement it once again if Y * Y - X > X - (Y - 1) * (Y - 1).
+ uint32_t vsquared_y = vy * vy;
+ if XNN_UNPREDICTABLE(vsquared_y > vx) {
+ vsquared_y -= 2 * vy - 1;
+ vy -= 1;
+ }
+
+ // Y is within a distance of 1 from properly rounded sqrt(X).
+ // - Increment Y if (Y + 1) * (Y + 1) - X < X - Y * Y.
+ // - Decrement Y if Y * Y - X > X - (Y - 1) * (Y - 1).
+ // The increment + decrement are combined together to re-use the (Y * Y) value.
+ if XNN_UNPREDICTABLE(vsquared_y < vx - vy) {
+ vy += 1;
+ } else if XNN_UNPREDICTABLE(vsquared_y - vy >= vx) {
+ vy -= 1;
+ }
+ }
+
+ *output++ = vy;
+ }
+}
diff --git a/src/xnnpack/math-stubs.h b/src/xnnpack/math-stubs.h
index 695f572dd..e79635384 100644
--- a/src/xnnpack/math-stubs.h
+++ b/src/xnnpack/math-stubs.h
@@ -354,6 +354,7 @@ DECLARE_U32_UNARY_MATH_FUNCTION(xnn_math_u32_sqrt__scalar_clz_newton)
DECLARE_U32_UNARY_MATH_FUNCTION(xnn_math_u32_sqrt__scalar_cvti32_sqrt_lrint)
DECLARE_U32_UNARY_MATH_FUNCTION(xnn_math_u32_sqrt__scalar_cvti64_sqrt_lrint)
DECLARE_U32_UNARY_MATH_FUNCTION(xnn_math_u32_sqrt__scalar_cvtu32_sqrt_lrint)
+DECLARE_U32_UNARY_MATH_FUNCTION(xnn_math_u32_sqrt__scalar_hashemian)
DECLARE_U32_UNARY_MATH_FUNCTION(xnn_math_u32_sqrt__scalar_tflm)
#ifdef __cplusplus