diff options
author | Marat Dukhan <maratek@google.com> | 2022-07-26 02:02:56 -0700 |
---|---|---|
committer | XNNPACK Team <xnnpack-github-robot@google.com> | 2022-07-26 02:03:51 -0700 |
commit | 3f6b0d4d65c2f98ef3957b92dd7e6fbd42828572 (patch) | |
tree | 8cbbad8fb8b4a2c842e35c18aacf005644d73a19 | |
parent | a68cd3fd7c2a1cd0c4ae4b168fb34c4cea65327b (diff) | |
download | XNNPACK-3f6b0d4d65c2f98ef3957b92dd7e6fbd42828572.tar.gz |
U32 SQRT evaluation stub for the Hashemian algorithm
PiperOrigin-RevId: 463282575
-rw-r--r-- | BUILD.bazel | 1 | ||||
-rwxr-xr-x | CMakeLists.txt | 1 | ||||
-rw-r--r-- | eval/u32-sqrt.cc | 44 | ||||
-rw-r--r-- | src/math/sqrt-u32-scalar-hashemian.c | 80 | ||||
-rw-r--r-- | src/xnnpack/math-stubs.h | 1 |
5 files changed, 127 insertions, 0 deletions
diff --git a/BUILD.bazel b/BUILD.bazel index 936391edf..6df946895 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -1035,6 +1035,7 @@ ALL_SCALAR_MICROKERNEL_SRCS = [ "src/math/sqrt-u32-scalar-cvti32-sqrt-lrint.c", "src/math/sqrt-u32-scalar-cvti64-sqrt-lrint.c", "src/math/sqrt-u32-scalar-cvtu32-sqrt-lrint.c", + "src/math/sqrt-u32-scalar-hashemian.c", "src/math/sqrt-u32-scalar-tflm.c", "src/qc8-dwconv/gen/up1x9-minmax-fp32-scalar-fmagic.c", "src/qc8-dwconv/gen/up1x9-minmax-fp32-scalar-imagic.c", diff --git a/CMakeLists.txt b/CMakeLists.txt index 2b9eecaee..8f879d425 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1023,6 +1023,7 @@ SET(ALL_SCALAR_MICROKERNEL_SRCS src/math/sqrt-u32-scalar-cvti32-sqrt-lrint.c src/math/sqrt-u32-scalar-cvti64-sqrt-lrint.c src/math/sqrt-u32-scalar-cvtu32-sqrt-lrint.c + src/math/sqrt-u32-scalar-hashemian.c src/math/sqrt-u32-scalar-tflm.c src/qc8-dwconv/gen/up1x9-minmax-fp32-scalar-fmagic.c src/qc8-dwconv/gen/up1x9-minmax-fp32-scalar-imagic.c diff --git a/eval/u32-sqrt.cc b/eval/u32-sqrt.cc index 8b4571654..7605f8c91 100644 --- a/eval/u32-sqrt.cc +++ b/eval/u32-sqrt.cc @@ -284,6 +284,50 @@ TEST(SQRT__SCALAR_CVTU32_SQRT_LRINT, 65536_output) { } +TEST(SQRT__SCALAR_HASHEMIAN, uint16_output) { + std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> inputs(kBlockSize); + std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> outputs(kBlockSize); + for (uint32_t n = 0; n <= UINT32_C(4294901760); n += kBlockSize) { + for (uint32_t i = 0; i < kBlockSize; i++) { + inputs[i] = std::min<uint32_t>(n + i, UINT32_C(4294901760)); + } + xnn_math_u32_sqrt__scalar_hashemian(kBlockSize * sizeof(uint32_t), inputs.data(), outputs.data()); + for (uint32_t i = 0; i < kBlockSize; i++) { + const uint32_t input = inputs[i]; + const uint32_t output = outputs[i]; + const int64_t squared_output = int64_t(uint64_t(output) * uint64_t(output)); + + const uint32_t prev_output = output - 1; + const int64_t squared_prev_output = int64_t(uint64_t(prev_output) * uint64_t(prev_output)); + ASSERT_LT(std::abs(squared_output - int64_t(input)), std::abs(squared_prev_output - int64_t(input))) + << "input = " << input << ", output = " << output; + + const uint32_t next_output = output + 1; + const int64_t squared_next_output = int64_t(uint64_t(next_output) * uint64_t(next_output)); + ASSERT_LT(std::abs(squared_output - int64_t(input)), std::abs(squared_next_output - int64_t(input))) + << "input = " << input << ", output = " << output; + } + } +} + +TEST(SQRT__SCALAR_HASHEMIAN, 65536_output) { + std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> inputs(kBlockSize); + std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> outputs(kBlockSize); + for (uint32_t n = UINT32_C(4294901761); n >= UINT32_C(4294901761); n += kBlockSize) { + for (uint32_t i = 0; i < kBlockSize; i++) { + inputs[i] = std::max<uint32_t>(n + i, UINT32_C(4294901761)); + } + xnn_math_u32_sqrt__scalar_hashemian(kBlockSize * sizeof(uint32_t), inputs.data(), outputs.data()); + for (uint32_t i = 0; i < kBlockSize; i++) { + const uint32_t input = inputs[i]; + const uint32_t output = outputs[i]; + ASSERT_EQ(output, UINT32_C(0x00010000)) + << "input = " << input << ", output = " << output; + } + } +} + + TEST(SQRT__SCALAR_TFLM, uint16_output) { std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> inputs(kBlockSize); std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> outputs(kBlockSize); diff --git a/src/math/sqrt-u32-scalar-hashemian.c b/src/math/sqrt-u32-scalar-hashemian.c new file mode 100644 index 000000000..fd679ad20 --- /dev/null +++ b/src/math/sqrt-u32-scalar-hashemian.c @@ -0,0 +1,80 @@ +// Copyright 2022 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include <assert.h> +#include <stddef.h> + +#include <xnnpack/math.h> +#include <xnnpack/math-stubs.h> + + +void xnn_math_u32_sqrt__scalar_hashemian( + size_t n, + const uint32_t* input, + uint32_t* output) +{ + assert(n % sizeof(uint32_t) == 0); + + for (; n != 0; n -= sizeof(uint32_t)) { + const uint32_t vx = *input++; + + uint32_t vy = vx; + if (vx != 0) { + /* + * Based on "Square Rooting Algorithms for Integer and Floating-Point Numbers" by Reza Hashemian + * and StackOverflow answer https://stackoverflow.com/a/31149161 + */ + + const uint32_t vn = math_clz_u32(vx); + const uint32_t vleft_shift = vn & 1; + const uint32_t vm_minus_1 = 15 - (vn >> 1); + const uint32_t vm_plus_1 = vm_minus_1 + 2; + const uint32_t vexp2_m_minus_1 = UINT32_C(1) << vm_minus_1; + const uint32_t vz = vexp2_m_minus_1 - (vx >> (vm_plus_1 - vleft_shift)); + + vy = vz; + // Iterate until y[i] == y[i-1]. Alternatively, we can do 7 iterations: + // for (uint32_t i = 0; i < 7; i++) { + // vy = vz + ((vy * vy) >> vm_plus_1); + // } + uint32_t vy_prev; + do { + vy_prev = vy; + vy = vz + ((vy * vy) >> vm_plus_1); + } while (vy != vy_prev); + + // Reconstruct Y = 2**m - vy + vy = (vexp2_m_minus_1 << 1) - vy; + if XNN_UNPREDICTABLE(vleft_shift) { + // Multiply by sqrt(0.5) by subtracting vy * (1 - sqrt(0.5)), 1 - sqrt(0.5) is represented + // as a .16 fixed-point number to guarantee than the product doesn't overflow 32 bits. + // Using 1 - sqrt(0.5) under these constraints is 1 bit more accurate than using sqrt(0.5) directly. + vy -= (vy * UINT32_C(19195)) >> 16; + } + + // When X has an even number of bits, Y can overestimate isqrt(X) by 1 due to truncations in fixed-point + // arithmetics. When X has an odd number of bits, Y can overestimate isqrt(X) by an extra 1 (2 total) due to + // truncation in the multiplication by sqrt(0.5). + // We decrement Y once if X < Y * Y and decrement it once again if Y * Y - X > X - (Y - 1) * (Y - 1). + uint32_t vsquared_y = vy * vy; + if XNN_UNPREDICTABLE(vsquared_y > vx) { + vsquared_y -= 2 * vy - 1; + vy -= 1; + } + + // Y is within a distance of 1 from properly rounded sqrt(X). + // - Increment Y if (Y + 1) * (Y + 1) - X < X - Y * Y. + // - Decrement Y if Y * Y - X > X - (Y - 1) * (Y - 1). + // The increment + decrement are combined together to re-use the (Y * Y) value. + if XNN_UNPREDICTABLE(vsquared_y < vx - vy) { + vy += 1; + } else if XNN_UNPREDICTABLE(vsquared_y - vy >= vx) { + vy -= 1; + } + } + + *output++ = vy; + } +} diff --git a/src/xnnpack/math-stubs.h b/src/xnnpack/math-stubs.h index 695f572dd..e79635384 100644 --- a/src/xnnpack/math-stubs.h +++ b/src/xnnpack/math-stubs.h @@ -354,6 +354,7 @@ DECLARE_U32_UNARY_MATH_FUNCTION(xnn_math_u32_sqrt__scalar_clz_newton) DECLARE_U32_UNARY_MATH_FUNCTION(xnn_math_u32_sqrt__scalar_cvti32_sqrt_lrint) DECLARE_U32_UNARY_MATH_FUNCTION(xnn_math_u32_sqrt__scalar_cvti64_sqrt_lrint) DECLARE_U32_UNARY_MATH_FUNCTION(xnn_math_u32_sqrt__scalar_cvtu32_sqrt_lrint) +DECLARE_U32_UNARY_MATH_FUNCTION(xnn_math_u32_sqrt__scalar_hashemian) DECLARE_U32_UNARY_MATH_FUNCTION(xnn_math_u32_sqrt__scalar_tflm) #ifdef __cplusplus |