Evaluation stubs for U32 SQRT using F32 SQRT

PiperOrigin-RevId: 469077131
author: Marat Dukhan <maratek@google.com> 2022-08-21 19:11:13 -0700
committer: XNNPACK Team <xnnpack-github-robot@google.com> 2022-08-21 19:12:05 -0700
commit: 9614006ee79cf8fc4688bde5be135ef906c3e21c (patch)
tree: 6c74b3af24ec4adf8eba24df38da6f09c6e61565
parent: 3fd2d48a6bb392fbe033c03ae82f190ba7d186a9 (diff)
download: XNNPACK-9614006ee79cf8fc4688bde5be135ef906c3e21c.tar.gz
6 files changed, 173 insertions, 0 deletions
diff --git a/BUILD.bazel b/BUILD.bazel
index 546fc4431..d9845f862 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -1051,6 +1051,8 @@ ALL_SCALAR_MICROKERNEL_SRCS = [
     "src/math/sqrt-u32-scalar-cvti32-sqrt-lrint.c",
     "src/math/sqrt-u32-scalar-cvti64-sqrt-lrint.c",
     "src/math/sqrt-u32-scalar-cvtu32-sqrt-lrint.c",
+    "src/math/sqrt-u32-scalar-cvti64-sqrtf-lrintf.c",
+    "src/math/sqrt-u32-scalar-cvtu32-sqrtf-lrintf.c",
     "src/math/sqrt-u32-scalar-hashemian.c",
     "src/math/sqrt-u32-scalar-tflm.c",
     "src/qc8-dwconv/gen/up1x3-minmax-fp32-scalar-fmagic.c",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 721e0ef94..153faf0cc 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1039,6 +1039,8 @@ SET(ALL_SCALAR_MICROKERNEL_SRCS
   src/math/sqrt-u32-scalar-cvti32-sqrt-lrint.c
   src/math/sqrt-u32-scalar-cvti64-sqrt-lrint.c
   src/math/sqrt-u32-scalar-cvtu32-sqrt-lrint.c
+  src/math/sqrt-u32-scalar-cvti64-sqrtf-lrintf.c
+  src/math/sqrt-u32-scalar-cvtu32-sqrtf-lrintf.c
   src/math/sqrt-u32-scalar-hashemian.c
   src/math/sqrt-u32-scalar-tflm.c
   src/qc8-dwconv/gen/up1x3-minmax-fp32-scalar-fmagic.c
diff --git a/eval/u32-sqrt.cc b/eval/u32-sqrt.cc
index 7605f8c91..5b4d87576 100644
--- a/eval/u32-sqrt.cc
+++ b/eval/u32-sqrt.cc
@@ -284,6 +284,94 @@ TEST(SQRT__SCALAR_CVTU32_SQRT_LRINT, 65536_output) {
 }
 
 
+TEST(SQRT__SCALAR_CVTI64_SQRTF_LRINTF, uint16_output) {  // For inputs starting at 0 and clamped to at most 4294901760, asserts output^2 is strictly closer to the input than (output-1)^2 and (output+1)^2.
+  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> inputs(kBlockSize);
+  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> outputs(kBlockSize);
+  for (uint32_t n = 0; n <= UINT32_C(4294901760); n += kBlockSize) {  // one kernel call per block of kBlockSize inputs
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      inputs[i] = std::min<uint32_t>(n + i, UINT32_C(4294901760));  // clamp so no input exceeds 4294901760
+    }
+    xnn_math_u32_sqrt__scalar_cvti64_sqrtf_lrintf(kBlockSize * sizeof(uint32_t), inputs.data(), outputs.data());  // buffer size is passed in bytes
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      const uint32_t input = inputs[i];
+      const uint32_t output = outputs[i];
+      const int64_t squared_output = int64_t(uint64_t(output) * uint64_t(output));  // multiply in 64 bits so the square does not wrap at 32 bits
+      // Compare against the next-smaller candidate; output - 1 is unsigned and wraps to UINT32_MAX when output is 0.
+      const uint32_t prev_output = output - 1;
+      const int64_t squared_prev_output = int64_t(uint64_t(prev_output) * uint64_t(prev_output));
+      ASSERT_LT(std::abs(squared_output - int64_t(input)), std::abs(squared_prev_output - int64_t(input)))
+        << "input = " << input << ", output = " << output;
+      // Compare against the next-larger candidate.
+      const uint32_t next_output = output + 1;
+      const int64_t squared_next_output = int64_t(uint64_t(next_output) * uint64_t(next_output));
+      ASSERT_LT(std::abs(squared_output - int64_t(input)), std::abs(squared_next_output - int64_t(input)))
+        << "input = " << input << ", output = " << output;
+    }
+  }
+}
+
+TEST(SQRT__SCALAR_CVTI64_SQRTF_LRINTF, 65536_output) {  // For inputs of 4294901761 and above, asserts every kernel output equals 0x00010000 (65536).
+  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> inputs(kBlockSize);
+  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> outputs(kBlockSize);
+  for (uint32_t n = UINT32_C(4294901761); n >= UINT32_C(4294901761); n += kBlockSize) {  // n only increases, so the loop exits when the uint32_t counter wraps below the start value
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      inputs[i] = std::max<uint32_t>(n + i, UINT32_C(4294901761));  // n + i values that wrapped past UINT32_MAX are raised back to 4294901761
+    }
+    xnn_math_u32_sqrt__scalar_cvti64_sqrtf_lrintf(kBlockSize * sizeof(uint32_t), inputs.data(), outputs.data());  // buffer size is passed in bytes
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      const uint32_t input = inputs[i];
+      const uint32_t output = outputs[i];
+      ASSERT_EQ(output, UINT32_C(0x00010000))
+        << "input = " << input << ", output = " << output;
+    }
+  }
+}
+
+
+TEST(SQRT__SCALAR_CVTU32_SQRTF_LRINTF, uint16_output) {  // For inputs starting at 0 and clamped to at most 4294901760, asserts output^2 is strictly closer to the input than (output-1)^2 and (output+1)^2.
+  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> inputs(kBlockSize);
+  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> outputs(kBlockSize);
+  for (uint32_t n = 0; n <= UINT32_C(4294901760); n += kBlockSize) {  // one kernel call per block of kBlockSize inputs
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      inputs[i] = std::min<uint32_t>(n + i, UINT32_C(4294901760));  // clamp so no input exceeds 4294901760
+    }
+    xnn_math_u32_sqrt__scalar_cvtu32_sqrtf_lrintf(kBlockSize * sizeof(uint32_t), inputs.data(), outputs.data());  // buffer size is passed in bytes
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      const uint32_t input = inputs[i];
+      const uint32_t output = outputs[i];
+      const int64_t squared_output = int64_t(uint64_t(output) * uint64_t(output));  // multiply in 64 bits so the square does not wrap at 32 bits
+      // Compare against the next-smaller candidate; output - 1 is unsigned and wraps to UINT32_MAX when output is 0.
+      const uint32_t prev_output = output - 1;
+      const int64_t squared_prev_output = int64_t(uint64_t(prev_output) * uint64_t(prev_output));
+      ASSERT_LT(std::abs(squared_output - int64_t(input)), std::abs(squared_prev_output - int64_t(input)))
+        << "input = " << input << ", output = " << output;
+      // Compare against the next-larger candidate.
+      const uint32_t next_output = output + 1;
+      const int64_t squared_next_output = int64_t(uint64_t(next_output) * uint64_t(next_output));
+      ASSERT_LT(std::abs(squared_output - int64_t(input)), std::abs(squared_next_output - int64_t(input)))
+        << "input = " << input << ", output = " << output;
+    }
+  }
+}
+
+TEST(SQRT__SCALAR_CVTU32_SQRTF_LRINTF, 65536_output) {  // For inputs of 4294901761 and above, asserts every kernel output equals 0x00010000 (65536).
+  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> inputs(kBlockSize);
+  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> outputs(kBlockSize);
+  for (uint32_t n = UINT32_C(4294901761); n >= UINT32_C(4294901761); n += kBlockSize) {  // n only increases, so the loop exits when the uint32_t counter wraps below the start value
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      inputs[i] = std::max<uint32_t>(n + i, UINT32_C(4294901761));  // n + i values that wrapped past UINT32_MAX are raised back to 4294901761
+    }
+    xnn_math_u32_sqrt__scalar_cvtu32_sqrtf_lrintf(kBlockSize * sizeof(uint32_t), inputs.data(), outputs.data());  // buffer size is passed in bytes
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      const uint32_t input = inputs[i];
+      const uint32_t output = outputs[i];
+      ASSERT_EQ(output, UINT32_C(0x00010000))
+        << "input = " << input << ", output = " << output;
+    }
+  }
+}
+
+
 TEST(SQRT__SCALAR_HASHEMIAN, uint16_output) {
   std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> inputs(kBlockSize);
   std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> outputs(kBlockSize);
diff --git a/src/math/sqrt-u32-scalar-cvti64-sqrtf-lrintf.c b/src/math/sqrt-u32-scalar-cvti64-sqrtf-lrintf.c
new file mode 100644
index 000000000..dd4e68529
--- /dev/null
+++ b/src/math/sqrt-u32-scalar-cvti64-sqrtf-lrintf.c
@@ -0,0 +1,39 @@
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+#include <math.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math-stubs.h>
+
+
+void xnn_math_u32_sqrt__scalar_cvti64_sqrtf_lrintf(  // For each uint32_t in `input`, stores an integer square root (sqrtf + lrintf estimate with a +/-1 correction) to `output`.
+    size_t n,  // number of bytes to process; asserted to be a multiple of sizeof(uint32_t)
+    const uint32_t* input,  // source elements, read sequentially
+    uint32_t* output)  // destination elements, written sequentially
+{
+  assert(n % sizeof(uint32_t) == 0);
+  // One element per iteration; n counts the remaining bytes.
+  for (; n != 0; n -= sizeof(uint32_t)) {
+    const uint32_t vx = *input++;
+    // Start from vy = vx so that vx == 0 yields 0 without entering the branch below.
+    uint32_t vy = vx;
+    if XNN_LIKELY(vx != 0) {  // XNN_LIKELY is defined outside this file; presumably a branch-prediction hint
+      float vf = (float) (double) (int64_t) (uint64_t) vx;  // widen to signed 64-bit, convert to double, then narrow to float
+      vf = sqrtf(vf);
+      vy = (uint32_t) (int32_t) lrintf(vf);  // lrintf rounds to an integer using the current rounding mode
+      const uint32_t vsquared_y_less_x = vy * vy - vx;  // vy^2 - vx in wrapping unsigned 32-bit arithmetic
+      if XNN_UNPREDICTABLE((int32_t) (vsquared_y_less_x + vy) < 0) {  // vy^2 + vy - vx is negative as int32: raise the estimate
+        vy += 1;
+      } else if XNN_UNPREDICTABLE((int32_t) (vsquared_y_less_x - vy) >= 0) {  // vy^2 - vy - vx is non-negative as int32: lower the estimate
+        vy -= 1;
+      }
+    }
+    // Store the result and advance to the next output slot.
+    *output++ = vy;
+  }
+}
diff --git a/src/math/sqrt-u32-scalar-cvtu32-sqrtf-lrintf.c b/src/math/sqrt-u32-scalar-cvtu32-sqrtf-lrintf.c
new file mode 100644
index 000000000..cb3b9e94b
--- /dev/null
+++ b/src/math/sqrt-u32-scalar-cvtu32-sqrtf-lrintf.c
@@ -0,0 +1,39 @@
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+#include <math.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math-stubs.h>
+
+
+void xnn_math_u32_sqrt__scalar_cvtu32_sqrtf_lrintf(  // For each uint32_t in `input`, stores an integer square root (sqrtf + lrintf estimate with a +/-1 correction) to `output`.
+    size_t n,  // number of bytes to process; asserted to be a multiple of sizeof(uint32_t)
+    const uint32_t* input,  // source elements, read sequentially
+    uint32_t* output)  // destination elements, written sequentially
+{
+  assert(n % sizeof(uint32_t) == 0);
+  // One element per iteration; n counts the remaining bytes.
+  for (; n != 0; n -= sizeof(uint32_t)) {
+    const uint32_t vx = *input++;
+    // Start from vy = vx so that vx == 0 yields 0 without entering the branch below.
+    uint32_t vy = vx;
+    if XNN_LIKELY(vx != 0) {  // XNN_LIKELY is defined outside this file; presumably a branch-prediction hint
+      float vf = (float) vx;  // direct unsigned 32-bit to float conversion
+      vf = sqrtf(vf);
+      vy = (uint32_t) (int32_t) lrintf(vf);  // lrintf rounds to an integer using the current rounding mode
+      const uint32_t vsquared_y_less_x = vy * vy - vx;  // vy^2 - vx in wrapping unsigned 32-bit arithmetic
+      if XNN_UNPREDICTABLE((int32_t) (vsquared_y_less_x + vy) < 0) {  // vy^2 + vy - vx is negative as int32: raise the estimate
+        vy += 1;
+      } else if XNN_UNPREDICTABLE((int32_t) (vsquared_y_less_x - vy) >= 0) {  // vy^2 - vy - vx is non-negative as int32: lower the estimate
+        vy -= 1;
+      }
+    }
+    // Store the result and advance to the next output slot.
+    *output++ = vy;
+  }
+}
diff --git a/src/xnnpack/math-stubs.h b/src/xnnpack/math-stubs.h
index e79635384..1bedd8501 100644
--- a/src/xnnpack/math-stubs.h
+++ b/src/xnnpack/math-stubs.h
@@ -354,6 +354,9 @@ DECLARE_U32_UNARY_MATH_FUNCTION(xnn_math_u32_sqrt__scalar_clz_newton)
 DECLARE_U32_UNARY_MATH_FUNCTION(xnn_math_u32_sqrt__scalar_cvti32_sqrt_lrint)
 DECLARE_U32_UNARY_MATH_FUNCTION(xnn_math_u32_sqrt__scalar_cvti64_sqrt_lrint)
 DECLARE_U32_UNARY_MATH_FUNCTION(xnn_math_u32_sqrt__scalar_cvtu32_sqrt_lrint)
+DECLARE_U32_UNARY_MATH_FUNCTION(xnn_math_u32_sqrt__scalar_cvti32_sqrtf_lrintf)
+DECLARE_U32_UNARY_MATH_FUNCTION(xnn_math_u32_sqrt__scalar_cvti64_sqrtf_lrintf)
+DECLARE_U32_UNARY_MATH_FUNCTION(xnn_math_u32_sqrt__scalar_cvtu32_sqrtf_lrintf)
 DECLARE_U32_UNARY_MATH_FUNCTION(xnn_math_u32_sqrt__scalar_hashemian)
 DECLARE_U32_UNARY_MATH_FUNCTION(xnn_math_u32_sqrt__scalar_tflm)
author	Marat Dukhan <maratek@google.com>	2022-08-21 19:11:13 -0700
committer	XNNPACK Team <xnnpack-github-robot@google.com>	2022-08-21 19:12:05 -0700
commit	9614006ee79cf8fc4688bde5be135ef906c3e21c (patch)
tree	6c74b3af24ec4adf8eba24df38da6f09c6e61565
parent	3fd2d48a6bb392fbe033c03ae82f190ba7d186a9 (diff)
download	XNNPACK-9614006ee79cf8fc4688bde5be135ef906c3e21c.tar.gz