From 04e91eca36b0a7dbbab78bf9401c978ab1b08b67 Mon Sep 17 00:00:00 2001
From: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Tue, 20 Dec 2022 09:26:47 +0000
Subject: pl/math: Add scalar & vector/Neon cbrt

New routines use the same algorithm, with simplified argument
reduction and recombination in the vector variant. Both are accurate
to 2 ULP.
---
 pl/math/cbrt_2u.c         | 70 ++++++++++++++++++++++++++++++++++
 pl/math/cbrt_data.c       | 15 ++++++++
 pl/math/include/mathlib.h |  5 +++
 pl/math/math_config.h     |  6 +++
 pl/math/s_cbrt_2u.c       |  6 +++
 pl/math/tools/cbrt.sollya | 20 ++++++++++
 pl/math/v_cbrt_2u.c       | 97 +++++++++++++++++++++++++++++++++++++++++++++++
 pl/math/vn_cbrt_2u.c      | 12 ++++++
 8 files changed, 231 insertions(+)
 create mode 100644 pl/math/cbrt_2u.c
 create mode 100644 pl/math/cbrt_data.c
 create mode 100644 pl/math/s_cbrt_2u.c
 create mode 100644 pl/math/tools/cbrt.sollya
 create mode 100644 pl/math/v_cbrt_2u.c
 create mode 100644 pl/math/vn_cbrt_2u.c

diff --git a/pl/math/cbrt_2u.c b/pl/math/cbrt_2u.c
new file mode 100644
index 0000000..f89dd87
--- /dev/null
+++ b/pl/math/cbrt_2u.c
@@ -0,0 +1,70 @@
+/*
+ * Double-precision cbrt(x) function.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+PL_SIG (S, D, 1, cbrt, -10.0, 10.0)
+
+#define AbsMask 0x7fffffffffffffff
+#define TwoThirds 0x1.5555555555555p-1
+
+#define C(i) __cbrt_data.poly[i]
+#define T(i) __cbrt_data.table[i]
+
+/* Approximation for double-precision cbrt(x), using low-order polynomial and
+   two Newton iterations. Greatest observed error is 1.79 ULP. Errors repeat
+   according to the exponent, for instance an error observed for double value
+   m * 2^e will be observed for any input m * 2^(e + 3*i), where i is an
+   integer.
+   cbrt(0x1.fffff403f0bc6p+1) got 0x1.965fe72821e9bp+0
+			     want 0x1.965fe72821e99p+0.  */
+double
+cbrt (double x)
+{
+  uint64_t ix = asuint64 (x);
+  uint64_t iax = ix & AbsMask;
+  uint64_t sign = ix & ~AbsMask;
+
+  if (unlikely (iax == 0 || iax == 0x7f80000000000000))
+    return x;
+
+  /* |x| = m * 2^e, where m is in [0.5, 1.0].
+     We can easily decompose x into m and e using frexp.  */
+  int e;
+  double m = frexp (asdouble (iax), &e);
+
+  /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point for
+     Newton iterations.  */
+  double p_01 = fma (C (1), m, C (0));
+  double p_23 = fma (C (3), m, C (2));
+  double p = fma (p_23, m * m, p_01);
+
+  /* Two iterations of Newton's method for iteratively approximating cbrt.  */
+  double m_by_3 = m / 3;
+  double a = fma (TwoThirds, p, m_by_3 / (p * p));
+  a = fma (TwoThirds, a, m_by_3 / (a * a));
+
+  /* Assemble the result by the following:
+
+     cbrt(x) = cbrt(m) * 2 ^ (e / 3).
+
+     Let t = (2 ^ (e / 3)) / (2 ^ round(e / 3)).
+
+     Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3.
+     i is an integer in [-2, 2], so t can be looked up in the table T.
+     Hence the result is assembled as:
+
+     cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign.
+     Which can be done easily using ldexp.  */
+  return asdouble (asuint64 (ldexp (a * T (2 + e % 3), e / 3)) | sign);
+}
+
+PL_TEST_ULP (cbrt, 1.30)
+PL_TEST_INTERVAL (cbrt, 0, inf, 1000000)
+PL_TEST_INTERVAL (cbrt, -0, -inf, 1000000)
diff --git a/pl/math/cbrt_data.c b/pl/math/cbrt_data.c
new file mode 100644
index 0000000..1c6ca73
--- /dev/null
+++ b/pl/math/cbrt_data.c
@@ -0,0 +1,15 @@
+/*
+ * Coefficients and table entries for double-precision cbrt(x).
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+const struct cbrt_data __cbrt_data
+  = {.poly = { /* Coefficients for very rough approximation of cbrt(x) in [0.5, 1].
+                  See cbrt.sollya for details of generation.  */
+	      0x1.c14e8ee44767p-2, 0x1.dd2d3f99e4c0ep-1, -0x1.08e83026b7e74p-1, 0x1.2c74eaa3ba428p-3},
+     .table = { /* table[i] = 2^((i - 2) / 3).  */
+	         0x1.428a2f98d728bp-1, 0x1.965fea53d6e3dp-1, 0x1p0, 0x1.428a2f98d728bp0, 0x1.965fea53d6e3dp0}};
diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h
index 1266eb7..12d72fe 100644
--- a/pl/math/include/mathlib.h
+++ b/pl/math/include/mathlib.h
@@ -27,6 +27,7 @@ float tanhf (float);
 double acosh (double);
 double asinh (double);
 double atan2 (double, double);
+double cbrt (double);
 double cosh (double);
 double erfc (double);
 double expm1 (double);
@@ -53,6 +54,7 @@ float __s_tanhf (float);
 double __s_asinh (double);
 double __s_atan (double);
 double __s_atan2 (double, double);
+double __s_cbrt (double);
 double __s_cosh (double);
 double __s_erf (double);
 double __s_erfc (double);
@@ -82,6 +84,7 @@ __f32x4_t __v_atan2f (__f32x4_t, __f32x4_t);
 __f64x2_t __v_atan2 (__f64x2_t, __f64x2_t);
 __f32x4_t __v_atanhf (__f32x4_t);
 __f32x4_t __v_cbrtf (__f32x4_t);
+__f64x2_t __v_cbrt (__f64x2_t);
 __f32x4_t __v_coshf (__f32x4_t);
 __f64x2_t __v_cosh (__f64x2_t);
 __f32x4_t __v_erff (__f32x4_t);
@@ -113,6 +116,7 @@ __vpcs __f32x4_t __vn_atan2f (__f32x4_t, __f32x4_t);
 __vpcs __f64x2_t __vn_atan2 (__f64x2_t, __f64x2_t);
 __vpcs __f32x4_t __vn_atanhf (__f32x4_t);
 __vpcs __f32x4_t __vn_cbrtf (__f32x4_t);
+__vpcs __f64x2_t __vn_cbrt (__f64x2_t);
 __vpcs __f32x4_t __vn_coshf (__f32x4_t);
 __vpcs __f64x2_t __vn_cosh (__f64x2_t);
 __vpcs __f32x4_t __vn_erff (__f32x4_t);
@@ -141,6 +145,7 @@ __vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t);
 __vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t);
 __vpcs __f32x4_t _ZGVnN4v_atanhf (__f32x4_t);
 __vpcs __f32x4_t _ZGVnN4v_cbrtf (__f32x4_t);
+__vpcs __f64x2_t _ZGVnN2v_cbrt (__f64x2_t);
 __vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t);
 __vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t);
 __vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t);
diff --git a/pl/math/math_config.h b/pl/math/math_config.h
index 90d571c..92ccebf 100644
--- a/pl/math/math_config.h
+++ b/pl/math/math_config.h
@@ -558,4 +558,10 @@ extern const struct cbrtf_data
   float table[5];
 } __cbrtf_data HIDDEN;
 
+extern const struct cbrt_data
+{
+  double poly[4];
+  double table[5];
+} __cbrt_data HIDDEN;
+
 #endif
diff --git a/pl/math/s_cbrt_2u.c b/pl/math/s_cbrt_2u.c
new file mode 100644
index 0000000..22f726b
--- /dev/null
+++ b/pl/math/s_cbrt_2u.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_cbrt_2u.c"
diff --git a/pl/math/tools/cbrt.sollya b/pl/math/tools/cbrt.sollya
new file mode 100644
index 0000000..7f179eb
--- /dev/null
+++ b/pl/math/tools/cbrt.sollya
@@ -0,0 +1,20 @@
+// polynomial for approximating cbrt(x) in double precision
+//
+// Copyright (c) 2022, Arm Limited.
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+deg = 3;
+
+a = 0.5;
+b = 1;
+
+
+f = x^(1/3);
+
+poly = fpminimax(f, deg, [|double ...|], [a;b]);
+
+display = hexadecimal;
+print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30));
+print("in [",a,b,"]");
+print("coeffs:");
+for i from 0 to deg do round(coeff(poly,i), D, RN);
diff --git a/pl/math/v_cbrt_2u.c b/pl/math/v_cbrt_2u.c
new file mode 100644
index 0000000..b6e501c
--- /dev/null
+++ b/pl/math/v_cbrt_2u.c
@@ -0,0 +1,97 @@
+/*
+ * Double-precision vector cbrt(x) function.
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "mathlib.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if V_SUPPORTED
+
+#define AbsMask 0x7fffffffffffffff
+#define TwoThirds v_f64 (0x1.5555555555555p-1)
+#define TinyBound 0x001 /* top12 (smallest_normal).  */
+#define BigBound 0x7ff	/* top12 (infinity).  */
+#define MantissaMask v_u64 (0x000fffffffffffff)
+#define HalfExp v_u64 (0x3fe0000000000000)
+
+#define C(i) v_f64 (__cbrt_data.poly[i])
+#define T(i) v_lookup_f64 (__cbrt_data.table, i)
+
+static NOINLINE v_f64_t
+specialcase (v_f64_t x, v_f64_t y, v_u64_t special)
+{
+  return v_call_f64 (cbrt, x, y, special);
+}
+
+/* Approximation for double-precision vector cbrt(x), using low-order polynomial
+   and two Newton iterations. Greatest observed error is 1.79 ULP. Errors repeat
+   according to the exponent, for instance an error observed for double value
+   m * 2^e will be observed for any input m * 2^(e + 3*i), where i is an
+   integer.
+   __v_cbrt(0x1.fffff403f0bc6p+1) got 0x1.965fe72821e9bp+0
+				 want 0x1.965fe72821e99p+0.  */
+VPCS_ATTR v_f64_t V_NAME (cbrt) (v_f64_t x)
+{
+  v_u64_t ix = v_as_u64_f64 (x);
+  v_u64_t iax = ix & AbsMask;
+  v_u64_t ia12 = iax >> 52;
+
+  /* Subnormal, +/-0 and special values.  */
+  v_u64_t special = v_cond_u64 ((ia12 < TinyBound) | (ia12 >= BigBound));
+
+  /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector
+     version of frexp, which gets subnormal values wrong - these have to be
+     special-cased as a result.  */
+  v_f64_t m = v_as_f64_u64 (v_bsl_u64 (MantissaMask, iax, HalfExp));
+  v_s64_t e = v_as_s64_u64 (iax >> 52) - 1022;
+
+  /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point for
+     Newton iterations.  */
+  v_f64_t p_01 = v_fma_f64 (C (1), m, C (0));
+  v_f64_t p_23 = v_fma_f64 (C (3), m, C (2));
+  v_f64_t p = v_fma_f64 (m * m, p_23, p_01);
+
+  /* Two iterations of Newton's method for iteratively approximating cbrt.  */
+  v_f64_t m_by_3 = m / 3;
+  v_f64_t a = v_fma_f64 (TwoThirds, p, m_by_3 / (p * p));
+  a = v_fma_f64 (TwoThirds, a, m_by_3 / (a * a));
+
+  /* Assemble the result by the following:
+
+     cbrt(x) = cbrt(m) * 2 ^ (e / 3).
+
+     We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is
+     not necessarily a multiple of 3 we lose some information.
+
+     Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q.
+
+     Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which is
+     an integer in [-2, 2], and can be looked up in the table T. Hence the
+     result is assembled as:
+
+     cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign.  */
+
+  v_s64_t ey = e / 3;
+  v_f64_t my = a * T (v_as_u64_s64 (e % 3 + 2));
+
+  /* Vector version of ldexp.  */
+  v_f64_t y = v_as_f64_u64 ((v_as_u64_s64 (ey + 1023) << 52)) * my;
+  /* Copy sign.  */
+  y = v_as_f64_u64 (v_bsl_u64 (v_u64 (AbsMask), v_as_u64_f64 (y), ix));
+
+  if (unlikely (v_any_u64 (special)))
+    return specialcase (x, y, special);
+  return y;
+}
+VPCS_ALIAS
+
+PL_TEST_ULP (V_NAME (cbrt), 1.30)
+PL_SIG (V, D, 1, cbrt, -10.0, 10.0)
+PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (cbrt))
+PL_TEST_INTERVAL (V_NAME (cbrt), 0, inf, 1000000)
+PL_TEST_INTERVAL (V_NAME (cbrt), -0, -inf, 1000000)
+#endif
diff --git a/pl/math/vn_cbrt_2u.c b/pl/math/vn_cbrt_2u.c
new file mode 100644
index 0000000..ccaa085
--- /dev/null
+++ b/pl/math/vn_cbrt_2u.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_cbrt.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS PL_ALIAS (__vn_cbrt, _ZGVnN2v_cbrt)
+#include "v_cbrt_2u.c"
+#endif
-- 
cgit v1.2.3