aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJoe Ramsay <Joe.Ramsay@arm.com>2022-12-20 09:26:47 +0000
committerJoe Ramsay <joe.ramsay@arm.com>2022-12-20 09:26:47 +0000
commit04e91eca36b0a7dbbab78bf9401c978ab1b08b67 (patch)
tree8776137a2ff398663dc34f65ed6d842e57104b2e
parenta5fc3ed57ba4bc6df2e582f6a51c5fcc8e4459cd (diff)
downloadarm-optimized-routines-04e91eca36b0a7dbbab78bf9401c978ab1b08b67.tar.gz
pl/math: Add scalar & vector/Neon cbrt
New routines use the same algorithm, with simplified argument reduction and recombination in the vector variant. Both are accurate to 2 ULP.
-rw-r--r--pl/math/cbrt_2u.c70
-rw-r--r--pl/math/cbrt_data.c15
-rw-r--r--pl/math/include/mathlib.h5
-rw-r--r--pl/math/math_config.h6
-rw-r--r--pl/math/s_cbrt_2u.c6
-rw-r--r--pl/math/tools/cbrt.sollya20
-rw-r--r--pl/math/v_cbrt_2u.c97
-rw-r--r--pl/math/vn_cbrt_2u.c12
8 files changed, 231 insertions, 0 deletions
diff --git a/pl/math/cbrt_2u.c b/pl/math/cbrt_2u.c
new file mode 100644
index 0000000..f89dd87
--- /dev/null
+++ b/pl/math/cbrt_2u.c
@@ -0,0 +1,70 @@
+/*
+ * Double-precision cbrt(x) function.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+PL_SIG (S, D, 1, cbrt, -10.0, 10.0)
+
+#define AbsMask 0x7fffffffffffffff
+#define TwoThirds 0x1.5555555555555p-1
+
+#define C(i) __cbrt_data.poly[i]
+#define T(i) __cbrt_data.table[i]
+
+/* Approximation for double-precision cbrt(x), using low-order polynomial and
+ two Newton iterations. Greatest observed error is 1.79 ULP. Errors repeat
+ according to the exponent, for instance an error observed for double value
+ m * 2^e will be observed for any input m * 2^(e + 3*i), where i is an
+ integer.
+ cbrt(0x1.fffff403f0bc6p+1) got 0x1.965fe72821e9bp+0
+ want 0x1.965fe72821e99p+0. */
+double
+cbrt (double x)
+{
+ uint64_t ix = asuint64 (x);
+ uint64_t iax = ix & AbsMask;
+ uint64_t sign = ix & ~AbsMask;
+
+ if (unlikely (iax == 0 || iax == 0x7f80000000000000))
+ return x;
+
+ /* |x| = m * 2^e, where m is in [0.5, 1.0].
+ We can easily decompose x into m and e using frexp. */
+ int e;
+ double m = frexp (asdouble (iax), &e);
+
+ /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point for
+ Newton iterations. */
+ double p_01 = fma (C (1), m, C (0));
+ double p_23 = fma (C (3), m, C (2));
+ double p = fma (p_23, m * m, p_01);
+
+ /* Two iterations of Newton's method for iteratively approximating cbrt. */
+ double m_by_3 = m / 3;
+ double a = fma (TwoThirds, p, m_by_3 / (p * p));
+ a = fma (TwoThirds, a, m_by_3 / (a * a));
+
+ /* Assemble the result by the following:
+
+ cbrt(x) = cbrt(m) * 2 ^ (e / 3).
+
+ Let t = (2 ^ (e / 3)) / (2 ^ round(e / 3)).
+
+ Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3.
+ i is an integer in [-2, 2], so t can be looked up in the table T.
+ Hence the result is assembled as:
+
+ cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign.
+ Which can be done easily using ldexp. */
+ return asdouble (asuint64 (ldexp (a * T (2 + e % 3), e / 3)) | sign);
+}
+
+PL_TEST_ULP (cbrt, 1.30)
+PL_TEST_INTERVAL (cbrt, 0, inf, 1000000)
+PL_TEST_INTERVAL (cbrt, -0, -inf, 1000000)
diff --git a/pl/math/cbrt_data.c b/pl/math/cbrt_data.c
new file mode 100644
index 0000000..1c6ca73
--- /dev/null
+++ b/pl/math/cbrt_data.c
@@ -0,0 +1,15 @@
+/*
+ * Coefficients and table entries for double-precision cbrt(x).
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+const struct cbrt_data __cbrt_data
+ = {.poly = { /* Coefficients for very rough approximation of cbrt(x) in [0.5, 1].
+ See cbrt.sollya for details of generation. */
+ 0x1.c14e8ee44767p-2, 0x1.dd2d3f99e4c0ep-1, -0x1.08e83026b7e74p-1, 0x1.2c74eaa3ba428p-3},
+ .table = { /* table[i] = 2^((i - 2) / 3). */
+ 0x1.428a2f98d728bp-1, 0x1.965fea53d6e3dp-1, 0x1p0, 0x1.428a2f98d728bp0, 0x1.965fea53d6e3dp0}};
diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h
index 1266eb7..12d72fe 100644
--- a/pl/math/include/mathlib.h
+++ b/pl/math/include/mathlib.h
@@ -27,6 +27,7 @@ float tanhf (float);
double acosh (double);
double asinh (double);
double atan2 (double, double);
+double cbrt (double);
double cosh (double);
double erfc (double);
double expm1 (double);
@@ -53,6 +54,7 @@ float __s_tanhf (float);
double __s_asinh (double);
double __s_atan (double);
double __s_atan2 (double, double);
+double __s_cbrt (double);
double __s_cosh (double);
double __s_erf (double);
double __s_erfc (double);
@@ -82,6 +84,7 @@ __f32x4_t __v_atan2f (__f32x4_t, __f32x4_t);
__f64x2_t __v_atan2 (__f64x2_t, __f64x2_t);
__f32x4_t __v_atanhf (__f32x4_t);
__f32x4_t __v_cbrtf (__f32x4_t);
+__f64x2_t __v_cbrt (__f64x2_t);
__f32x4_t __v_coshf (__f32x4_t);
__f64x2_t __v_cosh (__f64x2_t);
__f32x4_t __v_erff (__f32x4_t);
@@ -113,6 +116,7 @@ __vpcs __f32x4_t __vn_atan2f (__f32x4_t, __f32x4_t);
__vpcs __f64x2_t __vn_atan2 (__f64x2_t, __f64x2_t);
__vpcs __f32x4_t __vn_atanhf (__f32x4_t);
__vpcs __f32x4_t __vn_cbrtf (__f32x4_t);
+__vpcs __f64x2_t __vn_cbrt (__f64x2_t);
__vpcs __f32x4_t __vn_coshf (__f32x4_t);
__vpcs __f64x2_t __vn_cosh (__f64x2_t);
__vpcs __f32x4_t __vn_erff (__f32x4_t);
@@ -141,6 +145,7 @@ __vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t);
__vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t);
__vpcs __f32x4_t _ZGVnN4v_atanhf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_cbrtf (__f32x4_t);
+__vpcs __f64x2_t _ZGVnN2v_cbrt (__f64x2_t);
__vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t);
__vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t);
__vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t);
diff --git a/pl/math/math_config.h b/pl/math/math_config.h
index 90d571c..92ccebf 100644
--- a/pl/math/math_config.h
+++ b/pl/math/math_config.h
@@ -558,4 +558,10 @@ extern const struct cbrtf_data
float table[5];
} __cbrtf_data HIDDEN;
+extern const struct cbrt_data
+{
+ double poly[4];
+ double table[5];
+} __cbrt_data HIDDEN;
+
#endif
diff --git a/pl/math/s_cbrt_2u.c b/pl/math/s_cbrt_2u.c
new file mode 100644
index 0000000..22f726b
--- /dev/null
+++ b/pl/math/s_cbrt_2u.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_cbrt_2u.c"
diff --git a/pl/math/tools/cbrt.sollya b/pl/math/tools/cbrt.sollya
new file mode 100644
index 0000000..7f179eb
--- /dev/null
+++ b/pl/math/tools/cbrt.sollya
@@ -0,0 +1,20 @@
+// polynomial for approximating cbrt(x) in double precision
+//
+// Copyright (c) 2022, Arm Limited.
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+deg = 3;
+
+a = 0.5;
+b = 1;
+
+
+f = x^(1/3);
+
+poly = fpminimax(f, deg, [|double ...|], [a;b]);
+
+display = hexadecimal;
+print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30));
+print("in [",a,b,"]");
+print("coeffs:");
+for i from 0 to deg do round(coeff(poly,i), D, RN);
diff --git a/pl/math/v_cbrt_2u.c b/pl/math/v_cbrt_2u.c
new file mode 100644
index 0000000..b6e501c
--- /dev/null
+++ b/pl/math/v_cbrt_2u.c
@@ -0,0 +1,97 @@
+/*
+ * Double-precision vector cbrt(x) function.
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "mathlib.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if V_SUPPORTED
+
+#define AbsMask 0x7fffffffffffffff
+#define TwoThirds v_f64 (0x1.5555555555555p-1)
+#define TinyBound 0x001 /* top12 (smallest_normal). */
+#define BigBound 0x7ff /* top12 (infinity). */
+#define MantissaMask v_u64 (0x000fffffffffffff)
+#define HalfExp v_u64 (0x3fe0000000000000)
+
+#define C(i) v_f64 (__cbrt_data.poly[i])
+#define T(i) v_lookup_f64 (__cbrt_data.table, i)
+
+static NOINLINE v_f64_t
+specialcase (v_f64_t x, v_f64_t y, v_u64_t special)
+{
+ return v_call_f64 (cbrt, x, y, special);
+}
+
+/* Approximation for double-precision vector cbrt(x), using low-order polynomial
+ and two Newton iterations. Greatest observed error is 1.79 ULP. Errors repeat
+ according to the exponent, for instance an error observed for double value
+ m * 2^e will be observed for any input m * 2^(e + 3*i), where i is an
+ integer.
+ __v_cbrt(0x1.fffff403f0bc6p+1) got 0x1.965fe72821e9bp+0
+ want 0x1.965fe72821e99p+0. */
+VPCS_ATTR v_f64_t V_NAME (cbrt) (v_f64_t x)
+{
+ v_u64_t ix = v_as_u64_f64 (x);
+ v_u64_t iax = ix & AbsMask;
+ v_u64_t ia12 = iax >> 52;
+
+ /* Subnormal, +/-0 and special values. */
+ v_u64_t special = v_cond_u64 ((ia12 < TinyBound) | (ia12 >= BigBound));
+
+ /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector
+ version of frexp, which gets subnormal values wrong - these have to be
+ special-cased as a result. */
+ v_f64_t m = v_as_f64_u64 (v_bsl_u64 (MantissaMask, iax, HalfExp));
+ v_s64_t e = v_as_s64_u64 (iax >> 52) - 1022;
+
+ /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point for
+ Newton iterations. */
+ v_f64_t p_01 = v_fma_f64 (C (1), m, C (0));
+ v_f64_t p_23 = v_fma_f64 (C (3), m, C (2));
+ v_f64_t p = v_fma_f64 (m * m, p_23, p_01);
+
+ /* Two iterations of Newton's method for iteratively approximating cbrt. */
+ v_f64_t m_by_3 = m / 3;
+ v_f64_t a = v_fma_f64 (TwoThirds, p, m_by_3 / (p * p));
+ a = v_fma_f64 (TwoThirds, a, m_by_3 / (a * a));
+
+ /* Assemble the result by the following:
+
+ cbrt(x) = cbrt(m) * 2 ^ (e / 3).
+
+ We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is
+ not necessarily a multiple of 3 we lose some information.
+
+ Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q.
+
+ Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which is
+ an integer in [-2, 2], and can be looked up in the table T. Hence the
+ result is assembled as:
+
+ cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */
+
+ v_s64_t ey = e / 3;
+ v_f64_t my = a * T (v_as_u64_s64 (e % 3 + 2));
+
+ /* Vector version of ldexp. */
+ v_f64_t y = v_as_f64_u64 ((v_as_u64_s64 (ey + 1023) << 52)) * my;
+ /* Copy sign. */
+ y = v_as_f64_u64 (v_bsl_u64 (v_u64 (AbsMask), v_as_u64_f64 (y), ix));
+
+ if (unlikely (v_any_u64 (special)))
+ return specialcase (x, y, special);
+ return y;
+}
+VPCS_ALIAS
+
+PL_TEST_ULP (V_NAME (cbrt), 1.30)
+PL_SIG (V, D, 1, cbrt, -10.0, 10.0)
+PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (cbrt))
+PL_TEST_INTERVAL (V_NAME (cbrt), 0, inf, 1000000)
+PL_TEST_INTERVAL (V_NAME (cbrt), -0, -inf, 1000000)
+#endif
diff --git a/pl/math/vn_cbrt_2u.c b/pl/math/vn_cbrt_2u.c
new file mode 100644
index 0000000..ccaa085
--- /dev/null
+++ b/pl/math/vn_cbrt_2u.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_cbrt.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS PL_ALIAS (__vn_cbrt, _ZGVnN2v_cbrt)
+#include "v_cbrt_2u.c"
+#endif