aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJoe Ramsay <Joe.Ramsay@arm.com>2022-11-22 13:40:13 +0000
committerJoe Ramsay <joe.ramsay@arm.com>2022-11-22 13:40:13 +0000
commit6e875e8b27fd103bb41590a580c8ee03ea5d7138 (patch)
tree4d75a7148f44b8dc9a1e1ea19b18a92a6c3acdf6
parentd5844d863d498cb1c43fde4ae3e47f3eeb5d0024 (diff)
downloadarm-optimized-routines-6e875e8b27fd103bb41590a580c8ee03ea5d7138.tar.gz
pl/math: Add scalar & vector/Neon cbrtf
Both routines use the same algorithm - one Newton iteration with the initial guess obtained by a low-order polynomial. Scalar is used as a fallback for subnormal and special cases for the vector routine, which allows vastly simplified argument reduction and reassembly. Both routines accurate to 1.5 ULP.
-rw-r--r--pl/math/cbrtf_1u5.c64
-rw-r--r--pl/math/cbrtf_data.c15
-rw-r--r--pl/math/include/mathlib.h5
-rw-r--r--pl/math/math_config.h6
-rw-r--r--pl/math/s_cbrtf_1u5.c6
-rw-r--r--pl/math/test/mathbench_funcs.h2
-rwxr-xr-xpl/math/test/runulp.sh14
-rw-r--r--pl/math/test/testcases/directed/cbrtf.tst29
-rw-r--r--pl/math/test/ulp_funcs.h2
-rw-r--r--pl/math/test/ulp_wrappers.h1
-rw-r--r--pl/math/tools/cbrtf.sollya20
-rw-r--r--pl/math/v_cbrtf_1u5.c88
-rw-r--r--pl/math/vn_cbrtf_1u5.c12
13 files changed, 264 insertions, 0 deletions
diff --git a/pl/math/cbrtf_1u5.c b/pl/math/cbrtf_1u5.c
new file mode 100644
index 0000000..73b9049
--- /dev/null
+++ b/pl/math/cbrtf_1u5.c
@@ -0,0 +1,64 @@
+/*
+ * Single-precision cbrt(x) function.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include <math.h>
+
+#include "math_config.h"
+
+#define AbsMask 0x7fffffff
+#define SignMask 0x80000000
+#define TwoThirds 0x1.555556p-1f
+
+#define C(i) __cbrtf_data.poly[i]
+#define T(i) __cbrtf_data.table[i]
+
+/* Approximation for single-precision cbrt(x), using low-order polynomial and
+ one Newton iteration on a reduced interval. Greatest error is 1.5 ULP. This
+ is observed for every value where the mantissa is 0x1.81410e and the exponent
+ is a multiple of 3, for example:
+ cbrtf(0x1.81410ep+30) got 0x1.255d96p+10
+ want 0x1.255d92p+10. */
+float
+cbrtf (float x)
+{
+ uint32_t ix = asuint (x);
+ uint32_t iax = ix & AbsMask;
+ uint32_t sign = ix & SignMask;
+
+ if (unlikely (iax == 0 || iax == 0x7f800000))
+ return x;
+
+ /* |x| = m * 2^e, where m is in [0.5, 1.0].
+ We can easily decompose x into m and e using frexpf. */
+ int e;
+ float m = frexpf (asfloat (iax), &e);
+
+ /* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is,
+ the less accurate the next stage of the algorithm needs to be. An order-4
+ polynomial is enough for one Newton iteration. */
+ float p_01 = fmaf (C (1), m, C (0));
+ float p_23 = fmaf (C (3), m, C (2));
+ float p = fmaf (m * m, p_23, p_01);
+
+ /* One iteration of Newton's method for iteratively approximating cbrt. */
+ float m_by_3 = m / 3;
+ float a = fmaf (TwoThirds, p, m_by_3 / (p * p));
+
+ /* Assemble the result by the following:
+
+ cbrt(x) = cbrt(m) * 2 ^ (e / 3).
+
+ Let t = (2 ^ (e / 3)) / (2 ^ round(e / 3)).
+
+ Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3.
+ i is an integer in [-2, 2], so t can be looked up in the table T.
+ Hence the result is assembled as:
+
+ cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign.
+ Which can be done easily using ldexpf. */
+ return asfloat (asuint (ldexpf (a * T (2 + e % 3), e / 3)) | sign);
+}
diff --git a/pl/math/cbrtf_data.c b/pl/math/cbrtf_data.c
new file mode 100644
index 0000000..386a2b4
--- /dev/null
+++ b/pl/math/cbrtf_data.c
@@ -0,0 +1,15 @@
+/*
+ * Coefficients and table entries for single-precision cbrt(x).
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+const struct cbrtf_data __cbrtf_data
+ = {.poly = { /* Coefficients for very rough approximation of cbrt(x) in [0.5, 1].
+ See cbrtf.sollya for details of generation. */
+ 0x1.c14e96p-2, 0x1.dd2d3p-1, -0x1.08e81ap-1, 0x1.2c74c2p-3},
+ .table = { /* table[i] = 2^((i - 2) / 3). */
+ 0x1.428a3p-1, 0x1.965feap-1, 0x1p0, 0x1.428a3p0, 0x1.965feap0}};
diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h
index f9faf2f..7bec5e1 100644
--- a/pl/math/include/mathlib.h
+++ b/pl/math/include/mathlib.h
@@ -13,6 +13,7 @@ float acoshf (float);
float asinhf (float);
float atan2f (float, float);
float atanhf (float);
+float cbrtf (float);
float coshf (float);
float erfcf (float);
float erff (float);
@@ -36,6 +37,7 @@ float __s_asinhf (float);
float __s_atanf (float);
float __s_atan2f (float, float);
float __s_atanhf (float);
+float __s_cbrtf (float);
float __s_coshf (float);
float __s_erfcf (float);
float __s_erff (float);
@@ -75,6 +77,7 @@ __f64x2_t __v_atan (__f64x2_t);
__f32x4_t __v_atan2f (__f32x4_t, __f32x4_t);
__f64x2_t __v_atan2 (__f64x2_t, __f64x2_t);
__f32x4_t __v_atanhf (__f32x4_t);
+__f32x4_t __v_cbrtf (__f32x4_t);
__f32x4_t __v_coshf (__f32x4_t);
__f64x2_t __v_cosh (__f64x2_t);
__f32x4_t __v_erff (__f32x4_t);
@@ -103,6 +106,7 @@ __vpcs __f64x2_t __vn_atan (__f64x2_t);
__vpcs __f32x4_t __vn_atan2f (__f32x4_t, __f32x4_t);
__vpcs __f64x2_t __vn_atan2 (__f64x2_t, __f64x2_t);
__vpcs __f32x4_t __vn_atanhf (__f32x4_t);
+__vpcs __f32x4_t __vn_cbrtf (__f32x4_t);
__vpcs __f32x4_t __vn_coshf (__f32x4_t);
__vpcs __f64x2_t __vn_cosh (__f64x2_t);
__vpcs __f32x4_t __vn_erff (__f32x4_t);
@@ -128,6 +132,7 @@ __vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t);
__vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t);
__vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t);
__vpcs __f32x4_t _ZGVnN4v_atanhf (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_cbrtf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t);
__vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t);
__vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t);
diff --git a/pl/math/math_config.h b/pl/math/math_config.h
index 7472395..99132a0 100644
--- a/pl/math/math_config.h
+++ b/pl/math/math_config.h
@@ -553,4 +553,10 @@ extern const struct expf_data
#define EXPM1_POLY_ORDER 11
extern const double __expm1_poly[EXPM1_POLY_ORDER] HIDDEN;
+extern const struct cbrtf_data
+{
+ float poly[4];
+ float table[5];
+} __cbrtf_data HIDDEN;
+
#endif
diff --git a/pl/math/s_cbrtf_1u5.c b/pl/math/s_cbrtf_1u5.c
new file mode 100644
index 0000000..d60508e
--- /dev/null
+++ b/pl/math/s_cbrtf_1u5.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_cbrtf_1u5.c"
diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h
index de009a3..42dc292 100644
--- a/pl/math/test/mathbench_funcs.h
+++ b/pl/math/test/mathbench_funcs.h
@@ -31,6 +31,7 @@ F (asinhf, -10.0, 10.0)
F (atanf, -10.0, 10.0)
{"atan2f", 'f', 0, -10.0, 10.0, {.f = atan2f_wrap}},
F (atanhf, -1.0, 1.0)
+F (cbrtf, -10.0, 10.0)
F (cosf, -3.1, 3.1)
F (coshf, -10.0, 10.0)
F (erfcf, -4.0, 10.0)
@@ -64,6 +65,7 @@ ZVNF (asinhf, -10.0, 10.0)
ZVNF (atanf, -10.0, 10.0)
ZVNF (atanhf, -1.0, 1.0)
ZVND (atan, -10.0, 10.0)
+ZVNF (cbrtf, -10.0, 10.0)
ZVNF (coshf, -10.0, 10.0)
ZVND (cosh, -10.0, 10.0)
ZVNF (erff, -4.0, 4.0)
diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh
index 53b4a43..6410dd9 100755
--- a/pl/math/test/runulp.sh
+++ b/pl/math/test/runulp.sh
@@ -211,6 +211,10 @@ t atanhf -0 -0x1p-12 500
t atanhf -0x1p-12 -1 200000
t atanhf -1 -inf 1000
+L=1.03
+t cbrtf 0 inf 1000000
+t cbrtf -0 -inf 1000000
+
done
# vector functions
@@ -441,6 +445,11 @@ range_atanhf='
-1 -inf 1000
'
+range_cbrtf='
+ 0 inf 1000000
+ -0 -inf 1000000
+'
+
range_sve_cosf='
0 0xffff0000 10000
0x1p-4 0x1p4 500000
@@ -607,6 +616,7 @@ L_expm1=1.68
L_sinh=2.08
L_cosh=1.43
L_atanhf=2.59
+L_cbrtf=1.03
L_sve_cosf=1.57
L_sve_cos=1.61
@@ -758,6 +768,10 @@ atanhf __s_atanhf $runs fenv -c 0
atanhf __v_atanhf $runv fenv -c 0
atanhf __vn_atanhf $runvn fenv -c 0
atanhf _ZGVnN4v_atanhf $runvn fenv -c 0
+cbrtf __s_cbrtf $runs fenv
+cbrtf __v_cbrtf $runv fenv
+cbrtf __vn_cbrtf $runvn fenv
+cbrtf _ZGVnN4v_cbrtf $runvn fenv
sve_cosf __sv_cosf $runsv
sve_cosf _ZGVsMxv_cosf $runsv
diff --git a/pl/math/test/testcases/directed/cbrtf.tst b/pl/math/test/testcases/directed/cbrtf.tst
new file mode 100644
index 0000000..5f8b97f
--- /dev/null
+++ b/pl/math/test/testcases/directed/cbrtf.tst
@@ -0,0 +1,29 @@
+; cbrtf.tst
+;
+; Copyright 2009-2022, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=cbrtf op1=7f800000 result=7f800000 errno=0
+func=cbrtf op1=ff800000 result=ff800000 errno=0
+func=cbrtf op1=7f800001 result=7fc00001 errno=0 status=i
+func=cbrtf op1=7fc00001 result=7fc00001 errno=0
+func=cbrtf op1=00000000 result=00000000 errno=0
+func=cbrtf op1=00000001 result=26a14517.cc7 errno=0
+func=cbrtf op1=00000002 result=26cb2ff5.29f errno=0
+func=cbrtf op1=00000003 result=26e89768.579 errno=0
+func=cbrtf op1=00000004 result=27000000.000 errno=0
+func=cbrtf op1=00400000 result=2a4b2ff5.29f errno=0
+func=cbrtf op1=00800000 result=2a800000.000 errno=0
+func=cbrtf op1=3f800000 result=3f800000.000 errno=0
+func=cbrtf op1=40000000 result=3fa14517.cc7 errno=0
+func=cbrtf op1=7f7fffff result=54cb2ff4.e63 errno=0
+func=cbrtf op1=80000000 result=80000000 errno=0
+func=cbrtf op1=80000001 result=a6a14517.cc7 errno=0
+func=cbrtf op1=80000002 result=a6cb2ff5.29f errno=0
+func=cbrtf op1=80000003 result=a6e89768.579 errno=0
+func=cbrtf op1=80000004 result=a7000000.000 errno=0
+func=cbrtf op1=80400000 result=aa4b2ff5.29f errno=0
+func=cbrtf op1=80800000 result=aa800000.000 errno=0
+func=cbrtf op1=bf800000 result=bf800000.000 errno=0
+func=cbrtf op1=c0000000 result=bfa14517.cc7 errno=0
+func=cbrtf op1=ff7fffff result=d4cb2ff4.e63 errno=0
diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h
index 84f4182..8e41ccf 100644
--- a/pl/math/test/ulp_funcs.h
+++ b/pl/math/test/ulp_funcs.h
@@ -37,6 +37,7 @@ F1 (acosh)
F1 (asinh)
F2 (atan2)
F1 (atanh)
+F1 (cbrt)
F1 (cosh)
F1 (erfc)
F1 (erf)
@@ -61,6 +62,7 @@ _ZVND1 (atan)
_ZVNF2 (atan2)
_ZVND2 (atan2)
_ZVNF1 (atanh)
+_ZVNF1 (cbrt)
_ZVNF1 (cosh)
_ZVND1 (cosh)
_ZVNF1 (erf)
diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h
index 0ee07e9..fec18a7 100644
--- a/pl/math/test/ulp_wrappers.h
+++ b/pl/math/test/ulp_wrappers.h
@@ -119,6 +119,7 @@ ZVNF1_WRAP(asinh)
ZVNF1_WRAP(atan)
ZVNF2_WRAP(atan2)
ZVNF1_WRAP(atanh)
+ZVNF1_WRAP(cbrt)
ZVNF1_WRAP(cosh)
ZVNF1_WRAP(erf)
ZVNF1_WRAP(erfc)
diff --git a/pl/math/tools/cbrtf.sollya b/pl/math/tools/cbrtf.sollya
new file mode 100644
index 0000000..9cd1259
--- /dev/null
+++ b/pl/math/tools/cbrtf.sollya
@@ -0,0 +1,20 @@
+// polynomial for approximating cbrt(x) in single precision
+//
+// Copyright (c) 2022, Arm Limited.
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+deg = 3;
+
+a = 0.5;
+b = 1;
+
+
+f = x^(1/3);
+
+poly = fpminimax(f, deg, [|single ...|], [a;b]);
+
+display = hexadecimal;
+print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30));
+print("in [",a,b,"]");
+print("coeffs:");
+for i from 0 to deg do round(coeff(poly,i), SG, RN);
diff --git a/pl/math/v_cbrtf_1u5.c b/pl/math/v_cbrtf_1u5.c
new file mode 100644
index 0000000..fd43051
--- /dev/null
+++ b/pl/math/v_cbrtf_1u5.c
@@ -0,0 +1,88 @@
+/*
+ * Single-precision vector cbrt(x) function.
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "mathlib.h"
+
+#if V_SUPPORTED
+
+#define AbsMask 0x7fffffff
+#define SignMask v_u32 (0x80000000)
+#define TwoThirds v_f32 (0x1.555556p-1f)
+#define SmallestNormal 0x00800000
+#define MantissaMask 0x007fffff
+#define HalfExp 0x3f000000
+
+#define C(i) v_f32 (__cbrtf_data.poly[i])
+#define T(i) v_lookup_f32 (__cbrtf_data.table, i)
+
+static NOINLINE v_f32_t
+specialcase (v_f32_t x, v_f32_t y, v_u32_t special)
+{
+ return v_call_f32 (cbrtf, x, y, special);
+}
+
+/* Approximation for vector single-precision cbrt(x) using Newton iteration with
+ initial guess obtained by a low-order polynomial. Greatest error is 1.5 ULP.
+ This is observed for every value where the mantissa is 0x1.81410e and the
+ exponent is a multiple of 3, for example:
+ __v_cbrtf(0x1.81410ep+30) got 0x1.255d96p+10
+ want 0x1.255d92p+10. */
+VPCS_ATTR v_f32_t V_NAME (cbrtf) (v_f32_t x)
+{
+ v_u32_t ix = v_as_u32_f32 (x);
+ v_u32_t iax = ix & AbsMask;
+
+ /* Subnormal, +/-0 and special values. */
+ v_u32_t special = v_cond_u32 ((iax < SmallestNormal) | (iax >= 0x7f800000));
+
+ /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector
+ version of frexpf, which gets subnormal values wrong - these have to be
+ special-cased as a result. */
+ v_f32_t m = v_as_f32_u32 ((iax & MantissaMask) | HalfExp);
+ v_s32_t e = v_as_s32_u32 (iax >> 23) - 126;
+
+ /* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is,
+ the less accurate the next stage of the algorithm needs to be. An order-4
+ polynomial is enough for one Newton iteration. */
+ v_f32_t p_01 = v_fma_f32 (C (1), m, C (0));
+ v_f32_t p_23 = v_fma_f32 (C (3), m, C (2));
+ v_f32_t p = v_fma_f32 (m * m, p_23, p_01);
+
+ /* One iteration of Newton's method for iteratively approximating cbrt. */
+ v_f32_t m_by_3 = m / 3;
+ v_f32_t a = v_fma_f32 (TwoThirds, p, m_by_3 / (p * p));
+
+ /* Assemble the result by the following:
+
+ cbrt(x) = cbrt(m) * 2 ^ (e / 3).
+
+ We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is
+ not necessarily a multiple of 3 we lose some information.
+
+ Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q.
+
+ Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which is
+ an integer in [-2, 2], and can be looked up in the table T. Hence the
+ result is assembled as:
+
+ cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */
+
+ v_s32_t ey = e / 3;
+ v_f32_t my = a * T (v_as_u32_s32 (e % 3 + 2));
+
+ /* Vector version of ldexpf. */
+ v_f32_t y = v_as_f32_u32 ((v_as_u32_s32 (ey + 127) << 23)) * my;
+ /* Copy sign. */
+ y = v_as_f32_u32 (v_bsl_u32 (SignMask, ix, v_as_u32_f32 (y)));
+
+ if (unlikely (v_any_u32 (special)))
+ return specialcase (x, y, special);
+ return y;
+}
+VPCS_ALIAS
+
+#endif
diff --git a/pl/math/vn_cbrtf_1u5.c b/pl/math/vn_cbrtf_1u5.c
new file mode 100644
index 0000000..3452807
--- /dev/null
+++ b/pl/math/vn_cbrtf_1u5.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_cbrtf.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS strong_alias (__vn_cbrtf, _ZGVnN4v_cbrtf)
+#include "v_cbrtf_1u5.c"
+#endif