pl/math: Add scalar & vector/Neon cbrtf

Both routines use the same algorithm - one Newton iteration with the initial guess obtained by a low-order polynomial. Scalar is used as a fallback for subnormal and special cases for the vector routine, which allows vastly simplified argument reduction and reassembly. Both routines accurate to 1.5 ULP.
author: Joe Ramsay <Joe.Ramsay@arm.com> 2022-11-22 13:40:13 +0000
committer: Joe Ramsay <joe.ramsay@arm.com> 2022-11-22 13:40:13 +0000
commit: 6e875e8b27fd103bb41590a580c8ee03ea5d7138 (patch)
tree: 4d75a7148f44b8dc9a1e1ea19b18a92a6c3acdf6
parent: d5844d863d498cb1c43fde4ae3e47f3eeb5d0024 (diff)
download: arm-optimized-routines-6e875e8b27fd103bb41590a580c8ee03ea5d7138.tar.gz
13 files changed, 264 insertions, 0 deletions
diff --git a/pl/math/cbrtf_1u5.c b/pl/math/cbrtf_1u5.c
new file mode 100644
index 0000000..73b9049
--- /dev/null
+++ b/pl/math/cbrtf_1u5.c
@@ -0,0 +1,64 @@
+/*
+ * Single-precision cbrt(x) function.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include <math.h>
+
+#include "math_config.h"
+
+#define AbsMask 0x7fffffff
+#define SignMask 0x80000000
+#define TwoThirds 0x1.555556p-1f
+
+#define C(i) __cbrtf_data.poly[i]
+#define T(i) __cbrtf_data.table[i]
+
+/* Approximation for single-precision cbrt(x), using low-order polynomial and
+   one Newton iteration on a reduced interval. Greatest error is 1.5 ULP. This
+   is observed for every value where the mantissa is 0x1.81410e and the exponent
+   is a multiple of 3, for example:
+   cbrtf(0x1.81410ep+30) got 0x1.255d96p+10
+			want 0x1.255d92p+10.  */
+float
+cbrtf (float x)
+{
+  uint32_t ix = asuint (x);
+  uint32_t iax = ix & AbsMask;
+  uint32_t sign = ix & SignMask;
+
+  if (unlikely (iax == 0 || iax == 0x7f800000))
+    return x;
+
+  /* |x| = m * 2^e, where m is in [0.5, 1.0].
+     We can easily decompose x into m and e using frexpf.  */
+  int e;
+  float m = frexpf (asfloat (iax), &e);
+
+  /* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is,
+     the less accurate the next stage of the algorithm needs to be. An order-4
+     polynomial is enough for one Newton iteration.  */
+  float p_01 = fmaf (C (1), m, C (0));
+  float p_23 = fmaf (C (3), m, C (2));
+  float p = fmaf (m * m, p_23, p_01);
+
+  /* One iteration of Newton's method for iteratively approximating cbrt.  */
+  float m_by_3 = m / 3;
+  float a = fmaf (TwoThirds, p, m_by_3 / (p * p));
+
+  /* Assemble the result by the following:
+
+     cbrt(x) = cbrt(m) * 2 ^ (e / 3).
+
+     Let t = (2 ^ (e / 3)) / (2 ^ round(e / 3)).
+
+     Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3.
+     i is an integer in [-2, 2], so t can be looked up in the table T.
+     Hence the result is assembled as:
+
+     cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign.
+     Which can be done easily using ldexpf.  */
+  return asfloat (asuint (ldexpf (a * T (2 + e % 3), e / 3)) | sign);
+}
diff --git a/pl/math/cbrtf_data.c b/pl/math/cbrtf_data.c
new file mode 100644
index 0000000..386a2b4
--- /dev/null
+++ b/pl/math/cbrtf_data.c
@@ -0,0 +1,15 @@
+/*
+ * Coefficients and table entries for single-precision cbrt(x).
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+const struct cbrtf_data __cbrtf_data
+  = {.poly = { /* Coefficients for very rough approximation of cbrt(x) in [0.5, 1].
+                  See cbrtf.sollya for details of generation.  */
+	        0x1.c14e96p-2, 0x1.dd2d3p-1, -0x1.08e81ap-1, 0x1.2c74c2p-3},
+     .table = { /* table[i] = 2^((i - 2) / 3).  */
+	        0x1.428a3p-1, 0x1.965feap-1, 0x1p0, 0x1.428a3p0, 0x1.965feap0}};
diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h
index f9faf2f..7bec5e1 100644
--- a/pl/math/include/mathlib.h
+++ b/pl/math/include/mathlib.h
@@ -13,6 +13,7 @@ float acoshf (float);
 float asinhf (float);
 float atan2f (float, float);
 float atanhf (float);
+float cbrtf (float);
 float coshf (float);
 float erfcf (float);
 float erff (float);
@@ -36,6 +37,7 @@ float __s_asinhf (float);
 float __s_atanf (float);
 float __s_atan2f (float, float);
 float __s_atanhf (float);
+float __s_cbrtf (float);
 float __s_coshf (float);
 float __s_erfcf (float);
 float __s_erff (float);
@@ -75,6 +77,7 @@ __f64x2_t __v_atan (__f64x2_t);
 __f32x4_t __v_atan2f (__f32x4_t, __f32x4_t);
 __f64x2_t __v_atan2 (__f64x2_t, __f64x2_t);
 __f32x4_t __v_atanhf (__f32x4_t);
+__f32x4_t __v_cbrtf (__f32x4_t);
 __f32x4_t __v_coshf (__f32x4_t);
 __f64x2_t __v_cosh (__f64x2_t);
 __f32x4_t __v_erff (__f32x4_t);
@@ -103,6 +106,7 @@ __vpcs __f64x2_t __vn_atan (__f64x2_t);
 __vpcs __f32x4_t __vn_atan2f (__f32x4_t, __f32x4_t);
 __vpcs __f64x2_t __vn_atan2 (__f64x2_t, __f64x2_t);
 __vpcs __f32x4_t __vn_atanhf (__f32x4_t);
+__vpcs __f32x4_t __vn_cbrtf (__f32x4_t);
 __vpcs __f32x4_t __vn_coshf (__f32x4_t);
 __vpcs __f64x2_t __vn_cosh (__f64x2_t);
 __vpcs __f32x4_t __vn_erff (__f32x4_t);
@@ -128,6 +132,7 @@ __vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t);
 __vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t);
 __vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t);
 __vpcs __f32x4_t _ZGVnN4v_atanhf (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_cbrtf (__f32x4_t);
 __vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t);
 __vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t);
 __vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t);
diff --git a/pl/math/math_config.h b/pl/math/math_config.h
index 7472395..99132a0 100644
--- a/pl/math/math_config.h
+++ b/pl/math/math_config.h
@@ -553,4 +553,10 @@ extern const struct expf_data
 #define EXPM1_POLY_ORDER 11
 extern const double __expm1_poly[EXPM1_POLY_ORDER] HIDDEN;
 
+extern const struct cbrtf_data
+{
+  float poly[4];
+  float table[5];
+} __cbrtf_data HIDDEN;
+
 #endif
diff --git a/pl/math/s_cbrtf_1u5.c b/pl/math/s_cbrtf_1u5.c
new file mode 100644
index 0000000..d60508e
--- /dev/null
+++ b/pl/math/s_cbrtf_1u5.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_cbrtf_1u5.c"
diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h
index de009a3..42dc292 100644
--- a/pl/math/test/mathbench_funcs.h
+++ b/pl/math/test/mathbench_funcs.h
@@ -31,6 +31,7 @@ F (asinhf, -10.0, 10.0)
 F (atanf, -10.0, 10.0)
 {"atan2f", 'f', 0, -10.0, 10.0, {.f = atan2f_wrap}},
 F (atanhf, -1.0, 1.0)
+F (cbrtf, -10.0, 10.0)
 F (cosf, -3.1, 3.1)
 F (coshf, -10.0, 10.0)
 F (erfcf, -4.0, 10.0)
@@ -64,6 +65,7 @@ ZVNF (asinhf, -10.0, 10.0)
 ZVNF (atanf, -10.0, 10.0)
 ZVNF (atanhf, -1.0, 1.0)
 ZVND (atan, -10.0, 10.0)
+ZVNF (cbrtf, -10.0, 10.0)
 ZVNF (coshf, -10.0, 10.0)
 ZVND (cosh, -10.0, 10.0)
 ZVNF (erff, -4.0, 4.0)
diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh
index 53b4a43..6410dd9 100755
--- a/pl/math/test/runulp.sh
+++ b/pl/math/test/runulp.sh
@@ -211,6 +211,10 @@ t atanhf -0       -0x1p-12 500
 t atanhf -0x1p-12 -1       200000
 t atanhf -1       -inf     1000
 
+L=1.03
+t cbrtf  0  inf 1000000
+t cbrtf -0 -inf 1000000
+
 done
 
 # vector functions
@@ -441,6 +445,11 @@ range_atanhf='
  -1       -inf     1000
 '
 
+range_cbrtf='
+  0  inf 1000000
+ -0 -inf 1000000
+'
+
 range_sve_cosf='
  0    0xffff0000    10000
  0x1p-4    0x1p4    500000
@@ -607,6 +616,7 @@ L_expm1=1.68
 L_sinh=2.08
 L_cosh=1.43
 L_atanhf=2.59
+L_cbrtf=1.03
 
 L_sve_cosf=1.57
 L_sve_cos=1.61
@@ -758,6 +768,10 @@ atanhf __s_atanhf      $runs    fenv -c 0
 atanhf __v_atanhf      $runv    fenv -c 0
 atanhf __vn_atanhf     $runvn   fenv -c 0
 atanhf _ZGVnN4v_atanhf $runvn   fenv -c 0
+cbrtf  __s_cbrtf       $runs    fenv
+cbrtf  __v_cbrtf       $runv    fenv
+cbrtf  __vn_cbrtf      $runvn   fenv
+cbrtf  _ZGVnN4v_cbrtf  $runvn   fenv
 
 sve_cosf     __sv_cosf         $runsv
 sve_cosf     _ZGVsMxv_cosf     $runsv
diff --git a/pl/math/test/testcases/directed/cbrtf.tst b/pl/math/test/testcases/directed/cbrtf.tst
new file mode 100644
index 0000000..5f8b97f
--- /dev/null
+++ b/pl/math/test/testcases/directed/cbrtf.tst
@@ -0,0 +1,29 @@
+; cbrtf.tst
+;
+; Copyright 2009-2022, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=cbrtf op1=7f800000 result=7f800000 errno=0
+func=cbrtf op1=ff800000 result=ff800000 errno=0
+func=cbrtf op1=7f800001 result=7fc00001 errno=0 status=i
+func=cbrtf op1=7fc00001 result=7fc00001 errno=0
+func=cbrtf op1=00000000 result=00000000 errno=0
+func=cbrtf op1=00000001 result=26a14517.cc7 errno=0
+func=cbrtf op1=00000002 result=26cb2ff5.29f errno=0
+func=cbrtf op1=00000003 result=26e89768.579 errno=0
+func=cbrtf op1=00000004 result=27000000.000 errno=0
+func=cbrtf op1=00400000 result=2a4b2ff5.29f errno=0
+func=cbrtf op1=00800000 result=2a800000.000 errno=0
+func=cbrtf op1=3f800000 result=3f800000.000 errno=0
+func=cbrtf op1=40000000 result=3fa14517.cc7 errno=0
+func=cbrtf op1=7f7fffff result=54cb2ff4.e63 errno=0
+func=cbrtf op1=80000000 result=80000000 errno=0
+func=cbrtf op1=80000001 result=a6a14517.cc7 errno=0
+func=cbrtf op1=80000002 result=a6cb2ff5.29f errno=0
+func=cbrtf op1=80000003 result=a6e89768.579 errno=0
+func=cbrtf op1=80000004 result=a7000000.000 errno=0
+func=cbrtf op1=80400000 result=aa4b2ff5.29f errno=0
+func=cbrtf op1=80800000 result=aa800000.000 errno=0
+func=cbrtf op1=bf800000 result=bf800000.000 errno=0
+func=cbrtf op1=c0000000 result=bfa14517.cc7 errno=0
+func=cbrtf op1=ff7fffff result=d4cb2ff4.e63 errno=0
diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h
index 84f4182..8e41ccf 100644
--- a/pl/math/test/ulp_funcs.h
+++ b/pl/math/test/ulp_funcs.h
@@ -37,6 +37,7 @@ F1 (acosh)
 F1 (asinh)
 F2 (atan2)
 F1 (atanh)
+F1 (cbrt)
 F1 (cosh)
 F1 (erfc)
 F1 (erf)
@@ -61,6 +62,7 @@ _ZVND1 (atan)
 _ZVNF2 (atan2)
 _ZVND2 (atan2)
 _ZVNF1 (atanh)
+_ZVNF1 (cbrt)
 _ZVNF1 (cosh)
 _ZVND1 (cosh)
 _ZVNF1 (erf)
diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h
index 0ee07e9..fec18a7 100644
--- a/pl/math/test/ulp_wrappers.h
+++ b/pl/math/test/ulp_wrappers.h
@@ -119,6 +119,7 @@ ZVNF1_WRAP(asinh)
 ZVNF1_WRAP(atan)
 ZVNF2_WRAP(atan2)
 ZVNF1_WRAP(atanh)
+ZVNF1_WRAP(cbrt)
 ZVNF1_WRAP(cosh)
 ZVNF1_WRAP(erf)
 ZVNF1_WRAP(erfc)
diff --git a/pl/math/tools/cbrtf.sollya b/pl/math/tools/cbrtf.sollya
new file mode 100644
index 0000000..9cd1259
--- /dev/null
+++ b/pl/math/tools/cbrtf.sollya
@@ -0,0 +1,20 @@
+// polynomial for approximating cbrt(x) in single precision
+//
+// Copyright (c) 2022, Arm Limited.
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+deg = 3;
+
+a = 0.5;
+b = 1;
+
+
+f = x^(1/3);
+
+poly = fpminimax(f, deg, [|single ...|], [a;b]);
+
+display = hexadecimal;
+print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30));
+print("in [",a,b,"]");
+print("coeffs:");
+for i from 0 to deg do round(coeff(poly,i), SG, RN);
diff --git a/pl/math/v_cbrtf_1u5.c b/pl/math/v_cbrtf_1u5.c
new file mode 100644
index 0000000..fd43051
--- /dev/null
+++ b/pl/math/v_cbrtf_1u5.c
@@ -0,0 +1,88 @@
+/*
+ * Single-precision vector cbrt(x) function.
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "mathlib.h"
+
+#if V_SUPPORTED
+
+#define AbsMask 0x7fffffff
+#define SignMask v_u32 (0x80000000)
+#define TwoThirds v_f32 (0x1.555556p-1f)
+#define SmallestNormal 0x00800000
+#define MantissaMask 0x007fffff
+#define HalfExp 0x3f000000
+
+#define C(i) v_f32 (__cbrtf_data.poly[i])
+#define T(i) v_lookup_f32 (__cbrtf_data.table, i)
+
+static NOINLINE v_f32_t
+specialcase (v_f32_t x, v_f32_t y, v_u32_t special)
+{
+  return v_call_f32 (cbrtf, x, y, special);
+}
+
+/* Approximation for vector single-precision cbrt(x) using Newton iteration with
+   initial guess obtained by a low-order polynomial. Greatest error is 1.5 ULP.
+   This is observed for every value where the mantissa is 0x1.81410e and the
+   exponent is a multiple of 3, for example:
+   __v_cbrtf(0x1.81410ep+30) got 0x1.255d96p+10
+			    want 0x1.255d92p+10.  */
+VPCS_ATTR v_f32_t V_NAME (cbrtf) (v_f32_t x)
+{
+  v_u32_t ix = v_as_u32_f32 (x);
+  v_u32_t iax = ix & AbsMask;
+
+  /* Subnormal, +/-0 and special values.  */
+  v_u32_t special = v_cond_u32 ((iax < SmallestNormal) | (iax >= 0x7f800000));
+
+  /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector
+     version of frexpf, which gets subnormal values wrong - these have to be
+     special-cased as a result.  */
+  v_f32_t m = v_as_f32_u32 ((iax & MantissaMask) | HalfExp);
+  v_s32_t e = v_as_s32_u32 (iax >> 23) - 126;
+
+  /* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is,
+     the less accurate the next stage of the algorithm needs to be. An order-4
+     polynomial is enough for one Newton iteration.  */
+  v_f32_t p_01 = v_fma_f32 (C (1), m, C (0));
+  v_f32_t p_23 = v_fma_f32 (C (3), m, C (2));
+  v_f32_t p = v_fma_f32 (m * m, p_23, p_01);
+
+  /* One iteration of Newton's method for iteratively approximating cbrt.  */
+  v_f32_t m_by_3 = m / 3;
+  v_f32_t a = v_fma_f32 (TwoThirds, p, m_by_3 / (p * p));
+
+  /* Assemble the result by the following:
+
+     cbrt(x) = cbrt(m) * 2 ^ (e / 3).
+
+     We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is
+     not necessarily a multiple of 3 we lose some information.
+
+     Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q.
+
+     Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which is
+     an integer in [-2, 2], and can be looked up in the table T. Hence the
+     result is assembled as:
+
+     cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign.  */
+
+  v_s32_t ey = e / 3;
+  v_f32_t my = a * T (v_as_u32_s32 (e % 3 + 2));
+
+  /* Vector version of ldexpf.  */
+  v_f32_t y = v_as_f32_u32 ((v_as_u32_s32 (ey + 127) << 23)) * my;
+  /* Copy sign.  */
+  y = v_as_f32_u32 (v_bsl_u32 (SignMask, ix, v_as_u32_f32 (y)));
+
+  if (unlikely (v_any_u32 (special)))
+    return specialcase (x, y, special);
+  return y;
+}
+VPCS_ALIAS
+
+#endif
diff --git a/pl/math/vn_cbrtf_1u5.c b/pl/math/vn_cbrtf_1u5.c
new file mode 100644
index 0000000..3452807
--- /dev/null
+++ b/pl/math/vn_cbrtf_1u5.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_cbrtf.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS strong_alias (__vn_cbrtf, _ZGVnN4v_cbrtf)
+#include "v_cbrtf_1u5.c"
+#endif
author	Joe Ramsay <Joe.Ramsay@arm.com>	2022-11-22 13:40:13 +0000
committer	Joe Ramsay <joe.ramsay@arm.com>	2022-11-22 13:40:13 +0000
commit	6e875e8b27fd103bb41590a580c8ee03ea5d7138 (patch)
tree	4d75a7148f44b8dc9a1e1ea19b18a92a6c3acdf6
parent	d5844d863d498cb1c43fde4ae3e47f3eeb5d0024 (diff)
download	arm-optimized-routines-6e875e8b27fd103bb41590a580c8ee03ea5d7138.tar.gz