diff options
author | Joe Ramsay <Joe.Ramsay@arm.com> | 2022-11-22 13:40:13 +0000 |
---|---|---|
committer | Joe Ramsay <joe.ramsay@arm.com> | 2022-11-22 13:40:13 +0000 |
commit | 6e875e8b27fd103bb41590a580c8ee03ea5d7138 (patch) | |
tree | 4d75a7148f44b8dc9a1e1ea19b18a92a6c3acdf6 | |
parent | d5844d863d498cb1c43fde4ae3e47f3eeb5d0024 (diff) | |
download | arm-optimized-routines-6e875e8b27fd103bb41590a580c8ee03ea5d7138.tar.gz |
pl/math: Add scalar & vector/Neon cbrtf
Both routines use the same algorithm - one Newton iteration with the
initial guess obtained by a low-order polynomial. Scalar is used as a
fallback for subnormal and special cases for the vector routine, which
allows vastly simplified argument reduction and reassembly. Both
routines accurate to 1.5 ULP.
-rw-r--r-- | pl/math/cbrtf_1u5.c | 64 | ||||
-rw-r--r-- | pl/math/cbrtf_data.c | 15 | ||||
-rw-r--r-- | pl/math/include/mathlib.h | 5 | ||||
-rw-r--r-- | pl/math/math_config.h | 6 | ||||
-rw-r--r-- | pl/math/s_cbrtf_1u5.c | 6 | ||||
-rw-r--r-- | pl/math/test/mathbench_funcs.h | 2 | ||||
-rwxr-xr-x | pl/math/test/runulp.sh | 14 | ||||
-rw-r--r-- | pl/math/test/testcases/directed/cbrtf.tst | 29 | ||||
-rw-r--r-- | pl/math/test/ulp_funcs.h | 2 | ||||
-rw-r--r-- | pl/math/test/ulp_wrappers.h | 1 | ||||
-rw-r--r-- | pl/math/tools/cbrtf.sollya | 20 | ||||
-rw-r--r-- | pl/math/v_cbrtf_1u5.c | 88 | ||||
-rw-r--r-- | pl/math/vn_cbrtf_1u5.c | 12 |
13 files changed, 264 insertions, 0 deletions
diff --git a/pl/math/cbrtf_1u5.c b/pl/math/cbrtf_1u5.c new file mode 100644 index 0000000..73b9049 --- /dev/null +++ b/pl/math/cbrtf_1u5.c @@ -0,0 +1,64 @@ +/* + * Single-precision cbrt(x) function. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include <math.h> + +#include "math_config.h" + +#define AbsMask 0x7fffffff +#define SignMask 0x80000000 +#define TwoThirds 0x1.555556p-1f + +#define C(i) __cbrtf_data.poly[i] +#define T(i) __cbrtf_data.table[i] + +/* Approximation for single-precision cbrt(x), using low-order polynomial and + one Newton iteration on a reduced interval. Greatest error is 1.5 ULP. This + is observed for every value where the mantissa is 0x1.81410e and the exponent + is a multiple of 3, for example: + cbrtf(0x1.81410ep+30) got 0x1.255d96p+10 + want 0x1.255d92p+10. */ +float +cbrtf (float x) +{ + uint32_t ix = asuint (x); + uint32_t iax = ix & AbsMask; + uint32_t sign = ix & SignMask; + + if (unlikely (iax == 0 || iax == 0x7f800000)) + return x; + + /* |x| = m * 2^e, where m is in [0.5, 1.0]. + We can easily decompose x into m and e using frexpf. */ + int e; + float m = frexpf (asfloat (iax), &e); + + /* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is, + the less accurate the next stage of the algorithm needs to be. An order-4 + polynomial is enough for one Newton iteration. */ + float p_01 = fmaf (C (1), m, C (0)); + float p_23 = fmaf (C (3), m, C (2)); + float p = fmaf (m * m, p_23, p_01); + + /* One iteration of Newton's method for iteratively approximating cbrt. */ + float m_by_3 = m / 3; + float a = fmaf (TwoThirds, p, m_by_3 / (p * p)); + + /* Assemble the result by the following: + + cbrt(x) = cbrt(m) * 2 ^ (e / 3). + + Let t = (2 ^ (e / 3)) / (2 ^ round(e / 3)). + + Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3. + i is an integer in [-2, 2], so t can be looked up in the table T. + Hence the result is assembled as: + + cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. + Which can be done easily using ldexpf. */ + return asfloat (asuint (ldexpf (a * T (2 + e % 3), e / 3)) | sign); +} diff --git a/pl/math/cbrtf_data.c b/pl/math/cbrtf_data.c new file mode 100644 index 0000000..386a2b4 --- /dev/null +++ b/pl/math/cbrtf_data.c @@ -0,0 +1,15 @@ +/* + * Coefficients and table entries for single-precision cbrt(x). + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +const struct cbrtf_data __cbrtf_data + = {.poly = { /* Coefficients for very rough approximation of cbrt(x) in [0.5, 1]. + See cbrtf.sollya for details of generation. */ + 0x1.c14e96p-2, 0x1.dd2d3p-1, -0x1.08e81ap-1, 0x1.2c74c2p-3}, + .table = { /* table[i] = 2^((i - 2) / 3). */ + 0x1.428a3p-1, 0x1.965feap-1, 0x1p0, 0x1.428a3p0, 0x1.965feap0}}; diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index f9faf2f..7bec5e1 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -13,6 +13,7 @@ float acoshf (float); float asinhf (float); float atan2f (float, float); float atanhf (float); +float cbrtf (float); float coshf (float); float erfcf (float); float erff (float); @@ -36,6 +37,7 @@ float __s_asinhf (float); float __s_atanf (float); float __s_atan2f (float, float); float __s_atanhf (float); +float __s_cbrtf (float); float __s_coshf (float); float __s_erfcf (float); float __s_erff (float); @@ -75,6 +77,7 @@ __f64x2_t __v_atan (__f64x2_t); __f32x4_t __v_atan2f (__f32x4_t, __f32x4_t); __f64x2_t __v_atan2 (__f64x2_t, __f64x2_t); __f32x4_t __v_atanhf (__f32x4_t); +__f32x4_t __v_cbrtf (__f32x4_t); __f32x4_t __v_coshf (__f32x4_t); __f64x2_t __v_cosh (__f64x2_t); __f32x4_t __v_erff (__f32x4_t); @@ -103,6 +106,7 @@ __vpcs __f64x2_t __vn_atan (__f64x2_t); __vpcs __f32x4_t __vn_atan2f (__f32x4_t, __f32x4_t); __vpcs __f64x2_t __vn_atan2 (__f64x2_t, __f64x2_t); __vpcs __f32x4_t __vn_atanhf (__f32x4_t); +__vpcs __f32x4_t __vn_cbrtf (__f32x4_t); __vpcs __f32x4_t __vn_coshf (__f32x4_t); __vpcs __f64x2_t __vn_cosh (__f64x2_t); __vpcs __f32x4_t __vn_erff (__f32x4_t); @@ -128,6 +132,7 @@ __vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t); __vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t); __vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t); __vpcs __f32x4_t _ZGVnN4v_atanhf (__f32x4_t); +__vpcs __f32x4_t _ZGVnN4v_cbrtf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t); diff --git a/pl/math/math_config.h b/pl/math/math_config.h index 7472395..99132a0 100644 --- a/pl/math/math_config.h +++ b/pl/math/math_config.h @@ -553,4 +553,10 @@ extern const struct expf_data #define EXPM1_POLY_ORDER 11 extern const double __expm1_poly[EXPM1_POLY_ORDER] HIDDEN; +extern const struct cbrtf_data +{ + float poly[4]; + float table[5]; +} __cbrtf_data HIDDEN; + #endif diff --git a/pl/math/s_cbrtf_1u5.c b/pl/math/s_cbrtf_1u5.c new file mode 100644 index 0000000..d60508e --- /dev/null +++ b/pl/math/s_cbrtf_1u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_cbrtf_1u5.c" diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index de009a3..42dc292 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -31,6 +31,7 @@ F (asinhf, -10.0, 10.0) F (atanf, -10.0, 10.0) {"atan2f", 'f', 0, -10.0, 10.0, {.f = atan2f_wrap}}, F (atanhf, -1.0, 1.0) +F (cbrtf, -10.0, 10.0) F (cosf, -3.1, 3.1) F (coshf, -10.0, 10.0) F (erfcf, -4.0, 10.0) @@ -64,6 +65,7 @@ ZVNF (asinhf, -10.0, 10.0) ZVNF (atanf, -10.0, 10.0) ZVNF (atanhf, -1.0, 1.0) ZVND (atan, -10.0, 10.0) +ZVNF (cbrtf, -10.0, 10.0) ZVNF (coshf, -10.0, 10.0) ZVND (cosh, -10.0, 10.0) ZVNF (erff, -4.0, 4.0) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 53b4a43..6410dd9 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -211,6 +211,10 @@ t atanhf -0 -0x1p-12 500 t atanhf -0x1p-12 -1 200000 t atanhf -1 -inf 1000 +L=1.03 +t cbrtf 0 inf 1000000 +t cbrtf -0 -inf 1000000 + done # vector functions @@ -441,6 +445,11 @@ range_atanhf=' -1 -inf 1000 ' +range_cbrtf=' + 0 inf 1000000 + -0 -inf 1000000 +' + range_sve_cosf=' 0 0xffff0000 10000 0x1p-4 0x1p4 500000 @@ -607,6 +616,7 @@ L_expm1=1.68 L_sinh=2.08 L_cosh=1.43 L_atanhf=2.59 +L_cbrtf=1.03 L_sve_cosf=1.57 L_sve_cos=1.61 @@ -758,6 +768,10 @@ atanhf __s_atanhf $runs fenv -c 0 atanhf __v_atanhf $runv fenv -c 0 atanhf __vn_atanhf $runvn fenv -c 0 atanhf _ZGVnN4v_atanhf $runvn fenv -c 0 +cbrtf __s_cbrtf $runs fenv +cbrtf __v_cbrtf $runv fenv +cbrtf __vn_cbrtf $runvn fenv +cbrtf _ZGVnN4v_cbrtf $runvn fenv sve_cosf __sv_cosf $runsv sve_cosf _ZGVsMxv_cosf $runsv diff --git a/pl/math/test/testcases/directed/cbrtf.tst b/pl/math/test/testcases/directed/cbrtf.tst new file mode 100644 index 0000000..5f8b97f --- /dev/null +++ b/pl/math/test/testcases/directed/cbrtf.tst @@ -0,0 +1,29 @@ +; cbrtf.tst +; +; Copyright 2009-2022, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=cbrtf op1=7f800000 result=7f800000 errno=0 +func=cbrtf op1=ff800000 result=ff800000 errno=0 +func=cbrtf op1=7f800001 result=7fc00001 errno=0 status=i +func=cbrtf op1=7fc00001 result=7fc00001 errno=0 +func=cbrtf op1=00000000 result=00000000 errno=0 +func=cbrtf op1=00000001 result=26a14517.cc7 errno=0 +func=cbrtf op1=00000002 result=26cb2ff5.29f errno=0 +func=cbrtf op1=00000003 result=26e89768.579 errno=0 +func=cbrtf op1=00000004 result=27000000.000 errno=0 +func=cbrtf op1=00400000 result=2a4b2ff5.29f errno=0 +func=cbrtf op1=00800000 result=2a800000.000 errno=0 +func=cbrtf op1=3f800000 result=3f800000.000 errno=0 +func=cbrtf op1=40000000 result=3fa14517.cc7 errno=0 +func=cbrtf op1=7f7fffff result=54cb2ff4.e63 errno=0 +func=cbrtf op1=80000000 result=80000000 errno=0 +func=cbrtf op1=80000001 result=a6a14517.cc7 errno=0 +func=cbrtf op1=80000002 result=a6cb2ff5.29f errno=0 +func=cbrtf op1=80000003 result=a6e89768.579 errno=0 +func=cbrtf op1=80000004 result=a7000000.000 errno=0 +func=cbrtf op1=80400000 result=aa4b2ff5.29f errno=0 +func=cbrtf op1=80800000 result=aa800000.000 errno=0 +func=cbrtf op1=bf800000 result=bf800000.000 errno=0 +func=cbrtf op1=c0000000 result=bfa14517.cc7 errno=0 +func=cbrtf op1=ff7fffff result=d4cb2ff4.e63 errno=0 diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 84f4182..8e41ccf 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -37,6 +37,7 @@ F1 (acosh) F1 (asinh) F2 (atan2) F1 (atanh) +F1 (cbrt) F1 (cosh) F1 (erfc) F1 (erf) @@ -61,6 +62,7 @@ _ZVND1 (atan) _ZVNF2 (atan2) _ZVND2 (atan2) _ZVNF1 (atanh) +_ZVNF1 (cbrt) _ZVNF1 (cosh) _ZVND1 (cosh) _ZVNF1 (erf) diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index 0ee07e9..fec18a7 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -119,6 +119,7 @@ ZVNF1_WRAP(asinh) ZVNF1_WRAP(atan) ZVNF2_WRAP(atan2) ZVNF1_WRAP(atanh) +ZVNF1_WRAP(cbrt) ZVNF1_WRAP(cosh) ZVNF1_WRAP(erf) ZVNF1_WRAP(erfc) diff --git a/pl/math/tools/cbrtf.sollya b/pl/math/tools/cbrtf.sollya new file mode 100644 index 0000000..9cd1259 --- /dev/null +++ b/pl/math/tools/cbrtf.sollya @@ -0,0 +1,20 @@ +// polynomial for approximating cbrt(x) in single precision +// +// Copyright (c) 2022, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 3; + +a = 0.5; +b = 1; + + +f = x^(1/3); + +poly = fpminimax(f, deg, [|single ...|], [a;b]); + +display = hexadecimal; +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 0 to deg do round(coeff(poly,i), SG, RN); diff --git a/pl/math/v_cbrtf_1u5.c b/pl/math/v_cbrtf_1u5.c new file mode 100644 index 0000000..fd43051 --- /dev/null +++ b/pl/math/v_cbrtf_1u5.c @@ -0,0 +1,88 @@ +/* + * Single-precision vector cbrt(x) function. + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "mathlib.h" + +#if V_SUPPORTED + +#define AbsMask 0x7fffffff +#define SignMask v_u32 (0x80000000) +#define TwoThirds v_f32 (0x1.555556p-1f) +#define SmallestNormal 0x00800000 +#define MantissaMask 0x007fffff +#define HalfExp 0x3f000000 + +#define C(i) v_f32 (__cbrtf_data.poly[i]) +#define T(i) v_lookup_f32 (__cbrtf_data.table, i) + +static NOINLINE v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_u32_t special) +{ + return v_call_f32 (cbrtf, x, y, special); +} + +/* Approximation for vector single-precision cbrt(x) using Newton iteration with + initial guess obtained by a low-order polynomial. Greatest error is 1.5 ULP. + This is observed for every value where the mantissa is 0x1.81410e and the + exponent is a multiple of 3, for example: + __v_cbrtf(0x1.81410ep+30) got 0x1.255d96p+10 + want 0x1.255d92p+10. */ +VPCS_ATTR v_f32_t V_NAME (cbrtf) (v_f32_t x) +{ + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t iax = ix & AbsMask; + + /* Subnormal, +/-0 and special values. */ + v_u32_t special = v_cond_u32 ((iax < SmallestNormal) | (iax >= 0x7f800000)); + + /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector + version of frexpf, which gets subnormal values wrong - these have to be + special-cased as a result. */ + v_f32_t m = v_as_f32_u32 ((iax & MantissaMask) | HalfExp); + v_s32_t e = v_as_s32_u32 (iax >> 23) - 126; + + /* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is, + the less accurate the next stage of the algorithm needs to be. An order-4 + polynomial is enough for one Newton iteration. */ + v_f32_t p_01 = v_fma_f32 (C (1), m, C (0)); + v_f32_t p_23 = v_fma_f32 (C (3), m, C (2)); + v_f32_t p = v_fma_f32 (m * m, p_23, p_01); + + /* One iteration of Newton's method for iteratively approximating cbrt. */ + v_f32_t m_by_3 = m / 3; + v_f32_t a = v_fma_f32 (TwoThirds, p, m_by_3 / (p * p)); + + /* Assemble the result by the following: + + cbrt(x) = cbrt(m) * 2 ^ (e / 3). + + We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is + not necessarily a multiple of 3 we lose some information. + + Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q. + + Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which is + an integer in [-2, 2], and can be looked up in the table T. Hence the + result is assembled as: + + cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */ + + v_s32_t ey = e / 3; + v_f32_t my = a * T (v_as_u32_s32 (e % 3 + 2)); + + /* Vector version of ldexpf. */ + v_f32_t y = v_as_f32_u32 ((v_as_u32_s32 (ey + 127) << 23)) * my; + /* Copy sign. */ + y = v_as_f32_u32 (v_bsl_u32 (SignMask, ix, v_as_u32_f32 (y))); + + if (unlikely (v_any_u32 (special))) + return specialcase (x, y, special); + return y; +} +VPCS_ALIAS + +#endif diff --git a/pl/math/vn_cbrtf_1u5.c b/pl/math/vn_cbrtf_1u5.c new file mode 100644 index 0000000..3452807 --- /dev/null +++ b/pl/math/vn_cbrtf_1u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_cbrtf. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_cbrtf, _ZGVnN4v_cbrtf) +#include "v_cbrtf_1u5.c" +#endif |