From 04e91eca36b0a7dbbab78bf9401c978ab1b08b67 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Tue, 20 Dec 2022 09:26:47 +0000 Subject: pl/math: Add scalar & vector/Neon cbrt New routines use the same algorithm, with simplified argument reduction and recombination in the vector variant. Both are accurate to 2 ULP. --- pl/math/cbrt_2u.c | 70 ++++++++++++++++++++++++++++++++++ pl/math/cbrt_data.c | 15 ++++++++ pl/math/include/mathlib.h | 5 +++ pl/math/math_config.h | 6 +++ pl/math/s_cbrt_2u.c | 6 +++ pl/math/tools/cbrt.sollya | 20 ++++++++++ pl/math/v_cbrt_2u.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++ pl/math/vn_cbrt_2u.c | 12 ++++++ 8 files changed, 231 insertions(+) create mode 100644 pl/math/cbrt_2u.c create mode 100644 pl/math/cbrt_data.c create mode 100644 pl/math/s_cbrt_2u.c create mode 100644 pl/math/tools/cbrt.sollya create mode 100644 pl/math/v_cbrt_2u.c create mode 100644 pl/math/vn_cbrt_2u.c diff --git a/pl/math/cbrt_2u.c b/pl/math/cbrt_2u.c new file mode 100644 index 0000000..f89dd87 --- /dev/null +++ b/pl/math/cbrt_2u.c @@ -0,0 +1,70 @@ +/* + * Double-precision cbrt(x) function. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +PL_SIG (S, D, 1, cbrt, -10.0, 10.0) + +#define AbsMask 0x7fffffffffffffff +#define TwoThirds 0x1.5555555555555p-1 + +#define C(i) __cbrt_data.poly[i] +#define T(i) __cbrt_data.table[i] + +/* Approximation for double-precision cbrt(x), using low-order polynomial and + two Newton iterations. Greatest observed error is 1.79 ULP. Errors repeat + according to the exponent, for instance an error observed for double value + m * 2^e will be observed for any input m * 2^(e + 3*i), where i is an + integer. + cbrt(0x1.fffff403f0bc6p+1) got 0x1.965fe72821e9bp+0 + want 0x1.965fe72821e99p+0. */ +double +cbrt (double x) +{ + uint64_t ix = asuint64 (x); + uint64_t iax = ix & AbsMask; + uint64_t sign = ix & ~AbsMask; + + if (unlikely (iax == 0 || iax == 0x7f80000000000000)) + return x; + + /* |x| = m * 2^e, where m is in [0.5, 1.0]. + We can easily decompose x into m and e using frexp. */ + int e; + double m = frexp (asdouble (iax), &e); + + /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point for + Newton iterations. */ + double p_01 = fma (C (1), m, C (0)); + double p_23 = fma (C (3), m, C (2)); + double p = fma (p_23, m * m, p_01); + + /* Two iterations of Newton's method for iteratively approximating cbrt. */ + double m_by_3 = m / 3; + double a = fma (TwoThirds, p, m_by_3 / (p * p)); + a = fma (TwoThirds, a, m_by_3 / (a * a)); + + /* Assemble the result by the following: + + cbrt(x) = cbrt(m) * 2 ^ (e / 3). + + Let t = (2 ^ (e / 3)) / (2 ^ round(e / 3)). + + Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3. + i is an integer in [-2, 2], so t can be looked up in the table T. + Hence the result is assembled as: + + cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. + Which can be done easily using ldexp. */ + return asdouble (asuint64 (ldexp (a * T (2 + e % 3), e / 3)) | sign); +} + +PL_TEST_ULP (cbrt, 1.30) +PL_TEST_INTERVAL (cbrt, 0, inf, 1000000) +PL_TEST_INTERVAL (cbrt, -0, -inf, 1000000) diff --git a/pl/math/cbrt_data.c b/pl/math/cbrt_data.c new file mode 100644 index 0000000..1c6ca73 --- /dev/null +++ b/pl/math/cbrt_data.c @@ -0,0 +1,15 @@ +/* + * Coefficients and table entries for double-precision cbrt(x). + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +const struct cbrt_data __cbrt_data + = {.poly = { /* Coefficients for very rough approximation of cbrt(x) in [0.5, 1]. + See cbrt.sollya for details of generation. */ + 0x1.c14e8ee44767p-2, 0x1.dd2d3f99e4c0ep-1, -0x1.08e83026b7e74p-1, 0x1.2c74eaa3ba428p-3}, + .table = { /* table[i] = 2^((i - 2) / 3). */ + 0x1.428a2f98d728bp-1, 0x1.965fea53d6e3dp-1, 0x1p0, 0x1.428a2f98d728bp0, 0x1.965fea53d6e3dp0}}; diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 1266eb7..12d72fe 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -27,6 +27,7 @@ float tanhf (float); double acosh (double); double asinh (double); double atan2 (double, double); +double cbrt (double); double cosh (double); double erfc (double); double expm1 (double); @@ -53,6 +54,7 @@ float __s_tanhf (float); double __s_asinh (double); double __s_atan (double); double __s_atan2 (double, double); +double __s_cbrt (double); double __s_cosh (double); double __s_erf (double); double __s_erfc (double); @@ -82,6 +84,7 @@ __f32x4_t __v_atan2f (__f32x4_t, __f32x4_t); __f64x2_t __v_atan2 (__f64x2_t, __f64x2_t); __f32x4_t __v_atanhf (__f32x4_t); __f32x4_t __v_cbrtf (__f32x4_t); +__f64x2_t __v_cbrt (__f64x2_t); __f32x4_t __v_coshf (__f32x4_t); __f64x2_t __v_cosh (__f64x2_t); __f32x4_t __v_erff (__f32x4_t); @@ -113,6 +116,7 @@ __vpcs __f32x4_t __vn_atan2f (__f32x4_t, __f32x4_t); __vpcs __f64x2_t __vn_atan2 (__f64x2_t, __f64x2_t); __vpcs __f32x4_t __vn_atanhf (__f32x4_t); __vpcs __f32x4_t __vn_cbrtf (__f32x4_t); +__vpcs __f64x2_t __vn_cbrt (__f64x2_t); __vpcs __f32x4_t __vn_coshf (__f32x4_t); __vpcs __f64x2_t __vn_cosh (__f64x2_t); __vpcs __f32x4_t __vn_erff (__f32x4_t); @@ -141,6 +145,7 @@ __vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t); __vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t); __vpcs __f32x4_t _ZGVnN4v_atanhf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_cbrtf (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_cbrt (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t); diff --git a/pl/math/math_config.h b/pl/math/math_config.h index 90d571c..92ccebf 100644 --- a/pl/math/math_config.h +++ b/pl/math/math_config.h @@ -558,4 +558,10 @@ extern const struct cbrtf_data float table[5]; } __cbrtf_data HIDDEN; +extern const struct cbrt_data +{ + double poly[4]; + double table[5]; +} __cbrt_data HIDDEN; + #endif diff --git a/pl/math/s_cbrt_2u.c b/pl/math/s_cbrt_2u.c new file mode 100644 index 0000000..22f726b --- /dev/null +++ b/pl/math/s_cbrt_2u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_cbrt_2u.c" diff --git a/pl/math/tools/cbrt.sollya b/pl/math/tools/cbrt.sollya new file mode 100644 index 0000000..7f179eb --- /dev/null +++ b/pl/math/tools/cbrt.sollya @@ -0,0 +1,20 @@ +// polynomial for approximating cbrt(x) in double precision +// +// Copyright (c) 2022, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 3; + +a = 0.5; +b = 1; + + +f = x^(1/3); + +poly = fpminimax(f, deg, [|double ...|], [a;b]); + +display = hexadecimal; +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 0 to deg do round(coeff(poly,i), D, RN); diff --git a/pl/math/v_cbrt_2u.c b/pl/math/v_cbrt_2u.c new file mode 100644 index 0000000..b6e501c --- /dev/null +++ b/pl/math/v_cbrt_2u.c @@ -0,0 +1,97 @@ +/* + * Double-precision vector cbrt(x) function. + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "mathlib.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#define AbsMask 0x7fffffffffffffff +#define TwoThirds v_f64 (0x1.5555555555555p-1) +#define TinyBound 0x001 /* top12 (smallest_normal). */ +#define BigBound 0x7ff /* top12 (infinity). */ +#define MantissaMask v_u64 (0x000fffffffffffff) +#define HalfExp v_u64 (0x3fe0000000000000) + +#define C(i) v_f64 (__cbrt_data.poly[i]) +#define T(i) v_lookup_f64 (__cbrt_data.table, i) + +static NOINLINE v_f64_t +specialcase (v_f64_t x, v_f64_t y, v_u64_t special) +{ + return v_call_f64 (cbrt, x, y, special); +} + +/* Approximation for double-precision vector cbrt(x), using low-order polynomial + and two Newton iterations. Greatest observed error is 1.79 ULP. Errors repeat + according to the exponent, for instance an error observed for double value + m * 2^e will be observed for any input m * 2^(e + 3*i), where i is an + integer. + __v_cbrt(0x1.fffff403f0bc6p+1) got 0x1.965fe72821e9bp+0 + want 0x1.965fe72821e99p+0. */ +VPCS_ATTR v_f64_t V_NAME (cbrt) (v_f64_t x) +{ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t iax = ix & AbsMask; + v_u64_t ia12 = iax >> 52; + + /* Subnormal, +/-0 and special values. */ + v_u64_t special = v_cond_u64 ((ia12 < TinyBound) | (ia12 >= BigBound)); + + /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector + version of frexp, which gets subnormal values wrong - these have to be + special-cased as a result. */ + v_f64_t m = v_as_f64_u64 (v_bsl_u64 (MantissaMask, iax, HalfExp)); + v_s64_t e = v_as_s64_u64 (iax >> 52) - 1022; + + /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point for + Newton iterations. */ + v_f64_t p_01 = v_fma_f64 (C (1), m, C (0)); + v_f64_t p_23 = v_fma_f64 (C (3), m, C (2)); + v_f64_t p = v_fma_f64 (m * m, p_23, p_01); + + /* Two iterations of Newton's method for iteratively approximating cbrt. */ + v_f64_t m_by_3 = m / 3; + v_f64_t a = v_fma_f64 (TwoThirds, p, m_by_3 / (p * p)); + a = v_fma_f64 (TwoThirds, a, m_by_3 / (a * a)); + + /* Assemble the result by the following: + + cbrt(x) = cbrt(m) * 2 ^ (e / 3). + + We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is + not necessarily a multiple of 3 we lose some information. + + Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q. + + Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which is + an integer in [-2, 2], and can be looked up in the table T. Hence the + result is assembled as: + + cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */ + + v_s64_t ey = e / 3; + v_f64_t my = a * T (v_as_u64_s64 (e % 3 + 2)); + + /* Vector version of ldexp. */ + v_f64_t y = v_as_f64_u64 ((v_as_u64_s64 (ey + 1023) << 52)) * my; + /* Copy sign. */ + y = v_as_f64_u64 (v_bsl_u64 (v_u64 (AbsMask), v_as_u64_f64 (y), ix)); + + if (unlikely (v_any_u64 (special))) + return specialcase (x, y, special); + return y; +} +VPCS_ALIAS + +PL_TEST_ULP (V_NAME (cbrt), 1.30) +PL_SIG (V, D, 1, cbrt, -10.0, 10.0) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (cbrt)) +PL_TEST_INTERVAL (V_NAME (cbrt), 0, inf, 1000000) +PL_TEST_INTERVAL (V_NAME (cbrt), -0, -inf, 1000000) +#endif diff --git a/pl/math/vn_cbrt_2u.c b/pl/math/vn_cbrt_2u.c new file mode 100644 index 0000000..ccaa085 --- /dev/null +++ b/pl/math/vn_cbrt_2u.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_cbrt. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_cbrt, _ZGVnN2v_cbrt) +#include "v_cbrt_2u.c" +#endif -- cgit v1.2.3