diff options
author | Joe Ramsay <Joe.Ramsay@arm.com> | 2022-11-22 13:40:07 +0000 |
---|---|---|
committer | Joe Ramsay <joe.ramsay@arm.com> | 2022-11-22 13:40:07 +0000 |
commit | d5844d863d498cb1c43fde4ae3e47f3eeb5d0024 (patch) | |
tree | 83c1b8c9ae3cea23c50f8b1288594af31e8922fc | |
parent | 9ace52019cab223febddb3fcd166b6b4ba64081f (diff) | |
download | arm-optimized-routines-d5844d863d498cb1c43fde4ae3e47f3eeb5d0024.tar.gz |
pl/math: Add scalar and vector/Neon atanhf
Both routines are based on a simplified version of log1pf, and are
accurate to 3.1 ULP. Also enabled -c flag from runulp.sh - we need
this for atanhf so that we can set the control lane to something other
than 1, since atanh(1) is infinite.
-rw-r--r-- | pl/math/atanhf_3u1.c | 76 | ||||
-rw-r--r-- | pl/math/include/mathlib.h | 5 | ||||
-rw-r--r-- | pl/math/s_atanhf_3u1.c | 6 | ||||
-rw-r--r-- | pl/math/test/mathbench_funcs.h | 2 | ||||
-rwxr-xr-x | pl/math/test/runulp.sh | 30 | ||||
-rw-r--r-- | pl/math/test/testcases/directed/atanhf.tst | 23 | ||||
-rw-r--r-- | pl/math/test/ulp_funcs.h | 2 | ||||
-rw-r--r-- | pl/math/test/ulp_wrappers.h | 1 | ||||
-rw-r--r-- | pl/math/v_atanhf_3u1.c | 88 | ||||
-rw-r--r-- | pl/math/v_math.h | 10 | ||||
-rw-r--r-- | pl/math/vn_atanhf_3u1.c | 12 |
11 files changed, 253 insertions, 2 deletions
diff --git a/pl/math/atanhf_3u1.c b/pl/math/atanhf_3u1.c new file mode 100644 index 0000000..77795c8 --- /dev/null +++ b/pl/math/atanhf_3u1.c @@ -0,0 +1,76 @@ +/* + * Single-precision atanh(x) function. + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "mathlib.h" + +#define AbsMask 0x7fffffff +#define Half 0x3f000000 +#define One 0x3f800000 +#define Four 0x40800000 +#define Ln2 0x1.62e43p-1f +#define TinyBound 0x39800000 /* 0x1p-12, below which atanhf(x) rounds to x. */ + +#define C(i) __log1pf_data.coeffs[i] + +static inline float +eval_poly (float m) +{ + /* Approximate log(1+m) on [-0.25, 0.5] using Estrin scheme. */ + float p_12 = fmaf (m, C (1), C (0)); + float p_34 = fmaf (m, C (3), C (2)); + float p_56 = fmaf (m, C (5), C (4)); + float p_78 = fmaf (m, C (7), C (6)); + + float m2 = m * m; + float p_02 = fmaf (m2, p_12, m); + float p_36 = fmaf (m2, p_56, p_34); + float p_79 = fmaf (m2, C (8), p_78); + + float m4 = m2 * m2; + float p_06 = fmaf (m4, p_36, p_02); + + return fmaf (m4 * p_79, m4, p_06); +} + +static inline float +log1pf_inline (float x) +{ + /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no + special-case handling. See that file for details of the algorithm. */ + float m = x + 1.0f; + int k = (asuint (m) - 0x3f400000) & 0xff800000; + float s = asfloat (Four - k); + float m_scale = asfloat (asuint (x) - k) + fmaf (0.25f, s, -1.0f); + float p = eval_poly (m_scale); + float scale_back = (float) k * 0x1.0p-23f; + return fmaf (scale_back, Ln2, p); +} + +/* Approximation for single-precision inverse tanh(x), using a simplified + version of log1p. Maximum error is 3.08 ULP: + atanhf(0x1.ff0d5p-5) got 0x1.ffb768p-5 + want 0x1.ffb76ep-5. */ +float +atanhf (float x) +{ + uint32_t ix = asuint (x); + uint32_t iax = ix & AbsMask; + uint32_t sign = ix & ~AbsMask; + + if (unlikely (iax < TinyBound)) + return x; + + if (iax == One) + return __math_divzero (sign); + + if (unlikely (iax > One)) + return __math_invalidf (x); + + float halfsign = asfloat (Half | sign); + float ax = asfloat (iax); + return halfsign * log1pf_inline ((2 * ax) / (1 - ax)); +} diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index e74cd6f..f9faf2f 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -12,6 +12,7 @@ float acoshf (float); float asinhf (float); float atan2f (float, float); +float atanhf (float); float coshf (float); float erfcf (float); float erff (float); @@ -34,6 +35,7 @@ double sinh (double); float __s_asinhf (float); float __s_atanf (float); float __s_atan2f (float, float); +float __s_atanhf (float); float __s_coshf (float); float __s_erfcf (float); float __s_erff (float); @@ -72,6 +74,7 @@ __f32x4_t __v_atanf (__f32x4_t); __f64x2_t __v_atan (__f64x2_t); __f32x4_t __v_atan2f (__f32x4_t, __f32x4_t); __f64x2_t __v_atan2 (__f64x2_t, __f64x2_t); +__f32x4_t __v_atanhf (__f32x4_t); __f32x4_t __v_coshf (__f32x4_t); __f64x2_t __v_cosh (__f64x2_t); __f32x4_t __v_erff (__f32x4_t); @@ -99,6 +102,7 @@ __vpcs __f32x4_t __vn_atanf (__f32x4_t); __vpcs __f64x2_t __vn_atan (__f64x2_t); __vpcs __f32x4_t __vn_atan2f (__f32x4_t, __f32x4_t); __vpcs __f64x2_t __vn_atan2 (__f64x2_t, __f64x2_t); +__vpcs __f32x4_t __vn_atanhf (__f32x4_t); __vpcs __f32x4_t __vn_coshf (__f32x4_t); __vpcs __f64x2_t __vn_cosh (__f64x2_t); __vpcs __f32x4_t __vn_erff (__f32x4_t); @@ -123,6 +127,7 @@ __vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t); __vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t); __vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_atanhf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t); diff --git a/pl/math/s_atanhf_3u1.c b/pl/math/s_atanhf_3u1.c new file mode 100644 index 0000000..9f75962 --- /dev/null +++ b/pl/math/s_atanhf_3u1.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_atanhf_3u1.c" diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index 583c8fb..de009a3 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -30,6 +30,7 @@ F (acoshf, 1.0, 10.0) F (asinhf, -10.0, 10.0) F (atanf, -10.0, 10.0) {"atan2f", 'f', 0, -10.0, 10.0, {.f = atan2f_wrap}}, +F (atanhf, -1.0, 1.0) F (cosf, -3.1, 3.1) F (coshf, -10.0, 10.0) F (erfcf, -4.0, 10.0) @@ -61,6 +62,7 @@ D (sinh, -10.0, 10.0) #if WANT_VMATH ZVNF (asinhf, -10.0, 10.0) ZVNF (atanf, -10.0, 10.0) +ZVNF (atanhf, -1.0, 1.0) ZVND (atan, -10.0, 10.0) ZVNF (coshf, -10.0, 10.0) ZVND (cosh, -10.0, 10.0) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 02a5a97..53b4a43 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -203,6 +203,14 @@ t cosh -0x1.61da04cbafe44p+9 -0x1p10 1000 t cosh 0x1p10 inf 100 t cosh -0x1p10 -inf 100 +L=2.59 +t atanhf 0 0x1p-12 500 +t atanhf 0x1p-12 1 200000 +t atanhf 1 inf 1000 +t atanhf -0 -0x1p-12 500 +t atanhf -0x1p-12 -1 200000 +t atanhf -1 -inf 1000 + done # vector functions @@ -424,6 +432,15 @@ range_cosh=' -0x1.6p9 -inf 1000 ' +range_atanhf=' + 0 0x1p-12 500 + 0x1p-12 1 200000 + 1 inf 1000 + -0 -0x1p-12 500 + -0x1p-12 -1 200000 + -1 -inf 1000 +' + range_sve_cosf=' 0 0xffff0000 10000 0x1p-4 0x1p4 500000 @@ -589,6 +606,7 @@ L_coshf=1.89 L_expm1=1.68 L_sinh=2.08 L_cosh=1.43 +L_atanhf=2.59 L_sve_cosf=1.57 L_sve_cos=1.61 @@ -608,7 +626,7 @@ L_sve_erf=1.97 L_sve_tanf=2.7 L_sve_erfc=3.15 -while read G F R D +while read G F R D A do [ "$R" = 1 ] && { [[ $G != sve_* ]] || [ $WANT_SVE_MATH -eq 1 ]; } || continue case "$G" in \#*) continue ;; esac @@ -630,13 +648,17 @@ do if [ $WANT_ERRNO -eq 1 ]; then if [ "$D" = "fenv" ]; then f="" + elif [ "$D" = "nofenv" ]; then + # Need to pass this if you want additional + # arguments but keep fenv checking disabled. + f="-f" elif [ ! -z "$D" ]; then echo "Unrecognised 4th argument: $D" exit 1 fi fi case "$X" in \#*) continue ;; esac - t $f $F $X + t $A $f $F $X done << EOF $range EOF @@ -732,6 +754,10 @@ coshf __s_coshf $runs fenv coshf __v_coshf $runv fenv coshf __vn_coshf $runvn fenv coshf _ZGVnN4v_coshf $runvn fenv +atanhf __s_atanhf $runs fenv -c 0 +atanhf __v_atanhf $runv fenv -c 0 +atanhf __vn_atanhf $runvn fenv -c 0 +atanhf _ZGVnN4v_atanhf $runvn fenv -c 0 sve_cosf __sv_cosf $runsv sve_cosf _ZGVsMxv_cosf $runsv diff --git a/pl/math/test/testcases/directed/atanhf.tst b/pl/math/test/testcases/directed/atanhf.tst new file mode 100644 index 0000000..616b59d --- /dev/null +++ b/pl/math/test/testcases/directed/atanhf.tst @@ -0,0 +1,23 @@ +; atanhf.tst +; +; Copyright 2009-2022, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=atanhf op1=7fc00001 result=7fc00001 errno=0 +func=atanhf op1=ffc00001 result=7fc00001 errno=0 +func=atanhf op1=7f800001 result=7fc00001 errno=0 status=i +func=atanhf op1=ff800001 result=7fc00001 errno=0 status=i +func=atanhf op1=7f800000 result=7fc00001 errno=EDOM status=i +func=atanhf op1=ff800000 result=7fc00001 errno=EDOM status=i +func=atanhf op1=3f800001 result=7fc00001 errno=EDOM status=i +func=atanhf op1=bf800001 result=7fc00001 errno=EDOM status=i +func=atanhf op1=3f800000 result=7f800000 errno=ERANGE status=z +func=atanhf op1=bf800000 result=ff800000 errno=ERANGE status=z +func=atanhf op1=00000000 result=00000000 errno=0 +func=atanhf op1=80000000 result=80000000 errno=0 + +; No exception is raised with certain versions of glibc. Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. +func=atanhf op1=00000001 result=00000001 errno=0 maybestatus=ux +func=atanhf op1=80000001 result=80000001 errno=0 maybestatus=ux diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 1b674a6..84f4182 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -36,6 +36,7 @@ F1 (acosh) F1 (asinh) F2 (atan2) +F1 (atanh) F1 (cosh) F1 (erfc) F1 (erf) @@ -59,6 +60,7 @@ _ZVNF1 (atan) _ZVND1 (atan) _ZVNF2 (atan2) _ZVND2 (atan2) +_ZVNF1 (atanh) _ZVNF1 (cosh) _ZVND1 (cosh) _ZVNF1 (erf) diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index a9f2d15..0ee07e9 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -118,6 +118,7 @@ DECL_POW_INT_REF(ref_powi, long double, double, int) ZVNF1_WRAP(asinh) ZVNF1_WRAP(atan) ZVNF2_WRAP(atan2) +ZVNF1_WRAP(atanh) ZVNF1_WRAP(cosh) ZVNF1_WRAP(erf) ZVNF1_WRAP(erfc) diff --git a/pl/math/v_atanhf_3u1.c b/pl/math/v_atanhf_3u1.c new file mode 100644 index 0000000..54dcb9b --- /dev/null +++ b/pl/math/v_atanhf_3u1.c @@ -0,0 +1,88 @@ +/* + * Single-precision vector atanh(x) function. + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "mathlib.h" + +#if V_SUPPORTED + +#define AbsMask 0x7fffffff +#define Half 0x3f000000 +#define One 0x3f800000 +#define Four 0x40800000 +#define Ln2 0x1.62e43p-1f +#define TinyBound 0x39800000 /* 0x1p-12, below which atanhf(x) rounds to x. */ + +#define C(i) v_f32 (__log1pf_data.coeffs[i]) + +static inline v_f32_t +eval_poly (v_f32_t m) +{ + /* Approximate log(1+m) on [-0.25, 0.5] using Estrin scheme. */ + v_f32_t p_12 = v_fma_f32 (m, C (1), C (0)); + v_f32_t p_34 = v_fma_f32 (m, C (3), C (2)); + v_f32_t p_56 = v_fma_f32 (m, C (5), C (4)); + v_f32_t p_78 = v_fma_f32 (m, C (7), C (6)); + + v_f32_t m2 = m * m; + v_f32_t p_02 = v_fma_f32 (m2, p_12, m); + v_f32_t p_36 = v_fma_f32 (m2, p_56, p_34); + v_f32_t p_79 = v_fma_f32 (m2, C (8), p_78); + + v_f32_t m4 = m2 * m2; + v_f32_t p_06 = v_fma_f32 (m4, p_36, p_02); + + return v_fma_f32 (m4, m4 * p_79, p_06); +} + +static inline v_f32_t +log1pf_inline (v_f32_t x) +{ + /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no + special-case handling. See that file for details of the algorithm. */ + v_f32_t m = x + 1.0f; + v_u32_t k = (v_as_u32_f32 (m) - 0x3f400000) & 0xff800000; + v_f32_t s = v_as_f32_u32 (v_u32 (Four) - k); + v_f32_t m_scale = v_as_f32_u32 (v_as_u32_f32 (x) - k) + + v_fma_f32 (v_f32 (0.25f), s, v_f32 (-1.0f)); + v_f32_t p = eval_poly (m_scale); + v_f32_t scale_back = v_to_f32_u32 (k) * 0x1.0p-23f; + return v_fma_f32 (scale_back, v_f32 (Ln2), p); +} + +/* Approximation for vector single-precision atanh(x) using modified log1p. + The maximum error is 3.08 ULP: + __v_atanhf(0x1.ff215p-5) got 0x1.ffcb7cp-5 + want 0x1.ffcb82p-5. */ +VPCS_ATTR v_f32_t V_NAME (atanhf) (v_f32_t x) +{ + v_u32_t ix = v_as_u32_f32 (x); + v_f32_t halfsign + = v_as_f32_u32 (v_bsl_u32 (v_u32 (AbsMask), v_u32 (Half), ix)); + v_u32_t iax = ix & AbsMask; + + v_f32_t ax = v_as_f32_u32 (iax); + +#if WANT_ERRNO + v_u32_t special = v_cond_u32 ((iax >= One) | (iax <= TinyBound)); + /* Side-step special cases by setting those lanes to 0, which will trigger no + exceptions. These will be fixed up later. */ + if (unlikely (v_any_u32 (special))) + ax = v_sel_f32 (special, v_f32 (0), ax); +#else + v_u32_t special = v_cond_u32 (iax >= One); +#endif + + v_f32_t y = halfsign * log1pf_inline ((2 * ax) / (1 - ax)); + + if (unlikely (v_any_u32 (special))) + return v_call_f32 (atanhf, x, y, special); + return y; +} + +VPCS_ALIAS + +#endif diff --git a/pl/math/v_math.h b/pl/math/v_math.h index d4597c8..3ed6244 100644 --- a/pl/math/v_math.h +++ b/pl/math/v_math.h @@ -174,6 +174,11 @@ v_abs_f32 (v_f32_t x) return __builtin_fabsf (x); } static inline v_u32_t +v_bsl_u32 (v_u32_t m, v_u32_t x, v_u32_t y) +{ + return (y & ~m) | (x & m); +} +static inline v_u32_t v_cagt_f32 (v_f32_t x, v_f32_t y) { return fabsf (x) > fabsf (y); @@ -536,6 +541,11 @@ v_abs_f32 (v_f32_t x) return vabsq_f32 (x); } static inline v_u32_t +v_bsl_u32 (v_u32_t m, v_u32_t x, v_u32_t y) +{ + return vbslq_u32 (m, x, y); +} +static inline v_u32_t v_cagt_f32 (v_f32_t x, v_f32_t y) { return vcagtq_f32 (x, y); diff --git a/pl/math/vn_atanhf_3u1.c b/pl/math/vn_atanhf_3u1.c new file mode 100644 index 0000000..d4ad391 --- /dev/null +++ b/pl/math/vn_atanhf_3u1.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_atanhf. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_atanhf, _ZGVnN4v_atanhf) +#include "v_atanhf_3u1.c" +#endif |