author     Joe Ramsay <Joe.Ramsay@arm.com>  2022-11-22 13:40:07 +0000
committer  Joe Ramsay <joe.ramsay@arm.com>  2022-11-22 13:40:07 +0000
commit     d5844d863d498cb1c43fde4ae3e47f3eeb5d0024 (patch)
tree       83c1b8c9ae3cea23c50f8b1288594af31e8922fc
parent     9ace52019cab223febddb3fcd166b6b4ba64081f (diff)
download   arm-optimized-routines-d5844d863d498cb1c43fde4ae3e47f3eeb5d0024.tar.gz
pl/math: Add scalar and vector/Neon atanhf
Both routines are based on a simplified version of log1pf and are accurate to 3.1 ULP.

Also enable passing the -c flag from runulp.sh - we need this for atanhf so that the control lane can be set to something other than 1, since atanh(1) is infinite.
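For reference, the identity both implementations rely on is atanh(x) = 0.5 * log1p(2x / (1 - x)) for |x| < 1, applied to |x| with the sign folded back in as a factor of +/-0.5. Below is a minimal scalar sketch of that reduction (the name atanhf_sketch is ours, not part of the patch), written against the standard libm log1pf rather than the simplified inline helper added here, and without its special-case handling or accuracy guarantees:

    #include <math.h>

    /* Illustrative only: same reduction as the new atanhf, but via the
       standard log1pf and without special-case handling.  */
    static float
    atanhf_sketch (float x)
    {
      float ax = fabsf (x);
      float halfsign = copysignf (0.5f, x);
      /* atanh(x) = 0.5 * log1p(2x / (1 - x)) for |x| < 1.  */
      return halfsign * log1pf ((2 * ax) / (1 - ax));
    }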
-rw-r--r--   pl/math/atanhf_3u1.c                         76
-rw-r--r--   pl/math/include/mathlib.h                     5
-rw-r--r--   pl/math/s_atanhf_3u1.c                        6
-rw-r--r--   pl/math/test/mathbench_funcs.h                2
-rwxr-xr-x   pl/math/test/runulp.sh                       30
-rw-r--r--   pl/math/test/testcases/directed/atanhf.tst   23
-rw-r--r--   pl/math/test/ulp_funcs.h                      2
-rw-r--r--   pl/math/test/ulp_wrappers.h                   1
-rw-r--r--   pl/math/v_atanhf_3u1.c                       88
-rw-r--r--   pl/math/v_math.h                             10
-rw-r--r--   pl/math/vn_atanhf_3u1.c                      12
11 files changed, 253 insertions(+), 2 deletions(-)
diff --git a/pl/math/atanhf_3u1.c b/pl/math/atanhf_3u1.c
new file mode 100644
index 0000000..77795c8
--- /dev/null
+++ b/pl/math/atanhf_3u1.c
@@ -0,0 +1,76 @@
+/*
+ * Single-precision atanh(x) function.
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+#include "mathlib.h"
+
+#define AbsMask 0x7fffffff
+#define Half 0x3f000000
+#define One 0x3f800000
+#define Four 0x40800000
+#define Ln2 0x1.62e43p-1f
+#define TinyBound 0x39800000 /* 0x1p-12, below which atanhf(x) rounds to x. */
+
+#define C(i) __log1pf_data.coeffs[i]
+
+static inline float
+eval_poly (float m)
+{
+ /* Approximate log(1+m) on [-0.25, 0.5] using Estrin scheme. */
+ float p_12 = fmaf (m, C (1), C (0));
+ float p_34 = fmaf (m, C (3), C (2));
+ float p_56 = fmaf (m, C (5), C (4));
+ float p_78 = fmaf (m, C (7), C (6));
+
+ float m2 = m * m;
+ float p_02 = fmaf (m2, p_12, m);
+ float p_36 = fmaf (m2, p_56, p_34);
+ float p_79 = fmaf (m2, C (8), p_78);
+
+ float m4 = m2 * m2;
+ float p_06 = fmaf (m4, p_36, p_02);
+
+ return fmaf (m4 * p_79, m4, p_06);
+}
+
+static inline float
+log1pf_inline (float x)
+{
+ /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no
+ special-case handling. See that file for details of the algorithm. */
+ float m = x + 1.0f;
+ int k = (asuint (m) - 0x3f400000) & 0xff800000;
+ float s = asfloat (Four - k);
+ float m_scale = asfloat (asuint (x) - k) + fmaf (0.25f, s, -1.0f);
+ float p = eval_poly (m_scale);
+ float scale_back = (float) k * 0x1.0p-23f;
+ return fmaf (scale_back, Ln2, p);
+}
+
+/* Approximation for single-precision inverse tanh(x), using a simplified
+ version of log1p. Maximum error is 3.08 ULP:
+ atanhf(0x1.ff0d5p-5) got 0x1.ffb768p-5
+ want 0x1.ffb76ep-5. */
+float
+atanhf (float x)
+{
+ uint32_t ix = asuint (x);
+ uint32_t iax = ix & AbsMask;
+ uint32_t sign = ix & ~AbsMask;
+
+ if (unlikely (iax < TinyBound))
+ return x;
+
+ if (iax == One)
+ return __math_divzerof (sign);
+
+ if (unlikely (iax > One))
+ return __math_invalidf (x);
+
+ float halfsign = asfloat (Half | sign);
+ float ax = asfloat (iax);
+ return halfsign * log1pf_inline ((2 * ax) / (1 - ax));
+}
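The Estrin grouping in eval_poly above shortens the fma dependency chain by evaluating the paired sub-polynomials in parallel; algebraically it computes log(1+m) ~= m + m^2 * (C0 + C1*m + ... + C8*m^8). A Horner-form equivalent is sketched below purely for illustration (eval_poly_horner is our name, not part of the patch; it assumes the same includes and C(i) coefficient macro as atanhf_3u1.c above) - it is simpler but serialises every fma:

    /* Horner equivalent of eval_poly - illustrative only; the long serial
       fma chain is what the Estrin grouping avoids.  */
    static inline float
    eval_poly_horner (float m)
    {
      float p = C (8);
      for (int i = 7; i >= 0; i--)
        p = fmaf (m, p, C (i));
      return fmaf (m * m, p, m);
    }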
diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h
index e74cd6f..f9faf2f 100644
--- a/pl/math/include/mathlib.h
+++ b/pl/math/include/mathlib.h
@@ -12,6 +12,7 @@
float acoshf (float);
float asinhf (float);
float atan2f (float, float);
+float atanhf (float);
float coshf (float);
float erfcf (float);
float erff (float);
@@ -34,6 +35,7 @@ double sinh (double);
float __s_asinhf (float);
float __s_atanf (float);
float __s_atan2f (float, float);
+float __s_atanhf (float);
float __s_coshf (float);
float __s_erfcf (float);
float __s_erff (float);
@@ -72,6 +74,7 @@ __f32x4_t __v_atanf (__f32x4_t);
__f64x2_t __v_atan (__f64x2_t);
__f32x4_t __v_atan2f (__f32x4_t, __f32x4_t);
__f64x2_t __v_atan2 (__f64x2_t, __f64x2_t);
+__f32x4_t __v_atanhf (__f32x4_t);
__f32x4_t __v_coshf (__f32x4_t);
__f64x2_t __v_cosh (__f64x2_t);
__f32x4_t __v_erff (__f32x4_t);
@@ -99,6 +102,7 @@ __vpcs __f32x4_t __vn_atanf (__f32x4_t);
__vpcs __f64x2_t __vn_atan (__f64x2_t);
__vpcs __f32x4_t __vn_atan2f (__f32x4_t, __f32x4_t);
__vpcs __f64x2_t __vn_atan2 (__f64x2_t, __f64x2_t);
+__vpcs __f32x4_t __vn_atanhf (__f32x4_t);
__vpcs __f32x4_t __vn_coshf (__f32x4_t);
__vpcs __f64x2_t __vn_cosh (__f64x2_t);
__vpcs __f32x4_t __vn_erff (__f32x4_t);
@@ -123,6 +127,7 @@ __vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t);
__vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t);
__vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t);
__vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t);
+__vpcs __f32x4_t _ZGVnN4v_atanhf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t);
__vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t);
__vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t);
diff --git a/pl/math/s_atanhf_3u1.c b/pl/math/s_atanhf_3u1.c
new file mode 100644
index 0000000..9f75962
--- /dev/null
+++ b/pl/math/s_atanhf_3u1.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_atanhf_3u1.c"
diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h
index 583c8fb..de009a3 100644
--- a/pl/math/test/mathbench_funcs.h
+++ b/pl/math/test/mathbench_funcs.h
@@ -30,6 +30,7 @@ F (acoshf, 1.0, 10.0)
F (asinhf, -10.0, 10.0)
F (atanf, -10.0, 10.0)
{"atan2f", 'f', 0, -10.0, 10.0, {.f = atan2f_wrap}},
+F (atanhf, -1.0, 1.0)
F (cosf, -3.1, 3.1)
F (coshf, -10.0, 10.0)
F (erfcf, -4.0, 10.0)
@@ -61,6 +62,7 @@ D (sinh, -10.0, 10.0)
#if WANT_VMATH
ZVNF (asinhf, -10.0, 10.0)
ZVNF (atanf, -10.0, 10.0)
+ZVNF (atanhf, -1.0, 1.0)
ZVND (atan, -10.0, 10.0)
ZVNF (coshf, -10.0, 10.0)
ZVND (cosh, -10.0, 10.0)
diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh
index 02a5a97..53b4a43 100755
--- a/pl/math/test/runulp.sh
+++ b/pl/math/test/runulp.sh
@@ -203,6 +203,14 @@ t cosh -0x1.61da04cbafe44p+9 -0x1p10 1000
t cosh 0x1p10 inf 100
t cosh -0x1p10 -inf 100
+L=2.59
+t atanhf 0 0x1p-12 500
+t atanhf 0x1p-12 1 200000
+t atanhf 1 inf 1000
+t atanhf -0 -0x1p-12 500
+t atanhf -0x1p-12 -1 200000
+t atanhf -1 -inf 1000
+
done
# vector functions
@@ -424,6 +432,15 @@ range_cosh='
-0x1.6p9 -inf 1000
'
+range_atanhf='
+ 0 0x1p-12 500
+ 0x1p-12 1 200000
+ 1 inf 1000
+ -0 -0x1p-12 500
+ -0x1p-12 -1 200000
+ -1 -inf 1000
+'
+
range_sve_cosf='
0 0xffff0000 10000
0x1p-4 0x1p4 500000
@@ -589,6 +606,7 @@ L_coshf=1.89
L_expm1=1.68
L_sinh=2.08
L_cosh=1.43
+L_atanhf=2.59
L_sve_cosf=1.57
L_sve_cos=1.61
@@ -608,7 +626,7 @@ L_sve_erf=1.97
L_sve_tanf=2.7
L_sve_erfc=3.15
-while read G F R D
+while read G F R D A
do
[ "$R" = 1 ] && { [[ $G != sve_* ]] || [ $WANT_SVE_MATH -eq 1 ]; } || continue
case "$G" in \#*) continue ;; esac
@@ -630,13 +648,17 @@ do
if [ $WANT_ERRNO -eq 1 ]; then
if [ "$D" = "fenv" ]; then
f=""
+ elif [ "$D" = "nofenv" ]; then
+ # Need to pass this if you want additional
+ # arguments but keep fenv checking disabled.
+ f="-f"
elif [ ! -z "$D" ]; then
echo "Unrecognised 4th argument: $D"
exit 1
fi
fi
case "$X" in \#*) continue ;; esac
- t $f $F $X
+ t $A $f $F $X
done << EOF
$range
EOF
@@ -732,6 +754,10 @@ coshf __s_coshf $runs fenv
coshf __v_coshf $runv fenv
coshf __vn_coshf $runvn fenv
coshf _ZGVnN4v_coshf $runvn fenv
+atanhf __s_atanhf $runs fenv -c 0
+atanhf __v_atanhf $runv fenv -c 0
+atanhf __vn_atanhf $runvn fenv -c 0
+atanhf _ZGVnN4v_atanhf $runvn fenv -c 0
sve_cosf __sv_cosf $runsv
sve_cosf _ZGVsMxv_cosf $runsv
diff --git a/pl/math/test/testcases/directed/atanhf.tst b/pl/math/test/testcases/directed/atanhf.tst
new file mode 100644
index 0000000..616b59d
--- /dev/null
+++ b/pl/math/test/testcases/directed/atanhf.tst
@@ -0,0 +1,23 @@
+; atanhf.tst
+;
+; Copyright 2009-2022, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=atanhf op1=7fc00001 result=7fc00001 errno=0
+func=atanhf op1=ffc00001 result=7fc00001 errno=0
+func=atanhf op1=7f800001 result=7fc00001 errno=0 status=i
+func=atanhf op1=ff800001 result=7fc00001 errno=0 status=i
+func=atanhf op1=7f800000 result=7fc00001 errno=EDOM status=i
+func=atanhf op1=ff800000 result=7fc00001 errno=EDOM status=i
+func=atanhf op1=3f800001 result=7fc00001 errno=EDOM status=i
+func=atanhf op1=bf800001 result=7fc00001 errno=EDOM status=i
+func=atanhf op1=3f800000 result=7f800000 errno=ERANGE status=z
+func=atanhf op1=bf800000 result=ff800000 errno=ERANGE status=z
+func=atanhf op1=00000000 result=00000000 errno=0
+func=atanhf op1=80000000 result=80000000 errno=0
+
+; No exception is raised with certain versions of glibc. Functions
+; approximated by x near zero may not generate/implement flops and
+; thus may not raise exceptions.
+func=atanhf op1=00000001 result=00000001 errno=0 maybestatus=ux
+func=atanhf op1=80000001 result=80000001 errno=0 maybestatus=ux
diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h
index 1b674a6..84f4182 100644
--- a/pl/math/test/ulp_funcs.h
+++ b/pl/math/test/ulp_funcs.h
@@ -36,6 +36,7 @@
F1 (acosh)
F1 (asinh)
F2 (atan2)
+F1 (atanh)
F1 (cosh)
F1 (erfc)
F1 (erf)
@@ -59,6 +60,7 @@ _ZVNF1 (atan)
_ZVND1 (atan)
_ZVNF2 (atan2)
_ZVND2 (atan2)
+_ZVNF1 (atanh)
_ZVNF1 (cosh)
_ZVND1 (cosh)
_ZVNF1 (erf)
diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h
index a9f2d15..0ee07e9 100644
--- a/pl/math/test/ulp_wrappers.h
+++ b/pl/math/test/ulp_wrappers.h
@@ -118,6 +118,7 @@ DECL_POW_INT_REF(ref_powi, long double, double, int)
ZVNF1_WRAP(asinh)
ZVNF1_WRAP(atan)
ZVNF2_WRAP(atan2)
+ZVNF1_WRAP(atanh)
ZVNF1_WRAP(cosh)
ZVNF1_WRAP(erf)
ZVNF1_WRAP(erfc)
diff --git a/pl/math/v_atanhf_3u1.c b/pl/math/v_atanhf_3u1.c
new file mode 100644
index 0000000..54dcb9b
--- /dev/null
+++ b/pl/math/v_atanhf_3u1.c
@@ -0,0 +1,88 @@
+/*
+ * Single-precision vector atanh(x) function.
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "mathlib.h"
+
+#if V_SUPPORTED
+
+#define AbsMask 0x7fffffff
+#define Half 0x3f000000
+#define One 0x3f800000
+#define Four 0x40800000
+#define Ln2 0x1.62e43p-1f
+#define TinyBound 0x39800000 /* 0x1p-12, below which atanhf(x) rounds to x. */
+
+#define C(i) v_f32 (__log1pf_data.coeffs[i])
+
+static inline v_f32_t
+eval_poly (v_f32_t m)
+{
+ /* Approximate log(1+m) on [-0.25, 0.5] using Estrin scheme. */
+ v_f32_t p_12 = v_fma_f32 (m, C (1), C (0));
+ v_f32_t p_34 = v_fma_f32 (m, C (3), C (2));
+ v_f32_t p_56 = v_fma_f32 (m, C (5), C (4));
+ v_f32_t p_78 = v_fma_f32 (m, C (7), C (6));
+
+ v_f32_t m2 = m * m;
+ v_f32_t p_02 = v_fma_f32 (m2, p_12, m);
+ v_f32_t p_36 = v_fma_f32 (m2, p_56, p_34);
+ v_f32_t p_79 = v_fma_f32 (m2, C (8), p_78);
+
+ v_f32_t m4 = m2 * m2;
+ v_f32_t p_06 = v_fma_f32 (m4, p_36, p_02);
+
+ return v_fma_f32 (m4, m4 * p_79, p_06);
+}
+
+static inline v_f32_t
+log1pf_inline (v_f32_t x)
+{
+ /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no
+ special-case handling. See that file for details of the algorithm. */
+ v_f32_t m = x + 1.0f;
+ v_u32_t k = (v_as_u32_f32 (m) - 0x3f400000) & 0xff800000;
+ v_f32_t s = v_as_f32_u32 (v_u32 (Four) - k);
+ v_f32_t m_scale = v_as_f32_u32 (v_as_u32_f32 (x) - k)
+ + v_fma_f32 (v_f32 (0.25f), s, v_f32 (-1.0f));
+ v_f32_t p = eval_poly (m_scale);
+ v_f32_t scale_back = v_to_f32_u32 (k) * 0x1.0p-23f;
+ return v_fma_f32 (scale_back, v_f32 (Ln2), p);
+}
+
+/* Approximation for vector single-precision atanh(x) using modified log1p.
+ The maximum error is 3.08 ULP:
+ __v_atanhf(0x1.ff215p-5) got 0x1.ffcb7cp-5
+ want 0x1.ffcb82p-5. */
+VPCS_ATTR v_f32_t V_NAME (atanhf) (v_f32_t x)
+{
+ v_u32_t ix = v_as_u32_f32 (x);
+ v_f32_t halfsign
+ = v_as_f32_u32 (v_bsl_u32 (v_u32 (AbsMask), v_u32 (Half), ix));
+ v_u32_t iax = ix & AbsMask;
+
+ v_f32_t ax = v_as_f32_u32 (iax);
+
+#if WANT_ERRNO
+ v_u32_t special = v_cond_u32 ((iax >= One) | (iax <= TinyBound));
+ /* Side-step special cases by setting those lanes to 0, which will trigger no
+ exceptions. These will be fixed up later. */
+ if (unlikely (v_any_u32 (special)))
+ ax = v_sel_f32 (special, v_f32 (0), ax);
+#else
+ v_u32_t special = v_cond_u32 (iax >= One);
+#endif
+
+ v_f32_t y = halfsign * log1pf_inline ((2 * ax) / (1 - ax));
+
+ if (unlikely (v_any_u32 (special)))
+ return v_call_f32 (atanhf, x, y, special);
+ return y;
+}
+
+VPCS_ALIAS
+
+#endif
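As a usage note, not part of the patch: once the library is built with WANT_VMATH on an AArch64 toolchain that supports __vpcs, the vector variant can be called directly through the _ZGVnN4v_atanhf symbol declared in pl/math/include/mathlib.h. A minimal, hypothetical caller might look like the sketch below (the file name, include path, and link setup are assumptions):

    /* caller.c - hypothetical example; assumes pl/math/include is on the
       include path and the pl math library is linked in.  */
    #include <stdio.h>
    #include "mathlib.h"

    int
    main (void)
    {
      /* Four lanes, all strictly inside (-1, 1), so no special-case lanes.  */
      __f32x4_t x = { -0.75f, -0.25f, 0.25f, 0.75f };
      __f32x4_t y = _ZGVnN4v_atanhf (x);
      for (int i = 0; i < 4; i++)
        printf ("atanhf(%g) ~ %g\n", (double) x[i], (double) y[i]);
      return 0;
    }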
diff --git a/pl/math/v_math.h b/pl/math/v_math.h
index d4597c8..3ed6244 100644
--- a/pl/math/v_math.h
+++ b/pl/math/v_math.h
@@ -174,6 +174,11 @@ v_abs_f32 (v_f32_t x)
return __builtin_fabsf (x);
}
static inline v_u32_t
+v_bsl_u32 (v_u32_t m, v_u32_t x, v_u32_t y)
+{
+ return (y & ~m) | (x & m);
+}
+static inline v_u32_t
v_cagt_f32 (v_f32_t x, v_f32_t y)
{
return fabsf (x) > fabsf (y);
@@ -536,6 +541,11 @@ v_abs_f32 (v_f32_t x)
return vabsq_f32 (x);
}
static inline v_u32_t
+v_bsl_u32 (v_u32_t m, v_u32_t x, v_u32_t y)
+{
+ return vbslq_u32 (m, x, y);
+}
+static inline v_u32_t
v_cagt_f32 (v_f32_t x, v_f32_t y)
{
return vcagtq_f32 (x, y);
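The new v_bsl_u32 helper is a per-bit select - vbslq_u32 on Neon, mask arithmetic in the scalar fallback - taking bits of x where the mask m is set and bits of y elsewhere. __v_atanhf uses it with AbsMask to splice the sign bit of the input onto 0.5. A scalar sketch of that halfsign construction, for illustration only (bsl_u32 and halfsign_of are our names, not part of the patch):

    #include <stdint.h>
    #include <string.h>

    /* Per-bit select, mirroring the scalar fallback of v_bsl_u32.  */
    static inline uint32_t
    bsl_u32 (uint32_t m, uint32_t x, uint32_t y)
    {
      return (y & ~m) | (x & m);
    }

    /* Build +/-0.5 carrying the sign of x: the magnitude bits (AbsMask)
       come from 0.5f (0x3f000000), the sign bit comes from x.  */
    static inline float
    halfsign_of (float x)
    {
      uint32_t ix, ih;
      float h;
      memcpy (&ix, &x, sizeof (ix));
      ih = bsl_u32 (0x7fffffff, 0x3f000000, ix);
      memcpy (&h, &ih, sizeof (h));
      return h;
    }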
diff --git a/pl/math/vn_atanhf_3u1.c b/pl/math/vn_atanhf_3u1.c
new file mode 100644
index 0000000..d4ad391
--- /dev/null
+++ b/pl/math/vn_atanhf_3u1.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_atanhf.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS strong_alias (__vn_atanhf, _ZGVnN4v_atanhf)
+#include "v_atanhf_3u1.c"
+#endif