pl/math: Add scalar and vector/Neon atanhf

Both routines are based on a simplified version of log1pf, and are accurate to 3.1 ULP. Also enabled -c flag from runulp.sh - we need this for atanhf so that we can set the control lane to something other than 1, since atanh(1) is infinite.
author: Joe Ramsay <Joe.Ramsay@arm.com> 2022-11-22 13:40:07 +0000
committer: Joe Ramsay <joe.ramsay@arm.com> 2022-11-22 13:40:07 +0000
commit: d5844d863d498cb1c43fde4ae3e47f3eeb5d0024 (patch)
tree: 83c1b8c9ae3cea23c50f8b1288594af31e8922fc
parent: 9ace52019cab223febddb3fcd166b6b4ba64081f (diff)
download: arm-optimized-routines-d5844d863d498cb1c43fde4ae3e47f3eeb5d0024.tar.gz
11 files changed, 253 insertions, 2 deletions
diff --git a/pl/math/atanhf_3u1.c b/pl/math/atanhf_3u1.c
new file mode 100644
index 0000000..77795c8
--- /dev/null
+++ b/pl/math/atanhf_3u1.c
@@ -0,0 +1,76 @@
+/*
+ * Single-precision atanh(x) function.
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+#include "mathlib.h"
+
+#define AbsMask 0x7fffffff
+#define Half 0x3f000000
+#define One 0x3f800000
+#define Four 0x40800000
+#define Ln2 0x1.62e43p-1f
+#define TinyBound 0x39800000 /* 0x1p-12, below which atanhf(x) rounds to x. */
+
+#define C(i) __log1pf_data.coeffs[i]
+
+static inline float
+eval_poly (float m)
+{
+  /* Approximate log(1+m) on [-0.25, 0.5] using Estrin scheme.  */
+  float p_12 = fmaf (m, C (1), C (0));
+  float p_34 = fmaf (m, C (3), C (2));
+  float p_56 = fmaf (m, C (5), C (4));
+  float p_78 = fmaf (m, C (7), C (6));
+
+  float m2 = m * m;
+  float p_02 = fmaf (m2, p_12, m);
+  float p_36 = fmaf (m2, p_56, p_34);
+  float p_79 = fmaf (m2, C (8), p_78);
+
+  float m4 = m2 * m2;
+  float p_06 = fmaf (m4, p_36, p_02);
+
+  return fmaf (m4 * p_79, m4, p_06);
+}
+
+static inline float
+log1pf_inline (float x)
+{
+  /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no
+     special-case handling. See that file for details of the algorithm.  */
+  float m = x + 1.0f;
+  int k = (asuint (m) - 0x3f400000) & 0xff800000;
+  float s = asfloat (Four - k);
+  float m_scale = asfloat (asuint (x) - k) + fmaf (0.25f, s, -1.0f);
+  float p = eval_poly (m_scale);
+  float scale_back = (float) k * 0x1.0p-23f;
+  return fmaf (scale_back, Ln2, p);
+}
+
+/* Approximation for single-precision inverse tanh(x), using a simplified
+   version of log1p. Maximum error is 3.08 ULP:
+   atanhf(0x1.ff0d5p-5) got 0x1.ffb768p-5
+		       want 0x1.ffb76ep-5.  */
+float
+atanhf (float x)
+{
+  uint32_t ix = asuint (x);
+  uint32_t iax = ix & AbsMask;
+  uint32_t sign = ix & ~AbsMask;
+
+  if (unlikely (iax < TinyBound))
+    return x;
+
+  if (iax == One)
+    return __math_divzero (sign);
+
+  if (unlikely (iax > One))
+    return __math_invalidf (x);
+
+  float halfsign = asfloat (Half | sign);
+  float ax = asfloat (iax);
+  return halfsign * log1pf_inline ((2 * ax) / (1 - ax));
+}
diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h
index e74cd6f..f9faf2f 100644
--- a/pl/math/include/mathlib.h
+++ b/pl/math/include/mathlib.h
@@ -12,6 +12,7 @@
 float acoshf (float);
 float asinhf (float);
 float atan2f (float, float);
+float atanhf (float);
 float coshf (float);
 float erfcf (float);
 float erff (float);
@@ -34,6 +35,7 @@ double sinh (double);
 float __s_asinhf (float);
 float __s_atanf (float);
 float __s_atan2f (float, float);
+float __s_atanhf (float);
 float __s_coshf (float);
 float __s_erfcf (float);
 float __s_erff (float);
@@ -72,6 +74,7 @@ __f32x4_t __v_atanf (__f32x4_t);
 __f64x2_t __v_atan (__f64x2_t);
 __f32x4_t __v_atan2f (__f32x4_t, __f32x4_t);
 __f64x2_t __v_atan2 (__f64x2_t, __f64x2_t);
+__f32x4_t __v_atanhf (__f32x4_t);
 __f32x4_t __v_coshf (__f32x4_t);
 __f64x2_t __v_cosh (__f64x2_t);
 __f32x4_t __v_erff (__f32x4_t);
@@ -99,6 +102,7 @@ __vpcs __f32x4_t __vn_atanf (__f32x4_t);
 __vpcs __f64x2_t __vn_atan (__f64x2_t);
 __vpcs __f32x4_t __vn_atan2f (__f32x4_t, __f32x4_t);
 __vpcs __f64x2_t __vn_atan2 (__f64x2_t, __f64x2_t);
+__vpcs __f32x4_t __vn_atanhf (__f32x4_t);
 __vpcs __f32x4_t __vn_coshf (__f32x4_t);
 __vpcs __f64x2_t __vn_cosh (__f64x2_t);
 __vpcs __f32x4_t __vn_erff (__f32x4_t);
@@ -123,6 +127,7 @@ __vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t);
 __vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t);
 __vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t);
 __vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t);
+__vpcs __f32x4_t _ZGVnN4v_atanhf (__f32x4_t);
 __vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t);
 __vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t);
 __vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t);
diff --git a/pl/math/s_atanhf_3u1.c b/pl/math/s_atanhf_3u1.c
new file mode 100644
index 0000000..9f75962
--- /dev/null
+++ b/pl/math/s_atanhf_3u1.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_atanhf_3u1.c"
diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h
index 583c8fb..de009a3 100644
--- a/pl/math/test/mathbench_funcs.h
+++ b/pl/math/test/mathbench_funcs.h
@@ -30,6 +30,7 @@ F (acoshf, 1.0, 10.0)
 F (asinhf, -10.0, 10.0)
 F (atanf, -10.0, 10.0)
 {"atan2f", 'f', 0, -10.0, 10.0, {.f = atan2f_wrap}},
+F (atanhf, -1.0, 1.0)
 F (cosf, -3.1, 3.1)
 F (coshf, -10.0, 10.0)
 F (erfcf, -4.0, 10.0)
@@ -61,6 +62,7 @@ D (sinh, -10.0, 10.0)
 #if WANT_VMATH
 ZVNF (asinhf, -10.0, 10.0)
 ZVNF (atanf, -10.0, 10.0)
+ZVNF (atanhf, -1.0, 1.0)
 ZVND (atan, -10.0, 10.0)
 ZVNF (coshf, -10.0, 10.0)
 ZVND (cosh, -10.0, 10.0)
diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh
index 02a5a97..53b4a43 100755
--- a/pl/math/test/runulp.sh
+++ b/pl/math/test/runulp.sh
@@ -203,6 +203,14 @@ t cosh -0x1.61da04cbafe44p+9 -0x1p10               1000
 t cosh  0x1p10                inf                  100
 t cosh -0x1p10               -inf                  100
 
+L=2.59
+t atanhf  0        0x1p-12 500
+t atanhf  0x1p-12  1       200000
+t atanhf  1        inf     1000
+t atanhf -0       -0x1p-12 500
+t atanhf -0x1p-12 -1       200000
+t atanhf -1       -inf     1000
+
 done
 
 # vector functions
@@ -424,6 +432,15 @@ range_cosh='
  -0x1.6p9 -inf       1000
 '
 
+range_atanhf='
+  0        0x1p-12 500
+  0x1p-12  1       200000
+  1        inf     1000
+ -0       -0x1p-12 500
+ -0x1p-12 -1       200000
+ -1       -inf     1000
+'
+
 range_sve_cosf='
  0    0xffff0000    10000
  0x1p-4    0x1p4    500000
@@ -589,6 +606,7 @@ L_coshf=1.89
 L_expm1=1.68
 L_sinh=2.08
 L_cosh=1.43
+L_atanhf=2.59
 
 L_sve_cosf=1.57
 L_sve_cos=1.61
@@ -608,7 +626,7 @@ L_sve_erf=1.97
 L_sve_tanf=2.7
 L_sve_erfc=3.15
 
-while read G F R D
+while read G F R D A
 do
 	[ "$R" = 1 ] && { [[ $G != sve_* ]] || [ $WANT_SVE_MATH -eq 1 ]; } || continue
 	case "$G" in \#*) continue ;; esac
@@ -630,13 +648,17 @@ do
 		if [ $WANT_ERRNO -eq 1 ]; then
 			if [ "$D" = "fenv" ]; then
 				f=""
+			elif [ "$D" = "nofenv" ]; then
+				# Need to pass this if you want additional
+				# arguments but keep fenv checking disabled.
+				f="-f"
 			elif [ ! -z "$D" ]; then
 				echo "Unrecognised 4th argument: $D"
 				exit 1
 			fi
 		fi
 		case "$X" in \#*) continue ;; esac
-		t $f $F $X
+		t $A $f $F $X
 	done << EOF
 $range
 EOF
@@ -732,6 +754,10 @@ coshf  __s_coshf       $runs    fenv
 coshf  __v_coshf       $runv    fenv
 coshf  __vn_coshf      $runvn   fenv
 coshf  _ZGVnN4v_coshf  $runvn   fenv
+atanhf __s_atanhf      $runs    fenv -c 0
+atanhf __v_atanhf      $runv    fenv -c 0
+atanhf __vn_atanhf     $runvn   fenv -c 0
+atanhf _ZGVnN4v_atanhf $runvn   fenv -c 0
 
 sve_cosf     __sv_cosf         $runsv
 sve_cosf     _ZGVsMxv_cosf     $runsv
diff --git a/pl/math/test/testcases/directed/atanhf.tst b/pl/math/test/testcases/directed/atanhf.tst
new file mode 100644
index 0000000..616b59d
--- /dev/null
+++ b/pl/math/test/testcases/directed/atanhf.tst
@@ -0,0 +1,23 @@
+; atanhf.tst
+;
+; Copyright 2009-2022, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=atanhf op1=7fc00001 result=7fc00001 errno=0
+func=atanhf op1=ffc00001 result=7fc00001 errno=0
+func=atanhf op1=7f800001 result=7fc00001 errno=0 status=i
+func=atanhf op1=ff800001 result=7fc00001 errno=0 status=i
+func=atanhf op1=7f800000 result=7fc00001 errno=EDOM status=i
+func=atanhf op1=ff800000 result=7fc00001 errno=EDOM status=i
+func=atanhf op1=3f800001 result=7fc00001 errno=EDOM status=i
+func=atanhf op1=bf800001 result=7fc00001 errno=EDOM status=i
+func=atanhf op1=3f800000 result=7f800000 errno=ERANGE status=z
+func=atanhf op1=bf800000 result=ff800000 errno=ERANGE status=z
+func=atanhf op1=00000000 result=00000000 errno=0
+func=atanhf op1=80000000 result=80000000 errno=0
+
+; No exception is raised with certain versions of glibc. Functions
+; approximated by x near zero may not generate/implement flops and
+; thus may not raise exceptions.
+func=atanhf op1=00000001 result=00000001 errno=0 maybestatus=ux
+func=atanhf op1=80000001 result=80000001 errno=0 maybestatus=ux
diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h
index 1b674a6..84f4182 100644
--- a/pl/math/test/ulp_funcs.h
+++ b/pl/math/test/ulp_funcs.h
@@ -36,6 +36,7 @@
 F1 (acosh)
 F1 (asinh)
 F2 (atan2)
+F1 (atanh)
 F1 (cosh)
 F1 (erfc)
 F1 (erf)
@@ -59,6 +60,7 @@ _ZVNF1 (atan)
 _ZVND1 (atan)
 _ZVNF2 (atan2)
 _ZVND2 (atan2)
+_ZVNF1 (atanh)
 _ZVNF1 (cosh)
 _ZVND1 (cosh)
 _ZVNF1 (erf)
diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h
index a9f2d15..0ee07e9 100644
--- a/pl/math/test/ulp_wrappers.h
+++ b/pl/math/test/ulp_wrappers.h
@@ -118,6 +118,7 @@ DECL_POW_INT_REF(ref_powi, long double, double, int)
 ZVNF1_WRAP(asinh)
 ZVNF1_WRAP(atan)
 ZVNF2_WRAP(atan2)
+ZVNF1_WRAP(atanh)
 ZVNF1_WRAP(cosh)
 ZVNF1_WRAP(erf)
 ZVNF1_WRAP(erfc)
diff --git a/pl/math/v_atanhf_3u1.c b/pl/math/v_atanhf_3u1.c
new file mode 100644
index 0000000..54dcb9b
--- /dev/null
+++ b/pl/math/v_atanhf_3u1.c
@@ -0,0 +1,88 @@
+/*
+ * Single-precision vector atanh(x) function.
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "mathlib.h"
+
+#if V_SUPPORTED
+
+#define AbsMask 0x7fffffff
+#define Half 0x3f000000
+#define One 0x3f800000
+#define Four 0x40800000
+#define Ln2 0x1.62e43p-1f
+#define TinyBound 0x39800000 /* 0x1p-12, below which atanhf(x) rounds to x. */
+
+#define C(i) v_f32 (__log1pf_data.coeffs[i])
+
+static inline v_f32_t
+eval_poly (v_f32_t m)
+{
+  /* Approximate log(1+m) on [-0.25, 0.5] using Estrin scheme.  */
+  v_f32_t p_12 = v_fma_f32 (m, C (1), C (0));
+  v_f32_t p_34 = v_fma_f32 (m, C (3), C (2));
+  v_f32_t p_56 = v_fma_f32 (m, C (5), C (4));
+  v_f32_t p_78 = v_fma_f32 (m, C (7), C (6));
+
+  v_f32_t m2 = m * m;
+  v_f32_t p_02 = v_fma_f32 (m2, p_12, m);
+  v_f32_t p_36 = v_fma_f32 (m2, p_56, p_34);
+  v_f32_t p_79 = v_fma_f32 (m2, C (8), p_78);
+
+  v_f32_t m4 = m2 * m2;
+  v_f32_t p_06 = v_fma_f32 (m4, p_36, p_02);
+
+  return v_fma_f32 (m4, m4 * p_79, p_06);
+}
+
+static inline v_f32_t
+log1pf_inline (v_f32_t x)
+{
+  /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no
+     special-case handling. See that file for details of the algorithm.  */
+  v_f32_t m = x + 1.0f;
+  v_u32_t k = (v_as_u32_f32 (m) - 0x3f400000) & 0xff800000;
+  v_f32_t s = v_as_f32_u32 (v_u32 (Four) - k);
+  v_f32_t m_scale = v_as_f32_u32 (v_as_u32_f32 (x) - k)
+		    + v_fma_f32 (v_f32 (0.25f), s, v_f32 (-1.0f));
+  v_f32_t p = eval_poly (m_scale);
+  v_f32_t scale_back = v_to_f32_u32 (k) * 0x1.0p-23f;
+  return v_fma_f32 (scale_back, v_f32 (Ln2), p);
+}
+
+/* Approximation for vector single-precision atanh(x) using modified log1p.
+   The maximum error is 3.08 ULP:
+   __v_atanhf(0x1.ff215p-5) got 0x1.ffcb7cp-5
+			   want 0x1.ffcb82p-5.  */
+VPCS_ATTR v_f32_t V_NAME (atanhf) (v_f32_t x)
+{
+  v_u32_t ix = v_as_u32_f32 (x);
+  v_f32_t halfsign
+    = v_as_f32_u32 (v_bsl_u32 (v_u32 (AbsMask), v_u32 (Half), ix));
+  v_u32_t iax = ix & AbsMask;
+
+  v_f32_t ax = v_as_f32_u32 (iax);
+
+#if WANT_ERRNO
+  v_u32_t special = v_cond_u32 ((iax >= One) | (iax <= TinyBound));
+  /* Side-step special cases by setting those lanes to 0, which will trigger no
+     exceptions. These will be fixed up later.  */
+  if (unlikely (v_any_u32 (special)))
+    ax = v_sel_f32 (special, v_f32 (0), ax);
+#else
+  v_u32_t special = v_cond_u32 (iax >= One);
+#endif
+
+  v_f32_t y = halfsign * log1pf_inline ((2 * ax) / (1 - ax));
+
+  if (unlikely (v_any_u32 (special)))
+    return v_call_f32 (atanhf, x, y, special);
+  return y;
+}
+
+VPCS_ALIAS
+
+#endif
diff --git a/pl/math/v_math.h b/pl/math/v_math.h
index d4597c8..3ed6244 100644
--- a/pl/math/v_math.h
+++ b/pl/math/v_math.h
@@ -174,6 +174,11 @@ v_abs_f32 (v_f32_t x)
   return __builtin_fabsf (x);
 }
 static inline v_u32_t
+v_bsl_u32 (v_u32_t m, v_u32_t x, v_u32_t y)
+{
+  return (y & ~m) | (x & m);
+}
+static inline v_u32_t
 v_cagt_f32 (v_f32_t x, v_f32_t y)
 {
   return fabsf (x) > fabsf (y);
@@ -536,6 +541,11 @@ v_abs_f32 (v_f32_t x)
   return vabsq_f32 (x);
 }
 static inline v_u32_t
+v_bsl_u32 (v_u32_t m, v_u32_t x, v_u32_t y)
+{
+  return vbslq_u32 (m, x, y);
+}
+static inline v_u32_t
 v_cagt_f32 (v_f32_t x, v_f32_t y)
 {
   return vcagtq_f32 (x, y);
diff --git a/pl/math/vn_atanhf_3u1.c b/pl/math/vn_atanhf_3u1.c
new file mode 100644
index 0000000..d4ad391
--- /dev/null
+++ b/pl/math/vn_atanhf_3u1.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_atanhf.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS strong_alias (__vn_atanhf, _ZGVnN4v_atanhf)
+#include "v_atanhf_3u1.c"
+#endif
author	Joe Ramsay <Joe.Ramsay@arm.com>	2022-11-22 13:40:07 +0000
committer	Joe Ramsay <joe.ramsay@arm.com>	2022-11-22 13:40:07 +0000
commit	d5844d863d498cb1c43fde4ae3e47f3eeb5d0024 (patch)
tree	83c1b8c9ae3cea23c50f8b1288594af31e8922fc
parent	9ace52019cab223febddb3fcd166b6b4ba64081f (diff)
download	arm-optimized-routines-d5844d863d498cb1c43fde4ae3e47f3eeb5d0024.tar.gz