pl/math: Add scalar and vector/Neon tanhf

Both routines use simplified inline versions of expm1f, and are accurate to 2.6 ULP.
author: Joe Ramsay <Joe.Ramsay@arm.com> 2022-11-30 09:42:41 +0000
committer: Joe Ramsay <joe.ramsay@arm.com> 2022-11-30 09:42:41 +0000
commit: 8a644bf15812edaba38b41ca142e8e7e328e7918 (patch)
tree: 1b44a7f8a954f663b41e84cc6fd8909bd6645f55
parent: 8a0d24f8af39bda7b54341067c6ccc8a7f12ff27 (diff)
download: arm-optimized-routines-8a644bf15812edaba38b41ca142e8e7e328e7918.tar.gz
10 files changed, 243 insertions, 0 deletions
diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h
index 6721d45..1266eb7 100644
--- a/pl/math/include/mathlib.h
+++ b/pl/math/include/mathlib.h
@@ -22,6 +22,7 @@ float log10f (float);
 float log1pf (float);
 float sinhf (float);
 float tanf (float);
+float tanhf (float);
 
 double acosh (double);
 double asinh (double);
@@ -47,6 +48,7 @@ float __s_log1pf (float);
 float __s_log2f (float);
 float __s_sinhf (float);
 float __s_tanf (float);
+float __s_tanhf (float);
 
 double __s_asinh (double);
 double __s_atan (double);
@@ -97,6 +99,7 @@ __f64x2_t __v_log2 (__f64x2_t);
 __f32x4_t __v_sinhf (__f32x4_t);
 __f64x2_t __v_sinh (__f64x2_t);
 __f32x4_t __v_tanf (__f32x4_t);
+__f32x4_t __v_tanhf (__f32x4_t);
 
 #if __GNUC__ >= 9 || __clang_major__ >= 8
 #define __vpcs __attribute__((__aarch64_vector_pcs__))
@@ -127,6 +130,7 @@ __vpcs __f64x2_t __vn_log2 (__f64x2_t);
 __vpcs __f32x4_t __vn_sinhf (__f32x4_t);
 __vpcs __f64x2_t __vn_sinh (__f64x2_t);
 __vpcs __f32x4_t __vn_tanf (__f32x4_t);
+__vpcs __f32x4_t __vn_tanhf (__f32x4_t);
 
 /* Vector functions following the vector PCS using ABI names.  */
 __vpcs __f32x4_t _ZGVnN4v_asinhf (__f32x4_t);
@@ -154,6 +158,7 @@ __vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t);
 __vpcs __f32x4_t _ZGVnN4v_sinhf (__f32x4_t);
 __vpcs __f64x2_t _ZGVnN2v_sinh (__f64x2_t);
 __vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_tanhf (__f32x4_t);
 
 #endif
 
diff --git a/pl/math/s_tanhf_2u6.c b/pl/math/s_tanhf_2u6.c
new file mode 100644
index 0000000..bbb4569
--- /dev/null
+++ b/pl/math/s_tanhf_2u6.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_tanhf_2u6.c"
diff --git a/pl/math/tanhf_2u6.c b/pl/math/tanhf_2u6.c
new file mode 100644
index 0000000..9db2533
--- /dev/null
+++ b/pl/math/tanhf_2u6.c
@@ -0,0 +1,80 @@
+/*
+ * Single-precision tanh(x) function.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "math_config.h"
+
+#define BoringBound                                                            \
+  0x41102cb3 /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for        \
+		negative).  */
+#define AbsMask 0x7fffffff
+#define One 0x3f800000
+
+#define Shift (0x1.8p23f)
+#define InvLn2 (0x1.715476p+0f)
+#define Ln2hi (0x1.62e4p-1f)
+#define Ln2lo (0x1.7f7d1cp-20f)
+
+#define C(i) __expm1f_poly[i]
+
+static inline float
+expm1f_inline (float x)
+{
+  /* Helper routine for calculating exp(x) - 1.
+     Copied from expm1f_1u6.c, with several simplifications:
+     - No special-case handling for tiny or special values, instead return early
+       from the main routine.
+     - No special handling for large values:
+       - No early return for infinity.
+       - Simpler combination of p and t in final stage of algorithm.
+       - |i| < 27, so can calculate t by simpler shift-and-add, instead of
+	 ldexpf (same as vector algorithm).  */
+
+  /* Reduce argument: f in [-ln2/2, ln2/2], i is exact.  */
+  float j = fmaf (InvLn2, x, Shift) - Shift;
+  int32_t i = j;
+  float f = fmaf (j, -Ln2hi, x);
+  f = fmaf (j, -Ln2lo, f);
+
+  /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).
+     Uses Estrin scheme, where the main expm1f routine uses Horner.  */
+  float f2 = f * f;
+  float p_01 = fmaf (f, C (1), C (0));
+  float p_23 = fmaf (f, C (3), C (2));
+  float p = fmaf (f2, p_23, p_01);
+  p = fmaf (f2 * f2, C (4), p);
+  p = fmaf (f2, p, f);
+
+  /* t = 2^i.  */
+  float t = asfloat ((i << 23) + One);
+  /* expm1(x) ~= p * t + (t - 1).  */
+  return fmaf (p, t, t - 1);
+}
+
+/* Approximation for single-precision tanh(x), using a simplified version of
+   expm1f. The maximum error is 2.58 ULP:
+   tanhf(0x1.fa5eep-5) got 0x1.f9ba02p-5
+		      want 0x1.f9ba08p-5.  */
+float
+tanhf (float x)
+{
+  uint32_t ix = asuint (x);
+  uint32_t iax = ix & AbsMask;
+  uint32_t sign = ix & ~AbsMask;
+
+  if (unlikely (iax > BoringBound))
+    {
+      if (iax > 0x7f800000)
+	return __math_invalidf (x);
+      return asfloat (One | sign);
+    }
+
+  if (unlikely (iax < 0x34000000))
+    return x;
+
+  /* tanh(x) = (e^2x - 1) / (e^2x + 1).  */
+  float q = expm1f_inline (2 * x);
+  return q / (q + 2);
+}
diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h
index b972f83..9e3b9a0 100644
--- a/pl/math/test/mathbench_funcs.h
+++ b/pl/math/test/mathbench_funcs.h
@@ -43,6 +43,7 @@ F (log2f, 0.01, 11.1)
 F (sinf, -3.1, 3.1)
 F (sinhf, -10.0, 10.0)
 F (tanf, -3.1, 3.1)
+F (tanhf, -10.0, 10.0)
 
 D (acosh, 1.0, 10.0)
 D (asinh, -10.0, 10.0)
@@ -84,6 +85,7 @@ ZVND (log2, 0.01, 11.1)
 ZVNF (sinhf, -10.0, 10.0)
 ZVND (sinh, -10.0, 10.0)
 ZVNF (tanf, -3.1, 3.1)
+ZVNF (tanhf, -10.0, 10.0)
 {"__s_atan2f", 'f', 0, -10.0, 10.0, {.f = __s_atan2f_wrap}},
 {"__s_atan2", 'd', 0, -10.0, 10.0, {.d = __s_atan2_wrap}},
 {"__v_atan2f", 'f', 'v', -10.0, 10.0, {.vf = __v_atan2f_wrap}},
diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh
index 5d0188d..484ebdf 100755
--- a/pl/math/test/runulp.sh
+++ b/pl/math/test/runulp.sh
@@ -215,6 +215,14 @@ L=1.03
 t cbrtf  0  inf 1000000
 t cbrtf -0 -inf 1000000
 
+L=2.09
+t tanhf  0              0x1p-23       1000
+t tanhf -0             -0x1p-23       1000
+t tanhf  0x1p-23        0x1.205966p+3 100000
+t tanhf -0x1p-23       -0x1.205966p+3 100000
+t tanhf  0x1.205966p+3  inf           100
+t tanhf -0x1.205966p+3 -inf           100
+
 done
 
 # vector functions
@@ -461,6 +469,15 @@ range_asinh='
  -0x1p511 -inf     40000
 '
 
+range_tanhf='
+  0              0x1p-23       1000
+ -0             -0x1p-23       1000
+  0x1p-23        0x1.205966p+3 100000
+ -0x1p-23       -0x1.205966p+3 100000
+  0x1.205966p+3  inf           100
+ -0x1.205966p+3 -inf           100
+'
+
 range_sve_cosf='
  0    0xffff0000    10000
  0x1p-4    0x1p4    500000
@@ -629,6 +646,7 @@ L_cosh=1.43
 L_atanhf=2.59
 L_cbrtf=1.03
 L_asinh=1.54
+L_tanhf=2.09
 
 L_sve_cosf=1.57
 L_sve_cos=1.61
@@ -796,6 +814,10 @@ asinh  _ZGVnN2v_asinh  $runvn   fenv -c 2
 asinh  __v_asinh       $runv    fenv -c 0x1p600
 asinh  __vn_asinh      $runvn   fenv -c 0x1p600
 asinh  _ZGVnN2v_asinh  $runvn   fenv -c 0x1p600
+tanhf  __s_tanhf       $runs    fenv
+tanhf  __v_tanhf       $runv    fenv
+tanhf  __vn_tanhf      $runvn   fenv
+tanhf  _ZGVnN4v_tanhf  $runvn   fenv
 
 sve_cosf     __sv_cosf         $runsv
 sve_cosf     _ZGVsMxv_cosf     $runsv
diff --git a/pl/math/test/testcases/directed/tanhf.tst b/pl/math/test/testcases/directed/tanhf.tst
new file mode 100644
index 0000000..c3edb50
--- /dev/null
+++ b/pl/math/test/testcases/directed/tanhf.tst
@@ -0,0 +1,20 @@
+; tanhf.tst
+;
+; Copyright 2007-2022, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=tanhf op1=7fc00001 result=7fc00001 errno=0
+func=tanhf op1=ffc00001 result=7fc00001 errno=0
+func=tanhf op1=7f800001 result=7fc00001 errno=0 status=i
+func=tanhf op1=ff800001 result=7fc00001 errno=0 status=i
+func=tanhf op1=7f800000 result=3f800000 errno=0
+func=tanhf op1=ff800000 result=bf800000 errno=0
+func=tanhf op1=00000000 result=00000000 errno=0
+func=tanhf op1=80000000 result=80000000 errno=0
+; No exception is raised with certain versions of glibc. Functions
+; approximated by x near zero may not generate/implement flops and
+; thus may not raise exceptions.
+; func=tanhf op1=00000001 result=00000001 errno=0 maybestatus=ux
+; func=tanhf op1=80000001 result=80000001 errno=0 maybestatus=ux
+func=tanhf op1=00000001 result=00000001 errno=0 maybestatus=ux
+func=tanhf op1=80000001 result=80000001 errno=0 maybestatus=ux
diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h
index 465630d..86e2bed 100644
--- a/pl/math/test/ulp_funcs.h
+++ b/pl/math/test/ulp_funcs.h
@@ -46,6 +46,7 @@ F1 (log10)
 F1 (log1p)
 F1 (sinh)
 F1 (tan)
+F1 (tanh)
 D1 (acosh)
 D1 (asinh)
 D2 (atan2)
@@ -81,6 +82,7 @@ _ZVND1 (log2)
 _ZVNF1 (sinh)
 _ZVND1 (sinh)
 _ZVNF1 (tan)
+_ZVNF1 (tanh)
 #if WANT_SVE_MATH
 _ZSVF2 (atan2)
 _ZSVD2 (atan2)
diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h
index 28ee251..be87c21 100644
--- a/pl/math/test/ulp_wrappers.h
+++ b/pl/math/test/ulp_wrappers.h
@@ -129,6 +129,7 @@ ZVNF1_WRAP(log1p)
 ZVNF1_WRAP(log2)
 ZVNF1_WRAP(sinh)
 ZVNF1_WRAP(tan)
+ZVNF1_WRAP(tanh)
 ZVND1_WRAP(asinh)
 ZVND1_WRAP(atan)
 ZVND2_WRAP(atan2)
diff --git a/pl/math/v_tanhf_2u6.c b/pl/math/v_tanhf_2u6.c
new file mode 100644
index 0000000..571fd8b
--- /dev/null
+++ b/pl/math/v_tanhf_2u6.c
@@ -0,0 +1,93 @@
+/*
+ * Single-precision vector tanh(x) function.
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "mathlib.h"
+
+#if V_SUPPORTED
+
+#define BoringBound                                                            \
+  0x41102cb3 /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for        \
+		negative).  */
+#define AbsMask 0x7fffffff
+#define One 0x3f800000
+
+#define Shift v_f32 (0x1.8p23f)
+#define InvLn2 v_f32 (0x1.715476p+0f)
+#define MLn2hi v_f32 (-0x1.62e4p-1f)
+#define MLn2lo v_f32 (-0x1.7f7d1cp-20f)
+
+#define C(i) v_f32 (__expm1f_poly[i])
+
+static inline v_f32_t
+expm1f_inline (v_f32_t x)
+{
+  /* Helper routine for calculating exp(x) - 1.
+     Copied from v_expm1f_1u6.c, with all special-case handling removed, as
+     special, tiny and large values are all dealt with in the main tanhf
+     routine.  */
+
+  /* Reduce argument: f in [-ln2/2, ln2/2], i is exact.  */
+  v_f32_t j = v_fma_f32 (InvLn2, x, Shift) - Shift;
+  v_s32_t i = v_to_s32_f32 (j);
+  v_f32_t f = v_fma_f32 (j, MLn2hi, x);
+  f = v_fma_f32 (j, MLn2lo, f);
+
+  /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).
+     Uses Estrin scheme, where the main __v_expm1f routine uses Horner.  */
+  v_f32_t f2 = f * f;
+  v_f32_t p_01 = v_fma_f32 (f, C (1), C (0));
+  v_f32_t p_23 = v_fma_f32 (f, C (3), C (2));
+  v_f32_t p = v_fma_f32 (f2, p_23, p_01);
+  p = v_fma_f32 (f2 * f2, C (4), p);
+  p = v_fma_f32 (f2, p, f);
+
+  /* t = 2^i.  */
+  v_f32_t t = v_as_f32_u32 (v_as_u32_s32 (i << 23) + One);
+  /* expm1(x) ~= p * t + (t - 1).  */
+  return v_fma_f32 (p, t, t - 1);
+}
+
+static NOINLINE v_f32_t
+special_case (v_f32_t x, v_f32_t y, v_u32_t special)
+{
+  return v_call_f32 (tanhf, x, y, special);
+}
+
+/* Approximation for single-precision vector tanh(x), using a simplified version
+   of expm1f. The maximum error is 2.58 ULP:
+   __v_tanhf(0x1.fa5eep-5) got 0x1.f9ba02p-5
+			  want 0x1.f9ba08p-5.  */
+VPCS_ATTR v_f32_t V_NAME (tanhf) (v_f32_t x)
+{
+  v_u32_t ix = v_as_u32_f32 (x);
+  v_u32_t iax = ix & AbsMask;
+  v_u32_t sign = ix & ~AbsMask;
+  v_u32_t is_boring = v_cond_u32 (iax > BoringBound);
+  v_f32_t boring = v_as_f32_u32 (sign | One);
+
+#if WANT_ERRNO
+  /* If errno needs to be set properly, set all special and boring lanes to 1,
+     which will trigger no exceptions, and fix them up later.  */
+  v_u32_t special = v_cond_u32 ((iax > 0x7f800000) | (iax < 0x34000000));
+  ix = v_sel_u32 (is_boring, v_u32 (One), ix);
+  if (unlikely (v_any_u32 (special)))
+    ix = v_sel_u32 (special, v_u32 (One), ix);
+#else
+  v_u32_t special = v_cond_u32 ((iax > 0x7f800000) | (iax == 0));
+#endif
+
+  /* tanh(x) = (e^2x - 1) / (e^2x + 1).  */
+  v_f32_t q = expm1f_inline (2 * v_as_f32_u32 (ix));
+  v_f32_t y = q / (q + 2);
+  y = v_sel_f32 (is_boring, boring, y);
+  if (unlikely (v_any_u32 (special)))
+    return special_case (x, y, special);
+  return y;
+}
+VPCS_ALIAS
+
+#endif
diff --git a/pl/math/vn_tanhf_2u6.c b/pl/math/vn_tanhf_2u6.c
new file mode 100644
index 0000000..96fd67a
--- /dev/null
+++ b/pl/math/vn_tanhf_2u6.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_tanhf.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS strong_alias (__vn_tanhf, _ZGVnN4v_tanhf)
+#include "v_tanhf_2u6.c"
+#endif
author	Joe Ramsay <Joe.Ramsay@arm.com>	2022-11-30 09:42:41 +0000
committer	Joe Ramsay <joe.ramsay@arm.com>	2022-11-30 09:42:41 +0000
commit	8a644bf15812edaba38b41ca142e8e7e328e7918 (patch)
tree	1b44a7f8a954f663b41e84cc6fd8909bd6645f55
parent	8a0d24f8af39bda7b54341067c6ccc8a7f12ff27 (diff)
download	arm-optimized-routines-8a644bf15812edaba38b41ca142e8e7e328e7918.tar.gz