aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJoe Ramsay <Joe.Ramsay@arm.com>2022-11-30 09:42:41 +0000
committerJoe Ramsay <joe.ramsay@arm.com>2022-11-30 09:42:41 +0000
commit8a644bf15812edaba38b41ca142e8e7e328e7918 (patch)
tree1b44a7f8a954f663b41e84cc6fd8909bd6645f55
parent8a0d24f8af39bda7b54341067c6ccc8a7f12ff27 (diff)
downloadarm-optimized-routines-8a644bf15812edaba38b41ca142e8e7e328e7918.tar.gz
pl/math: Add scalar and vector/Neon tanhf
Both routines use simplified inline versions of expm1f, and are accurate to 2.6 ULP.
-rw-r--r--pl/math/include/mathlib.h5
-rw-r--r--pl/math/s_tanhf_2u6.c6
-rw-r--r--pl/math/tanhf_2u6.c80
-rw-r--r--pl/math/test/mathbench_funcs.h2
-rwxr-xr-xpl/math/test/runulp.sh22
-rw-r--r--pl/math/test/testcases/directed/tanhf.tst20
-rw-r--r--pl/math/test/ulp_funcs.h2
-rw-r--r--pl/math/test/ulp_wrappers.h1
-rw-r--r--pl/math/v_tanhf_2u6.c93
-rw-r--r--pl/math/vn_tanhf_2u6.c12
10 files changed, 243 insertions, 0 deletions
diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h
index 6721d45..1266eb7 100644
--- a/pl/math/include/mathlib.h
+++ b/pl/math/include/mathlib.h
@@ -22,6 +22,7 @@ float log10f (float);
float log1pf (float);
float sinhf (float);
float tanf (float);
+float tanhf (float);
double acosh (double);
double asinh (double);
@@ -47,6 +48,7 @@ float __s_log1pf (float);
float __s_log2f (float);
float __s_sinhf (float);
float __s_tanf (float);
+float __s_tanhf (float);
double __s_asinh (double);
double __s_atan (double);
@@ -97,6 +99,7 @@ __f64x2_t __v_log2 (__f64x2_t);
__f32x4_t __v_sinhf (__f32x4_t);
__f64x2_t __v_sinh (__f64x2_t);
__f32x4_t __v_tanf (__f32x4_t);
+__f32x4_t __v_tanhf (__f32x4_t);
#if __GNUC__ >= 9 || __clang_major__ >= 8
#define __vpcs __attribute__((__aarch64_vector_pcs__))
@@ -127,6 +130,7 @@ __vpcs __f64x2_t __vn_log2 (__f64x2_t);
__vpcs __f32x4_t __vn_sinhf (__f32x4_t);
__vpcs __f64x2_t __vn_sinh (__f64x2_t);
__vpcs __f32x4_t __vn_tanf (__f32x4_t);
+__vpcs __f32x4_t __vn_tanhf (__f32x4_t);
/* Vector functions following the vector PCS using ABI names. */
__vpcs __f32x4_t _ZGVnN4v_asinhf (__f32x4_t);
@@ -154,6 +158,7 @@ __vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t);
__vpcs __f32x4_t _ZGVnN4v_sinhf (__f32x4_t);
__vpcs __f64x2_t _ZGVnN2v_sinh (__f64x2_t);
__vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_tanhf (__f32x4_t);
#endif
diff --git a/pl/math/s_tanhf_2u6.c b/pl/math/s_tanhf_2u6.c
new file mode 100644
index 0000000..bbb4569
--- /dev/null
+++ b/pl/math/s_tanhf_2u6.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_tanhf_2u6.c"
diff --git a/pl/math/tanhf_2u6.c b/pl/math/tanhf_2u6.c
new file mode 100644
index 0000000..9db2533
--- /dev/null
+++ b/pl/math/tanhf_2u6.c
@@ -0,0 +1,80 @@
+/*
+ * Single-precision tanh(x) function.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "math_config.h"
+
+#define BoringBound \
+ 0x41102cb3 /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for \
+ negative). */
+#define AbsMask 0x7fffffff
+#define One 0x3f800000
+
+#define Shift (0x1.8p23f)
+#define InvLn2 (0x1.715476p+0f)
+#define Ln2hi (0x1.62e4p-1f)
+#define Ln2lo (0x1.7f7d1cp-20f)
+
+#define C(i) __expm1f_poly[i]
+
+static inline float
+expm1f_inline (float x)
+{
+ /* Helper routine for calculating exp(x) - 1.
+ Copied from expm1f_1u6.c, with several simplifications:
+ - No special-case handling for tiny or special values, instead return early
+ from the main routine.
+ - No special handling for large values:
+ - No early return for infinity.
+ - Simpler combination of p and t in final stage of algorithm.
+ - |i| < 27, so can calculate t by simpler shift-and-add, instead of
+ ldexpf (same as vector algorithm). */
+
+ /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
+ float j = fmaf (InvLn2, x, Shift) - Shift;
+ int32_t i = j;
+ float f = fmaf (j, -Ln2hi, x);
+ f = fmaf (j, -Ln2lo, f);
+
+ /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).
+ Uses Estrin scheme, where the main expm1f routine uses Horner. */
+ float f2 = f * f;
+ float p_01 = fmaf (f, C (1), C (0));
+ float p_23 = fmaf (f, C (3), C (2));
+ float p = fmaf (f2, p_23, p_01);
+ p = fmaf (f2 * f2, C (4), p);
+ p = fmaf (f2, p, f);
+
+ /* t = 2^i. */
+ float t = asfloat ((i << 23) + One);
+ /* expm1(x) ~= p * t + (t - 1). */
+ return fmaf (p, t, t - 1);
+}
+
+/* Approximation for single-precision tanh(x), using a simplified version of
+ expm1f. The maximum error is 2.58 ULP:
+ tanhf(0x1.fa5eep-5) got 0x1.f9ba02p-5
+ want 0x1.f9ba08p-5. */
+float
+tanhf (float x)
+{
+ uint32_t ix = asuint (x);
+ uint32_t iax = ix & AbsMask;
+ uint32_t sign = ix & ~AbsMask;
+
+ if (unlikely (iax > BoringBound))
+ {
+ if (iax > 0x7f800000)
+ return __math_invalidf (x);
+ return asfloat (One | sign);
+ }
+
+ if (unlikely (iax < 0x34000000))
+ return x;
+
+ /* tanh(x) = (e^2x - 1) / (e^2x + 1). */
+ float q = expm1f_inline (2 * x);
+ return q / (q + 2);
+}
diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h
index b972f83..9e3b9a0 100644
--- a/pl/math/test/mathbench_funcs.h
+++ b/pl/math/test/mathbench_funcs.h
@@ -43,6 +43,7 @@ F (log2f, 0.01, 11.1)
F (sinf, -3.1, 3.1)
F (sinhf, -10.0, 10.0)
F (tanf, -3.1, 3.1)
+F (tanhf, -10.0, 10.0)
D (acosh, 1.0, 10.0)
D (asinh, -10.0, 10.0)
@@ -84,6 +85,7 @@ ZVND (log2, 0.01, 11.1)
ZVNF (sinhf, -10.0, 10.0)
ZVND (sinh, -10.0, 10.0)
ZVNF (tanf, -3.1, 3.1)
+ZVNF (tanhf, -10.0, 10.0)
{"__s_atan2f", 'f', 0, -10.0, 10.0, {.f = __s_atan2f_wrap}},
{"__s_atan2", 'd', 0, -10.0, 10.0, {.d = __s_atan2_wrap}},
{"__v_atan2f", 'f', 'v', -10.0, 10.0, {.vf = __v_atan2f_wrap}},
diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh
index 5d0188d..484ebdf 100755
--- a/pl/math/test/runulp.sh
+++ b/pl/math/test/runulp.sh
@@ -215,6 +215,14 @@ L=1.03
t cbrtf 0 inf 1000000
t cbrtf -0 -inf 1000000
+L=2.09
+t tanhf 0 0x1p-23 1000
+t tanhf -0 -0x1p-23 1000
+t tanhf 0x1p-23 0x1.205966p+3 100000
+t tanhf -0x1p-23 -0x1.205966p+3 100000
+t tanhf 0x1.205966p+3 inf 100
+t tanhf -0x1.205966p+3 -inf 100
+
done
# vector functions
@@ -461,6 +469,15 @@ range_asinh='
-0x1p511 -inf 40000
'
+range_tanhf='
+ 0 0x1p-23 1000
+ -0 -0x1p-23 1000
+ 0x1p-23 0x1.205966p+3 100000
+ -0x1p-23 -0x1.205966p+3 100000
+ 0x1.205966p+3 inf 100
+ -0x1.205966p+3 -inf 100
+'
+
range_sve_cosf='
0 0xffff0000 10000
0x1p-4 0x1p4 500000
@@ -629,6 +646,7 @@ L_cosh=1.43
L_atanhf=2.59
L_cbrtf=1.03
L_asinh=1.54
+L_tanhf=2.09
L_sve_cosf=1.57
L_sve_cos=1.61
@@ -796,6 +814,10 @@ asinh _ZGVnN2v_asinh $runvn fenv -c 2
asinh __v_asinh $runv fenv -c 0x1p600
asinh __vn_asinh $runvn fenv -c 0x1p600
asinh _ZGVnN2v_asinh $runvn fenv -c 0x1p600
+tanhf __s_tanhf $runs fenv
+tanhf __v_tanhf $runv fenv
+tanhf __vn_tanhf $runvn fenv
+tanhf _ZGVnN4v_tanhf $runvn fenv
sve_cosf __sv_cosf $runsv
sve_cosf _ZGVsMxv_cosf $runsv
diff --git a/pl/math/test/testcases/directed/tanhf.tst b/pl/math/test/testcases/directed/tanhf.tst
new file mode 100644
index 0000000..c3edb50
--- /dev/null
+++ b/pl/math/test/testcases/directed/tanhf.tst
@@ -0,0 +1,20 @@
+; tanhf.tst
+;
+; Copyright 2007-2022, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=tanhf op1=7fc00001 result=7fc00001 errno=0
+func=tanhf op1=ffc00001 result=7fc00001 errno=0
+func=tanhf op1=7f800001 result=7fc00001 errno=0 status=i
+func=tanhf op1=ff800001 result=7fc00001 errno=0 status=i
+func=tanhf op1=7f800000 result=3f800000 errno=0
+func=tanhf op1=ff800000 result=bf800000 errno=0
+func=tanhf op1=00000000 result=00000000 errno=0
+func=tanhf op1=80000000 result=80000000 errno=0
+; No exception is raised with certain versions of glibc. Functions
+; approximated by x near zero may not generate/implement flops and
+; thus may not raise exceptions.
+; func=tanhf op1=00000001 result=00000001 errno=0 maybestatus=ux
+; func=tanhf op1=80000001 result=80000001 errno=0 maybestatus=ux
+func=tanhf op1=00000001 result=00000001 errno=0 maybestatus=ux
+func=tanhf op1=80000001 result=80000001 errno=0 maybestatus=ux
diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h
index 465630d..86e2bed 100644
--- a/pl/math/test/ulp_funcs.h
+++ b/pl/math/test/ulp_funcs.h
@@ -46,6 +46,7 @@ F1 (log10)
F1 (log1p)
F1 (sinh)
F1 (tan)
+F1 (tanh)
D1 (acosh)
D1 (asinh)
D2 (atan2)
@@ -81,6 +82,7 @@ _ZVND1 (log2)
_ZVNF1 (sinh)
_ZVND1 (sinh)
_ZVNF1 (tan)
+_ZVNF1 (tanh)
#if WANT_SVE_MATH
_ZSVF2 (atan2)
_ZSVD2 (atan2)
diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h
index 28ee251..be87c21 100644
--- a/pl/math/test/ulp_wrappers.h
+++ b/pl/math/test/ulp_wrappers.h
@@ -129,6 +129,7 @@ ZVNF1_WRAP(log1p)
ZVNF1_WRAP(log2)
ZVNF1_WRAP(sinh)
ZVNF1_WRAP(tan)
+ZVNF1_WRAP(tanh)
ZVND1_WRAP(asinh)
ZVND1_WRAP(atan)
ZVND2_WRAP(atan2)
diff --git a/pl/math/v_tanhf_2u6.c b/pl/math/v_tanhf_2u6.c
new file mode 100644
index 0000000..571fd8b
--- /dev/null
+++ b/pl/math/v_tanhf_2u6.c
@@ -0,0 +1,93 @@
+/*
+ * Single-precision vector tanh(x) function.
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "mathlib.h"
+
+#if V_SUPPORTED
+
+#define BoringBound \
+ 0x41102cb3 /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for \
+ negative). */
+#define AbsMask 0x7fffffff
+#define One 0x3f800000
+
+#define Shift v_f32 (0x1.8p23f)
+#define InvLn2 v_f32 (0x1.715476p+0f)
+#define MLn2hi v_f32 (-0x1.62e4p-1f)
+#define MLn2lo v_f32 (-0x1.7f7d1cp-20f)
+
+#define C(i) v_f32 (__expm1f_poly[i])
+
+static inline v_f32_t
+expm1f_inline (v_f32_t x)
+{
+ /* Helper routine for calculating exp(x) - 1.
+ Copied from v_expm1f_1u6.c, with all special-case handling removed, as
+ special, tiny and large values are all dealt with in the main tanhf
+ routine. */
+
+ /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
+ v_f32_t j = v_fma_f32 (InvLn2, x, Shift) - Shift;
+ v_s32_t i = v_to_s32_f32 (j);
+ v_f32_t f = v_fma_f32 (j, MLn2hi, x);
+ f = v_fma_f32 (j, MLn2lo, f);
+
+ /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).
+ Uses Estrin scheme, where the main __v_expm1f routine uses Horner. */
+ v_f32_t f2 = f * f;
+ v_f32_t p_01 = v_fma_f32 (f, C (1), C (0));
+ v_f32_t p_23 = v_fma_f32 (f, C (3), C (2));
+ v_f32_t p = v_fma_f32 (f2, p_23, p_01);
+ p = v_fma_f32 (f2 * f2, C (4), p);
+ p = v_fma_f32 (f2, p, f);
+
+ /* t = 2^i. */
+ v_f32_t t = v_as_f32_u32 (v_as_u32_s32 (i << 23) + One);
+ /* expm1(x) ~= p * t + (t - 1). */
+ return v_fma_f32 (p, t, t - 1);
+}
+
+static NOINLINE v_f32_t
+special_case (v_f32_t x, v_f32_t y, v_u32_t special)
+{
+ return v_call_f32 (tanhf, x, y, special);
+}
+
+/* Approximation for single-precision vector tanh(x), using a simplified version
+ of expm1f. The maximum error is 2.58 ULP:
+ __v_tanhf(0x1.fa5eep-5) got 0x1.f9ba02p-5
+ want 0x1.f9ba08p-5. */
+VPCS_ATTR v_f32_t V_NAME (tanhf) (v_f32_t x)
+{
+ v_u32_t ix = v_as_u32_f32 (x);
+ v_u32_t iax = ix & AbsMask;
+ v_u32_t sign = ix & ~AbsMask;
+ v_u32_t is_boring = v_cond_u32 (iax > BoringBound);
+ v_f32_t boring = v_as_f32_u32 (sign | One);
+
+#if WANT_ERRNO
+ /* If errno needs to be set properly, set all special and boring lanes to 1,
+ which will trigger no exceptions, and fix them up later. */
+ v_u32_t special = v_cond_u32 ((iax > 0x7f800000) | (iax < 0x34000000));
+ ix = v_sel_u32 (is_boring, v_u32 (One), ix);
+ if (unlikely (v_any_u32 (special)))
+ ix = v_sel_u32 (special, v_u32 (One), ix);
+#else
+ v_u32_t special = v_cond_u32 ((iax > 0x7f800000) | (iax == 0));
+#endif
+
+ /* tanh(x) = (e^2x - 1) / (e^2x + 1). */
+ v_f32_t q = expm1f_inline (2 * v_as_f32_u32 (ix));
+ v_f32_t y = q / (q + 2);
+ y = v_sel_f32 (is_boring, boring, y);
+ if (unlikely (v_any_u32 (special)))
+ return special_case (x, y, special);
+ return y;
+}
+VPCS_ALIAS
+
+#endif
diff --git a/pl/math/vn_tanhf_2u6.c b/pl/math/vn_tanhf_2u6.c
new file mode 100644
index 0000000..96fd67a
--- /dev/null
+++ b/pl/math/vn_tanhf_2u6.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_tanhf.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS strong_alias (__vn_tanhf, _ZGVnN4v_tanhf)
+#include "v_tanhf_2u6.c"
+#endif