diff options
author | Joe Ramsay <Joe.Ramsay@arm.com> | 2022-11-30 09:42:41 +0000 |
---|---|---|
committer | Joe Ramsay <joe.ramsay@arm.com> | 2022-11-30 09:42:41 +0000 |
commit | 8a644bf15812edaba38b41ca142e8e7e328e7918 (patch) | |
tree | 1b44a7f8a954f663b41e84cc6fd8909bd6645f55 | |
parent | 8a0d24f8af39bda7b54341067c6ccc8a7f12ff27 (diff) | |
download | arm-optimized-routines-8a644bf15812edaba38b41ca142e8e7e328e7918.tar.gz |
pl/math: Add scalar and vector/Neon tanhf
Both routines use simplified inline versions of expm1f, and are
accurate to 2.6 ULP.
-rw-r--r-- | pl/math/include/mathlib.h | 5 | ||||
-rw-r--r-- | pl/math/s_tanhf_2u6.c | 6 | ||||
-rw-r--r-- | pl/math/tanhf_2u6.c | 80 | ||||
-rw-r--r-- | pl/math/test/mathbench_funcs.h | 2 | ||||
-rwxr-xr-x | pl/math/test/runulp.sh | 22 | ||||
-rw-r--r-- | pl/math/test/testcases/directed/tanhf.tst | 20 | ||||
-rw-r--r-- | pl/math/test/ulp_funcs.h | 2 | ||||
-rw-r--r-- | pl/math/test/ulp_wrappers.h | 1 | ||||
-rw-r--r-- | pl/math/v_tanhf_2u6.c | 93 | ||||
-rw-r--r-- | pl/math/vn_tanhf_2u6.c | 12 |
10 files changed, 243 insertions, 0 deletions
diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 6721d45..1266eb7 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -22,6 +22,7 @@ float log10f (float); float log1pf (float); float sinhf (float); float tanf (float); +float tanhf (float); double acosh (double); double asinh (double); @@ -47,6 +48,7 @@ float __s_log1pf (float); float __s_log2f (float); float __s_sinhf (float); float __s_tanf (float); +float __s_tanhf (float); double __s_asinh (double); double __s_atan (double); @@ -97,6 +99,7 @@ __f64x2_t __v_log2 (__f64x2_t); __f32x4_t __v_sinhf (__f32x4_t); __f64x2_t __v_sinh (__f64x2_t); __f32x4_t __v_tanf (__f32x4_t); +__f32x4_t __v_tanhf (__f32x4_t); #if __GNUC__ >= 9 || __clang_major__ >= 8 #define __vpcs __attribute__((__aarch64_vector_pcs__)) @@ -127,6 +130,7 @@ __vpcs __f64x2_t __vn_log2 (__f64x2_t); __vpcs __f32x4_t __vn_sinhf (__f32x4_t); __vpcs __f64x2_t __vn_sinh (__f64x2_t); __vpcs __f32x4_t __vn_tanf (__f32x4_t); +__vpcs __f32x4_t __vn_tanhf (__f32x4_t); /* Vector functions following the vector PCS using ABI names. */ __vpcs __f32x4_t _ZGVnN4v_asinhf (__f32x4_t); @@ -154,6 +158,7 @@ __vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_sinhf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_sinh (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t); +__vpcs __f32x4_t _ZGVnN4v_tanhf (__f32x4_t); #endif diff --git a/pl/math/s_tanhf_2u6.c b/pl/math/s_tanhf_2u6.c new file mode 100644 index 0000000..bbb4569 --- /dev/null +++ b/pl/math/s_tanhf_2u6.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_tanhf_2u6.c" diff --git a/pl/math/tanhf_2u6.c b/pl/math/tanhf_2u6.c new file mode 100644 index 0000000..9db2533 --- /dev/null +++ b/pl/math/tanhf_2u6.c @@ -0,0 +1,80 @@ +/* + * Single-precision tanh(x) function. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "math_config.h" + +#define BoringBound \ + 0x41102cb3 /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for \ + negative). */ +#define AbsMask 0x7fffffff +#define One 0x3f800000 + +#define Shift (0x1.8p23f) +#define InvLn2 (0x1.715476p+0f) +#define Ln2hi (0x1.62e4p-1f) +#define Ln2lo (0x1.7f7d1cp-20f) + +#define C(i) __expm1f_poly[i] + +static inline float +expm1f_inline (float x) +{ + /* Helper routine for calculating exp(x) - 1. + Copied from expm1f_1u6.c, with several simplifications: + - No special-case handling for tiny or special values, instead return early + from the main routine. + - No special handling for large values: + - No early return for infinity. + - Simpler combination of p and t in final stage of algorithm. + - |i| < 27, so can calculate t by simpler shift-and-add, instead of + ldexpf (same as vector algorithm). */ + + /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */ + float j = fmaf (InvLn2, x, Shift) - Shift; + int32_t i = j; + float f = fmaf (j, -Ln2hi, x); + f = fmaf (j, -Ln2lo, f); + + /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). + Uses Estrin scheme, where the main expm1f routine uses Horner. */ + float f2 = f * f; + float p_01 = fmaf (f, C (1), C (0)); + float p_23 = fmaf (f, C (3), C (2)); + float p = fmaf (f2, p_23, p_01); + p = fmaf (f2 * f2, C (4), p); + p = fmaf (f2, p, f); + + /* t = 2^i. */ + float t = asfloat ((i << 23) + One); + /* expm1(x) ~= p * t + (t - 1). */ + return fmaf (p, t, t - 1); +} + +/* Approximation for single-precision tanh(x), using a simplified version of + expm1f. The maximum error is 2.58 ULP: + tanhf(0x1.fa5eep-5) got 0x1.f9ba02p-5 + want 0x1.f9ba08p-5. */ +float +tanhf (float x) +{ + uint32_t ix = asuint (x); + uint32_t iax = ix & AbsMask; + uint32_t sign = ix & ~AbsMask; + + if (unlikely (iax > BoringBound)) + { + if (iax > 0x7f800000) + return __math_invalidf (x); + return asfloat (One | sign); + } + + if (unlikely (iax < 0x34000000)) + return x; + + /* tanh(x) = (e^2x - 1) / (e^2x + 1). */ + float q = expm1f_inline (2 * x); + return q / (q + 2); +} diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index b972f83..9e3b9a0 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -43,6 +43,7 @@ F (log2f, 0.01, 11.1) F (sinf, -3.1, 3.1) F (sinhf, -10.0, 10.0) F (tanf, -3.1, 3.1) +F (tanhf, -10.0, 10.0) D (acosh, 1.0, 10.0) D (asinh, -10.0, 10.0) @@ -84,6 +85,7 @@ ZVND (log2, 0.01, 11.1) ZVNF (sinhf, -10.0, 10.0) ZVND (sinh, -10.0, 10.0) ZVNF (tanf, -3.1, 3.1) +ZVNF (tanhf, -10.0, 10.0) {"__s_atan2f", 'f', 0, -10.0, 10.0, {.f = __s_atan2f_wrap}}, {"__s_atan2", 'd', 0, -10.0, 10.0, {.d = __s_atan2_wrap}}, {"__v_atan2f", 'f', 'v', -10.0, 10.0, {.vf = __v_atan2f_wrap}}, diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 5d0188d..484ebdf 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -215,6 +215,14 @@ L=1.03 t cbrtf 0 inf 1000000 t cbrtf -0 -inf 1000000 +L=2.09 +t tanhf 0 0x1p-23 1000 +t tanhf -0 -0x1p-23 1000 +t tanhf 0x1p-23 0x1.205966p+3 100000 +t tanhf -0x1p-23 -0x1.205966p+3 100000 +t tanhf 0x1.205966p+3 inf 100 +t tanhf -0x1.205966p+3 -inf 100 + done # vector functions @@ -461,6 +469,15 @@ range_asinh=' -0x1p511 -inf 40000 ' +range_tanhf=' + 0 0x1p-23 1000 + -0 -0x1p-23 1000 + 0x1p-23 0x1.205966p+3 100000 + -0x1p-23 -0x1.205966p+3 100000 + 0x1.205966p+3 inf 100 + -0x1.205966p+3 -inf 100 +' + range_sve_cosf=' 0 0xffff0000 10000 0x1p-4 0x1p4 500000 @@ -629,6 +646,7 @@ L_cosh=1.43 L_atanhf=2.59 L_cbrtf=1.03 L_asinh=1.54 +L_tanhf=2.09 L_sve_cosf=1.57 L_sve_cos=1.61 @@ -796,6 +814,10 @@ asinh _ZGVnN2v_asinh $runvn fenv -c 2 asinh __v_asinh $runv fenv -c 0x1p600 asinh __vn_asinh $runvn fenv -c 0x1p600 asinh _ZGVnN2v_asinh $runvn fenv -c 0x1p600 +tanhf __s_tanhf $runs fenv +tanhf __v_tanhf $runv fenv +tanhf __vn_tanhf $runvn fenv +tanhf _ZGVnN4v_tanhf $runvn fenv sve_cosf __sv_cosf $runsv sve_cosf _ZGVsMxv_cosf $runsv diff --git a/pl/math/test/testcases/directed/tanhf.tst b/pl/math/test/testcases/directed/tanhf.tst new file mode 100644 index 0000000..c3edb50 --- /dev/null +++ b/pl/math/test/testcases/directed/tanhf.tst @@ -0,0 +1,20 @@ +; tanhf.tst +; +; Copyright 2007-2022, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=tanhf op1=7fc00001 result=7fc00001 errno=0 +func=tanhf op1=ffc00001 result=7fc00001 errno=0 +func=tanhf op1=7f800001 result=7fc00001 errno=0 status=i +func=tanhf op1=ff800001 result=7fc00001 errno=0 status=i +func=tanhf op1=7f800000 result=3f800000 errno=0 +func=tanhf op1=ff800000 result=bf800000 errno=0 +func=tanhf op1=00000000 result=00000000 errno=0 +func=tanhf op1=80000000 result=80000000 errno=0 +; No exception is raised with certain versions of glibc. Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. +; func=tanhf op1=00000001 result=00000001 errno=0 maybestatus=ux +; func=tanhf op1=80000001 result=80000001 errno=0 maybestatus=ux +func=tanhf op1=00000001 result=00000001 errno=0 maybestatus=ux +func=tanhf op1=80000001 result=80000001 errno=0 maybestatus=ux diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 465630d..86e2bed 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -46,6 +46,7 @@ F1 (log10) F1 (log1p) F1 (sinh) F1 (tan) +F1 (tanh) D1 (acosh) D1 (asinh) D2 (atan2) @@ -81,6 +82,7 @@ _ZVND1 (log2) _ZVNF1 (sinh) _ZVND1 (sinh) _ZVNF1 (tan) +_ZVNF1 (tanh) #if WANT_SVE_MATH _ZSVF2 (atan2) _ZSVD2 (atan2) diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index 28ee251..be87c21 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -129,6 +129,7 @@ ZVNF1_WRAP(log1p) ZVNF1_WRAP(log2) ZVNF1_WRAP(sinh) ZVNF1_WRAP(tan) +ZVNF1_WRAP(tanh) ZVND1_WRAP(asinh) ZVND1_WRAP(atan) ZVND2_WRAP(atan2) diff --git a/pl/math/v_tanhf_2u6.c b/pl/math/v_tanhf_2u6.c new file mode 100644 index 0000000..571fd8b --- /dev/null +++ b/pl/math/v_tanhf_2u6.c @@ -0,0 +1,93 @@ +/* + * Single-precision vector tanh(x) function. + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "mathlib.h" + +#if V_SUPPORTED + +#define BoringBound \ + 0x41102cb3 /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for \ + negative). */ +#define AbsMask 0x7fffffff +#define One 0x3f800000 + +#define Shift v_f32 (0x1.8p23f) +#define InvLn2 v_f32 (0x1.715476p+0f) +#define MLn2hi v_f32 (-0x1.62e4p-1f) +#define MLn2lo v_f32 (-0x1.7f7d1cp-20f) + +#define C(i) v_f32 (__expm1f_poly[i]) + +static inline v_f32_t +expm1f_inline (v_f32_t x) +{ + /* Helper routine for calculating exp(x) - 1. + Copied from v_expm1f_1u6.c, with all special-case handling removed, as + special, tiny and large values are all dealt with in the main tanhf + routine. */ + + /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */ + v_f32_t j = v_fma_f32 (InvLn2, x, Shift) - Shift; + v_s32_t i = v_to_s32_f32 (j); + v_f32_t f = v_fma_f32 (j, MLn2hi, x); + f = v_fma_f32 (j, MLn2lo, f); + + /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). + Uses Estrin scheme, where the main __v_expm1f routine uses Horner. */ + v_f32_t f2 = f * f; + v_f32_t p_01 = v_fma_f32 (f, C (1), C (0)); + v_f32_t p_23 = v_fma_f32 (f, C (3), C (2)); + v_f32_t p = v_fma_f32 (f2, p_23, p_01); + p = v_fma_f32 (f2 * f2, C (4), p); + p = v_fma_f32 (f2, p, f); + + /* t = 2^i. */ + v_f32_t t = v_as_f32_u32 (v_as_u32_s32 (i << 23) + One); + /* expm1(x) ~= p * t + (t - 1). */ + return v_fma_f32 (p, t, t - 1); +} + +static NOINLINE v_f32_t +special_case (v_f32_t x, v_f32_t y, v_u32_t special) +{ + return v_call_f32 (tanhf, x, y, special); +} + +/* Approximation for single-precision vector tanh(x), using a simplified version + of expm1f. The maximum error is 2.58 ULP: + __v_tanhf(0x1.fa5eep-5) got 0x1.f9ba02p-5 + want 0x1.f9ba08p-5. */ +VPCS_ATTR v_f32_t V_NAME (tanhf) (v_f32_t x) +{ + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t iax = ix & AbsMask; + v_u32_t sign = ix & ~AbsMask; + v_u32_t is_boring = v_cond_u32 (iax > BoringBound); + v_f32_t boring = v_as_f32_u32 (sign | One); + +#if WANT_ERRNO + /* If errno needs to be set properly, set all special and boring lanes to 1, + which will trigger no exceptions, and fix them up later. */ + v_u32_t special = v_cond_u32 ((iax > 0x7f800000) | (iax < 0x34000000)); + ix = v_sel_u32 (is_boring, v_u32 (One), ix); + if (unlikely (v_any_u32 (special))) + ix = v_sel_u32 (special, v_u32 (One), ix); +#else + v_u32_t special = v_cond_u32 ((iax > 0x7f800000) | (iax == 0)); +#endif + + /* tanh(x) = (e^2x - 1) / (e^2x + 1). */ + v_f32_t q = expm1f_inline (2 * v_as_f32_u32 (ix)); + v_f32_t y = q / (q + 2); + y = v_sel_f32 (is_boring, boring, y); + if (unlikely (v_any_u32 (special))) + return special_case (x, y, special); + return y; +} +VPCS_ALIAS + +#endif diff --git a/pl/math/vn_tanhf_2u6.c b/pl/math/vn_tanhf_2u6.c new file mode 100644 index 0000000..96fd67a --- /dev/null +++ b/pl/math/vn_tanhf_2u6.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_tanhf. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_tanhf, _ZGVnN4v_tanhf) +#include "v_tanhf_2u6.c" +#endif |