diff options
author | Joe Ramsay <Joe.Ramsay@arm.com> | 2022-12-06 10:40:54 +0000 |
---|---|---|
committer | Joe Ramsay <joe.ramsay@arm.com> | 2022-12-06 10:40:54 +0000 |
commit | 2a963bbff4f16998def16ab5c7b1c7ab92f825a8 (patch) | |
tree | 62cd030c054cb4f6582b89129ddce0824bd3021f | |
parent | 0d3c3cd35440d224ddbcd1496b48835443f4c7c1 (diff) | |
download | arm-optimized-routines-2a963bbff4f16998def16ab5c7b1c7ab92f825a8.tar.gz |
pl/math: Set fenv flags in Neon asinhf
Routine no longer relies on vector log1pf, as this has to become more
complex to deal with fenv itself. Instead we re-use a log1pf helper
from Neon atanhf which does no special-case handling, instead leaving
it all up to the main routine. We now just fall back to the scalar
routine for special-case handling. This uncovered a mistake in
asinhf's handling of NaNs, which has been fixed.
-rw-r--r-- | pl/math/asinhf_3u5.c | 10 | ||||
-rwxr-xr-x | pl/math/test/runulp.sh | 8 | ||||
-rw-r--r-- | pl/math/v_asinhf_2u7.c | 43 | ||||
-rw-r--r-- | pl/math/v_atanhf_3u1.c | 41 | ||||
-rw-r--r-- | pl/math/v_log1pf_inline.h | 55 |
5 files changed, 93 insertions, 64 deletions
diff --git a/pl/math/asinhf_3u5.c b/pl/math/asinhf_3u5.c index 10f9f31..8aa62ad 100644 --- a/pl/math/asinhf_3u5.c +++ b/pl/math/asinhf_3u5.c @@ -11,7 +11,6 @@ #define Ln2 (0x1.62e4p-1f) #define One (0x3f8) #define ExpM12 (0x398) -#define QNaN (0x7fc) #define C(i) __asinhf_data.coeffs[i] @@ -45,10 +44,11 @@ asinhf (float x) float ax = asfloat (ia); uint32_t sign = ix & ~AbsMask; - if (ia12 < ExpM12 || ia12 == QNaN) - { - return x; - } + if (unlikely (ia12 < ExpM12 || ia == 0x7f800000)) + return x; + + if (unlikely (ia12 >= 0x7f8)) + return __math_invalidf (x); if (ia12 < One) { diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 484ebdf..ed45c73 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -766,10 +766,10 @@ log1pf __s_log1pf $runs log1pf __v_log1pf $runv log1pf __vn_log1pf $runvn log1pf _ZGVnN4v_log1pf $runvn -asinhf __s_asinhf $runs -asinhf __v_asinhf $runv -asinhf __vn_asinhf $runvn -asinhf _ZGVnN4v_asinhf $runvn +asinhf __s_asinhf $runs fenv +asinhf __v_asinhf $runv fenv +asinhf __vn_asinhf $runvn fenv +asinhf _ZGVnN4v_asinhf $runvn fenv log2f __s_log2f $runs log2f __v_log2f $runv log2f __vn_log2f $runvn diff --git a/pl/math/v_asinhf_2u7.c b/pl/math/v_asinhf_2u7.c index 675b8a8..7bce7ff 100644 --- a/pl/math/v_asinhf_2u7.c +++ b/pl/math/v_asinhf_2u7.c @@ -11,34 +11,45 @@ #define SignMask v_u32 (0x80000000) #define One v_f32 (1.0f) -#define Ln2 v_f32 (0x1.62e43p-1f) -#define SpecialBound v_u32 (0x5f800000) /* asuint(0x1p64). */ +#define BigBound v_u32 (0x5f800000) /* asuint(0x1p64). */ +#define TinyBound v_u32 (0x30800000) /* asuint(0x1p-30). */ + +#include "v_log1pf_inline.h" + +static NOINLINE v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_u32_t special) +{ + return v_call_f32 (asinhf, x, y, special); +} /* Single-precision implementation of vector asinh(x), using vector log1p. Worst-case error is 2.66 ULP, at roughly +/-0.25: __v_asinhf(0x1.01b04p-2) got 0x1.fe163ep-3 want 0x1.fe1638p-3. */ VPCS_ATTR v_f32_t V_NAME (asinhf) (v_f32_t x) { - v_f32_t ax = v_abs_f32 (x); - v_u32_t special = v_cond_u32 (v_as_u32_f32 (ax) >= SpecialBound); - v_u32_t sign = v_as_u32_f32 (x) & SignMask; + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t iax = ix & ~SignMask; + v_u32_t sign = ix & SignMask; + v_f32_t ax = v_as_f32_u32 (iax); + v_u32_t special = v_cond_u32 (iax >= BigBound); + +#if WANT_ERRNO + /* Sidestep tiny and large values to avoid inadvertently triggering + under/overflow. */ + special |= v_cond_u32 (iax < TinyBound); + if (unlikely (v_any_u32 (special))) + ax = v_sel_f32 (special, One, ax); +#endif /* asinh(x) = log(x + sqrt(x * x + 1)). For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). */ v_f32_t d = One + v_sqrt_f32 (ax * ax + One); - v_f32_t y = V_NAME (log1pf) (ax + ax * ax / d); + v_f32_t y = log1pf_inline (ax + ax * ax / d); + y = v_as_f32_u32 (sign | v_as_u32_f32 (y)); if (unlikely (v_any_u32 (special))) - { - /* If |x| is too large, we cannot square it at low cost without overflow. - At very large x, asinh(x) ~= log(2x) and log(x) ~= log1p(x), so we - calculate asinh(x) as log1p(x) + log(2). */ - v_f32_t y_large = V_NAME (log1pf) (ax) + Ln2; - return v_as_f32_u32 (sign - | v_as_u32_f32 (v_sel_f32 (special, y_large, y))); - } - - return v_as_f32_u32 (sign | v_as_u32_f32 (y)); + return specialcase (x, y, special); + return y; } VPCS_ALIAS diff --git a/pl/math/v_atanhf_3u1.c b/pl/math/v_atanhf_3u1.c index 54dcb9b..1e3a561 100644 --- a/pl/math/v_atanhf_3u1.c +++ b/pl/math/v_atanhf_3u1.c @@ -9,50 +9,13 @@ #if V_SUPPORTED +#include "v_log1pf_inline.h" + #define AbsMask 0x7fffffff #define Half 0x3f000000 #define One 0x3f800000 -#define Four 0x40800000 -#define Ln2 0x1.62e43p-1f #define TinyBound 0x39800000 /* 0x1p-12, below which atanhf(x) rounds to x. */ -#define C(i) v_f32 (__log1pf_data.coeffs[i]) - -static inline v_f32_t -eval_poly (v_f32_t m) -{ - /* Approximate log(1+m) on [-0.25, 0.5] using Estrin scheme. */ - v_f32_t p_12 = v_fma_f32 (m, C (1), C (0)); - v_f32_t p_34 = v_fma_f32 (m, C (3), C (2)); - v_f32_t p_56 = v_fma_f32 (m, C (5), C (4)); - v_f32_t p_78 = v_fma_f32 (m, C (7), C (6)); - - v_f32_t m2 = m * m; - v_f32_t p_02 = v_fma_f32 (m2, p_12, m); - v_f32_t p_36 = v_fma_f32 (m2, p_56, p_34); - v_f32_t p_79 = v_fma_f32 (m2, C (8), p_78); - - v_f32_t m4 = m2 * m2; - v_f32_t p_06 = v_fma_f32 (m4, p_36, p_02); - - return v_fma_f32 (m4, m4 * p_79, p_06); -} - -static inline v_f32_t -log1pf_inline (v_f32_t x) -{ - /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no - special-case handling. See that file for details of the algorithm. */ - v_f32_t m = x + 1.0f; - v_u32_t k = (v_as_u32_f32 (m) - 0x3f400000) & 0xff800000; - v_f32_t s = v_as_f32_u32 (v_u32 (Four) - k); - v_f32_t m_scale = v_as_f32_u32 (v_as_u32_f32 (x) - k) - + v_fma_f32 (v_f32 (0.25f), s, v_f32 (-1.0f)); - v_f32_t p = eval_poly (m_scale); - v_f32_t scale_back = v_to_f32_u32 (k) * 0x1.0p-23f; - return v_fma_f32 (scale_back, v_f32 (Ln2), p); -} - /* Approximation for vector single-precision atanh(x) using modified log1p. The maximum error is 3.08 ULP: __v_atanhf(0x1.ff215p-5) got 0x1.ffcb7cp-5 diff --git a/pl/math/v_log1pf_inline.h b/pl/math/v_log1pf_inline.h new file mode 100644 index 0000000..cf32b2a --- /dev/null +++ b/pl/math/v_log1pf_inline.h @@ -0,0 +1,55 @@ +/* + * Helper for single-precision routines which calculate log(1 + x) and do not + * need special-case handling + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef PL_MATH_V_LOG1PF_INLINE_H +#define PL_MATH_V_LOG1PF_INLINE_H + +#include "v_math.h" +#include "math_config.h" + +#define Four 0x40800000 +#define Ln2 v_f32 (0x1.62e43p-1f) + +#define C(i) v_f32 (__log1pf_data.coeffs[i]) + +static inline v_f32_t +eval_poly (v_f32_t m) +{ + /* Approximate log(1+m) on [-0.25, 0.5] using Estrin scheme. */ + v_f32_t p_12 = v_fma_f32 (m, C (1), C (0)); + v_f32_t p_34 = v_fma_f32 (m, C (3), C (2)); + v_f32_t p_56 = v_fma_f32 (m, C (5), C (4)); + v_f32_t p_78 = v_fma_f32 (m, C (7), C (6)); + + v_f32_t m2 = m * m; + v_f32_t p_02 = v_fma_f32 (m2, p_12, m); + v_f32_t p_36 = v_fma_f32 (m2, p_56, p_34); + v_f32_t p_79 = v_fma_f32 (m2, C (8), p_78); + + v_f32_t m4 = m2 * m2; + v_f32_t p_06 = v_fma_f32 (m4, p_36, p_02); + + return v_fma_f32 (m4, m4 * p_79, p_06); +} + +static inline v_f32_t +log1pf_inline (v_f32_t x) +{ + /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no + special-case handling. See that file for details of the algorithm. */ + v_f32_t m = x + 1.0f; + v_u32_t k = (v_as_u32_f32 (m) - 0x3f400000) & 0xff800000; + v_f32_t s = v_as_f32_u32 (v_u32 (Four) - k); + v_f32_t m_scale = v_as_f32_u32 (v_as_u32_f32 (x) - k) + + v_fma_f32 (v_f32 (0.25f), s, v_f32 (-1.0f)); + v_f32_t p = eval_poly (m_scale); + v_f32_t scale_back = v_to_f32_u32 (k) * 0x1.0p-23f; + return v_fma_f32 (scale_back, Ln2, p); +} + +#endif // PL_MATH_V_LOG1PF_INLINE_H |