aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJoe Ramsay <Joe.Ramsay@arm.com>2022-12-06 10:40:54 +0000
committerJoe Ramsay <joe.ramsay@arm.com>2022-12-06 10:40:54 +0000
commit2a963bbff4f16998def16ab5c7b1c7ab92f825a8 (patch)
tree62cd030c054cb4f6582b89129ddce0824bd3021f
parent0d3c3cd35440d224ddbcd1496b48835443f4c7c1 (diff)
downloadarm-optimized-routines-2a963bbff4f16998def16ab5c7b1c7ab92f825a8.tar.gz
pl/math: Set fenv flags in Neon asinhf
The routine no longer relies on vector log1pf, since that routine has to become more complex to handle fenv itself. Instead we re-use a log1pf helper from Neon atanhf, which does no special-case handling and leaves all of that to the main routine. We now simply fall back to the scalar routine for special-case handling. This uncovered a mistake in asinhf's handling of NaNs, which has been fixed.
-rw-r--r--pl/math/asinhf_3u5.c10
-rwxr-xr-xpl/math/test/runulp.sh8
-rw-r--r--pl/math/v_asinhf_2u7.c43
-rw-r--r--pl/math/v_atanhf_3u1.c41
-rw-r--r--pl/math/v_log1pf_inline.h55
5 files changed, 93 insertions, 64 deletions
diff --git a/pl/math/asinhf_3u5.c b/pl/math/asinhf_3u5.c
index 10f9f31..8aa62ad 100644
--- a/pl/math/asinhf_3u5.c
+++ b/pl/math/asinhf_3u5.c
@@ -11,7 +11,6 @@
#define Ln2 (0x1.62e4p-1f)
#define One (0x3f8)
#define ExpM12 (0x398)
-#define QNaN (0x7fc)
#define C(i) __asinhf_data.coeffs[i]
@@ -45,10 +44,11 @@ asinhf (float x)
float ax = asfloat (ia);
uint32_t sign = ix & ~AbsMask;
- if (ia12 < ExpM12 || ia12 == QNaN)
- {
- return x;
- }
+ if (unlikely (ia12 < ExpM12 || ia == 0x7f800000))
+ return x;
+
+ if (unlikely (ia12 >= 0x7f8))
+ return __math_invalidf (x);
if (ia12 < One)
{
diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh
index 484ebdf..ed45c73 100755
--- a/pl/math/test/runulp.sh
+++ b/pl/math/test/runulp.sh
@@ -766,10 +766,10 @@ log1pf __s_log1pf $runs
log1pf __v_log1pf $runv
log1pf __vn_log1pf $runvn
log1pf _ZGVnN4v_log1pf $runvn
-asinhf __s_asinhf $runs
-asinhf __v_asinhf $runv
-asinhf __vn_asinhf $runvn
-asinhf _ZGVnN4v_asinhf $runvn
+asinhf __s_asinhf $runs fenv
+asinhf __v_asinhf $runv fenv
+asinhf __vn_asinhf $runvn fenv
+asinhf _ZGVnN4v_asinhf $runvn fenv
log2f __s_log2f $runs
log2f __v_log2f $runv
log2f __vn_log2f $runvn
diff --git a/pl/math/v_asinhf_2u7.c b/pl/math/v_asinhf_2u7.c
index 675b8a8..7bce7ff 100644
--- a/pl/math/v_asinhf_2u7.c
+++ b/pl/math/v_asinhf_2u7.c
@@ -11,34 +11,45 @@
#define SignMask v_u32 (0x80000000)
#define One v_f32 (1.0f)
-#define Ln2 v_f32 (0x1.62e43p-1f)
-#define SpecialBound v_u32 (0x5f800000) /* asuint(0x1p64). */
+#define BigBound v_u32 (0x5f800000) /* asuint(0x1p64). */
+#define TinyBound v_u32 (0x30800000) /* asuint(0x1p-30). */
+
+#include "v_log1pf_inline.h"
+
+static NOINLINE v_f32_t
+specialcase (v_f32_t x, v_f32_t y, v_u32_t special)
+{
+ return v_call_f32 (asinhf, x, y, special);
+}
/* Single-precision implementation of vector asinh(x), using vector log1p.
Worst-case error is 2.66 ULP, at roughly +/-0.25:
__v_asinhf(0x1.01b04p-2) got 0x1.fe163ep-3 want 0x1.fe1638p-3. */
VPCS_ATTR v_f32_t V_NAME (asinhf) (v_f32_t x)
{
- v_f32_t ax = v_abs_f32 (x);
- v_u32_t special = v_cond_u32 (v_as_u32_f32 (ax) >= SpecialBound);
- v_u32_t sign = v_as_u32_f32 (x) & SignMask;
+ v_u32_t ix = v_as_u32_f32 (x);
+ v_u32_t iax = ix & ~SignMask;
+ v_u32_t sign = ix & SignMask;
+ v_f32_t ax = v_as_f32_u32 (iax);
+ v_u32_t special = v_cond_u32 (iax >= BigBound);
+
+#if WANT_ERRNO
+ /* Sidestep tiny and large values to avoid inadvertently triggering
+ under/overflow. */
+ special |= v_cond_u32 (iax < TinyBound);
+ if (unlikely (v_any_u32 (special)))
+ ax = v_sel_f32 (special, One, ax);
+#endif
/* asinh(x) = log(x + sqrt(x * x + 1)).
For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). */
v_f32_t d = One + v_sqrt_f32 (ax * ax + One);
- v_f32_t y = V_NAME (log1pf) (ax + ax * ax / d);
+ v_f32_t y = log1pf_inline (ax + ax * ax / d);
+ y = v_as_f32_u32 (sign | v_as_u32_f32 (y));
if (unlikely (v_any_u32 (special)))
- {
- /* If |x| is too large, we cannot square it at low cost without overflow.
- At very large x, asinh(x) ~= log(2x) and log(x) ~= log1p(x), so we
- calculate asinh(x) as log1p(x) + log(2). */
- v_f32_t y_large = V_NAME (log1pf) (ax) + Ln2;
- return v_as_f32_u32 (sign
- | v_as_u32_f32 (v_sel_f32 (special, y_large, y)));
- }
-
- return v_as_f32_u32 (sign | v_as_u32_f32 (y));
+ return specialcase (x, y, special);
+ return y;
}
VPCS_ALIAS
diff --git a/pl/math/v_atanhf_3u1.c b/pl/math/v_atanhf_3u1.c
index 54dcb9b..1e3a561 100644
--- a/pl/math/v_atanhf_3u1.c
+++ b/pl/math/v_atanhf_3u1.c
@@ -9,50 +9,13 @@
#if V_SUPPORTED
+#include "v_log1pf_inline.h"
+
#define AbsMask 0x7fffffff
#define Half 0x3f000000
#define One 0x3f800000
-#define Four 0x40800000
-#define Ln2 0x1.62e43p-1f
#define TinyBound 0x39800000 /* 0x1p-12, below which atanhf(x) rounds to x. */
-#define C(i) v_f32 (__log1pf_data.coeffs[i])
-
-static inline v_f32_t
-eval_poly (v_f32_t m)
-{
- /* Approximate log(1+m) on [-0.25, 0.5] using Estrin scheme. */
- v_f32_t p_12 = v_fma_f32 (m, C (1), C (0));
- v_f32_t p_34 = v_fma_f32 (m, C (3), C (2));
- v_f32_t p_56 = v_fma_f32 (m, C (5), C (4));
- v_f32_t p_78 = v_fma_f32 (m, C (7), C (6));
-
- v_f32_t m2 = m * m;
- v_f32_t p_02 = v_fma_f32 (m2, p_12, m);
- v_f32_t p_36 = v_fma_f32 (m2, p_56, p_34);
- v_f32_t p_79 = v_fma_f32 (m2, C (8), p_78);
-
- v_f32_t m4 = m2 * m2;
- v_f32_t p_06 = v_fma_f32 (m4, p_36, p_02);
-
- return v_fma_f32 (m4, m4 * p_79, p_06);
-}
-
-static inline v_f32_t
-log1pf_inline (v_f32_t x)
-{
- /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no
- special-case handling. See that file for details of the algorithm. */
- v_f32_t m = x + 1.0f;
- v_u32_t k = (v_as_u32_f32 (m) - 0x3f400000) & 0xff800000;
- v_f32_t s = v_as_f32_u32 (v_u32 (Four) - k);
- v_f32_t m_scale = v_as_f32_u32 (v_as_u32_f32 (x) - k)
- + v_fma_f32 (v_f32 (0.25f), s, v_f32 (-1.0f));
- v_f32_t p = eval_poly (m_scale);
- v_f32_t scale_back = v_to_f32_u32 (k) * 0x1.0p-23f;
- return v_fma_f32 (scale_back, v_f32 (Ln2), p);
-}
-
/* Approximation for vector single-precision atanh(x) using modified log1p.
The maximum error is 3.08 ULP:
__v_atanhf(0x1.ff215p-5) got 0x1.ffcb7cp-5
diff --git a/pl/math/v_log1pf_inline.h b/pl/math/v_log1pf_inline.h
new file mode 100644
index 0000000..cf32b2a
--- /dev/null
+++ b/pl/math/v_log1pf_inline.h
@@ -0,0 +1,55 @@
+/*
+ * Helper for single-precision routines which calculate log(1 + x) and do not
+ * need special-case handling
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef PL_MATH_V_LOG1PF_INLINE_H
+#define PL_MATH_V_LOG1PF_INLINE_H
+
+#include "v_math.h"
+#include "math_config.h"
+
+#define Four 0x40800000
+#define Ln2 v_f32 (0x1.62e43p-1f)
+
+#define C(i) v_f32 (__log1pf_data.coeffs[i])
+
+static inline v_f32_t
+eval_poly (v_f32_t m)
+{
+ /* Approximate log(1+m) on [-0.25, 0.5] using Estrin scheme. */
+ v_f32_t p_12 = v_fma_f32 (m, C (1), C (0));
+ v_f32_t p_34 = v_fma_f32 (m, C (3), C (2));
+ v_f32_t p_56 = v_fma_f32 (m, C (5), C (4));
+ v_f32_t p_78 = v_fma_f32 (m, C (7), C (6));
+
+ v_f32_t m2 = m * m;
+ v_f32_t p_02 = v_fma_f32 (m2, p_12, m);
+ v_f32_t p_36 = v_fma_f32 (m2, p_56, p_34);
+ v_f32_t p_79 = v_fma_f32 (m2, C (8), p_78);
+
+ v_f32_t m4 = m2 * m2;
+ v_f32_t p_06 = v_fma_f32 (m4, p_36, p_02);
+
+ return v_fma_f32 (m4, m4 * p_79, p_06);
+}
+
+static inline v_f32_t
+log1pf_inline (v_f32_t x)
+{
+ /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no
+ special-case handling. See that file for details of the algorithm. */
+ v_f32_t m = x + 1.0f;
+ v_u32_t k = (v_as_u32_f32 (m) - 0x3f400000) & 0xff800000;
+ v_f32_t s = v_as_f32_u32 (v_u32 (Four) - k);
+ v_f32_t m_scale = v_as_f32_u32 (v_as_u32_f32 (x) - k)
+ + v_fma_f32 (v_f32 (0.25f), s, v_f32 (-1.0f));
+ v_f32_t p = eval_poly (m_scale);
+ v_f32_t scale_back = v_to_f32_u32 (k) * 0x1.0p-23f;
+ return v_fma_f32 (scale_back, Ln2, p);
+}
+
+#endif // PL_MATH_V_LOG1PF_INLINE_H