pl/math: Fix fp exceptions in Neon sinhf and sinh

Both routines previously relied on the vector expm1(f) routine exposed by the library, which depended on WANT_SIMD_EXCEPT for its fenv behaviour, however both routines were expected to always trigger fp exceptions correctly. To remedy this, both routines now use an inlined helper for expm1 (reused from vector tanhf in the case of sinhf), and special-case small input as well as large when WANT_SIMD_EXCEPT is enabled.
author: Joe Ramsay <Joe.Ramsay@arm.com> 2022-12-22 16:20:22 +0000
committer: Joe Ramsay <joe.ramsay@arm.com> 2022-12-22 16:20:22 +0000
commit: 0a9270a27f48bea87c5bd3f0f9c759da66fb45a3 (patch)
tree: 7a9eb5555187c92f1e1a49bd155241a1b41f4764
parent: 3bfa7bd49c5576d5b1f9e6a79e3d3a15fe3823bc (diff)
download: arm-optimized-routines-0a9270a27f48bea87c5bd3f0f9c759da66fb45a3.tar.gz
4 files changed, 132 insertions, 69 deletions
diff --git a/pl/math/v_expm1f_inline.h b/pl/math/v_expm1f_inline.h
new file mode 100644
index 0000000..ef9e934
--- /dev/null
+++ b/pl/math/v_expm1f_inline.h
@@ -0,0 +1,49 @@
+/*
+ * Helper for single-precision routines which calculate exp(x) - 1 and do not
+ * need special-case handling
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef PL_MATH_V_EXPM1F_INLINE_H
+#define PL_MATH_V_EXPM1F_INLINE_H
+
+#include "v_math.h"
+#include "math_config.h"
+#include "estrinf.h"
+
+#define One 0x3f800000
+#define Shift v_f32 (0x1.8p23f)
+#define InvLn2 v_f32 (0x1.715476p+0f)
+#define MLn2hi v_f32 (-0x1.62e4p-1f)
+#define MLn2lo v_f32 (-0x1.7f7d1cp-20f)
+
+#define C(i) v_f32 (__expm1f_poly[i])
+
+static inline v_f32_t
+expm1f_inline (v_f32_t x)
+{
+  /* Helper routine for calculating exp(x) - 1.
+     Copied from v_expm1f_1u6.c, with all special-case handling removed - the
+     calling routine should handle special values if required.  */
+
+  /* Reduce argument: f in [-ln2/2, ln2/2], i is exact.  */
+  v_f32_t j = v_fma_f32 (InvLn2, x, Shift) - Shift;
+  v_s32_t i = v_to_s32_f32 (j);
+  v_f32_t f = v_fma_f32 (j, MLn2hi, x);
+  f = v_fma_f32 (j, MLn2lo, f);
+
+  /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).
+     Uses Estrin scheme, where the main __v_expm1f routine uses Horner.  */
+  v_f32_t f2 = f * f;
+  v_f32_t p = ESTRIN_4 (f, f2, f2 * f2, C);
+  p = v_fma_f32 (f2, p, f);
+
+  /* t = 2^i.  */
+  v_f32_t t = v_as_f32_u32 (v_as_u32_s32 (i << 23) + One);
+  /* expm1(x) ~= p * t + (t - 1).  */
+  return v_fma_f32 (p, t, t - 1);
+}
+
+#endif // PL_MATH_V_EXPM1F_INLINE_H
diff --git a/pl/math/v_sinh_3u.c b/pl/math/v_sinh_3u.c
index 9fe496e..8ddd29d 100644
--- a/pl/math/v_sinh_3u.c
+++ b/pl/math/v_sinh_3u.c
@@ -5,18 +5,51 @@
  */
 
 #include "v_math.h"
-#include "mathlib.h"
+#include "estrin.h"
 #include "pl_sig.h"
 #include "pl_test.h"
 
 #define AbsMask 0x7fffffffffffffff
 #define Half 0x3fe0000000000000
-#define OFlowBound                                                             \
-  0x40862e42fefa39f0 /* 0x1.62e42fefa39fp+9, above which using expm1 results   \
-			in NaN.  */
+#define BigBound                                                               \
+  0x4080000000000000 /* 2^9. expm1 helper overflows for large input.  */
+#define TinyBound                                                              \
+  0x3e50000000000000 /* 2^-26, below which sinh(x) rounds to x.  */
+#define InvLn2 v_f64 (0x1.71547652b82fep0)
+#define MLn2hi v_f64 (-0x1.62e42fefa39efp-1)
+#define MLn2lo v_f64 (-0x1.abc9e3b39803fp-56)
+#define Shift v_f64 (0x1.8p52)
+#define One 0x3ff0000000000000
+#define C(i) v_f64 (__expm1_poly[i])
 
 #if V_SUPPORTED
 
+static inline v_f64_t
+expm1_inline (v_f64_t x)
+{
+  /* Reduce argument:
+     exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+     where i = round(x / ln2)
+     and   f = x - i * ln2 (f in [-ln2/2, ln2/2]).  */
+  v_f64_t j = v_fma_f64 (InvLn2, x, Shift) - Shift;
+  v_s64_t i = v_to_s64_f64 (j);
+  v_f64_t f = v_fma_f64 (j, MLn2hi, x);
+  f = v_fma_f64 (j, MLn2lo, f);
+  /* Approximate expm1(f) using polynomial.  */
+  v_f64_t f2 = f * f, f4 = f2 * f2, f8 = f4 * f4;
+  v_f64_t p = v_fma_f64 (f2, ESTRIN_10 (f, f2, f4, f8, C), f);
+  /* t = 2^i.  */
+  v_f64_t t = v_as_f64_u64 (v_as_u64_s64 (i << 52) + One);
+  /* expm1(x) ~= p * t + (t - 1).  */
+  return v_fma_f64 (p, t, t - 1);
+}
+
+static NOINLINE VPCS_ATTR v_f64_t
+special_case (v_f64_t x)
+{
+  return v_call_f64 (sinh, x, x, v_u64 (-1));
+}
+
 /* Approximation for vector double-precision sinh(x) using expm1.
    sinh(x) = (exp(x) - exp(-x)) / 2.
    The greatest observed error is 2.57 ULP:
@@ -30,28 +63,31 @@ VPCS_ATTR v_f64_t V_NAME (sinh) (v_f64_t x)
   v_u64_t sign = ix & ~AbsMask;
   v_f64_t halfsign = v_as_f64_u64 (sign | Half);
 
-  v_u64_t special = v_cond_u64 (iax >= OFlowBound);
-  /* Fall back to the scalar variant for all lanes if any of them should trigger
-     an exception.  */
+#if WANT_SIMD_EXCEPT
+  v_u64_t special = v_cond_u64 ((iax - TinyBound) >= (BigBound - TinyBound));
+#else
+  v_u64_t special = v_cond_u64 (iax >= BigBound);
+#endif
+
+  /* Fall back to scalar variant for all lanes if any of them are special.  */
   if (unlikely (v_any_u64 (special)))
-    return v_call_f64 (sinh, x, x, v_u64 (-1));
+    return special_case (x);
 
   /* Up to the point that expm1 overflows, we can use it to calculate sinh
-     using a slight rearrangement of the definition of asinh. This allows us to
+     using a slight rearrangement of the definition of sinh. This allows us to
      retain acceptable accuracy for very small inputs.  */
-  v_f64_t t = V_NAME (expm1) (ax);
+  v_f64_t t = expm1_inline (ax);
   return (t + t / (t + 1)) * halfsign;
 }
 VPCS_ALIAS
 
 PL_SIG (V, D, 1, sinh, -10.0, 10.0)
 PL_TEST_ULP (V_NAME (sinh), 2.08)
-/* TODO: reinstate PL_TEST_EXPECT_FENV here once fp exceptions are triggered
-   correctly.  */
-PL_TEST_INTERVAL (V_NAME (sinh), 0, 0x1p-51, 100)
-PL_TEST_INTERVAL (V_NAME (sinh), -0, -0x1p-51, 100)
-PL_TEST_INTERVAL (V_NAME (sinh), 0x1p-51, 0x1.62e42fefa39fp+9, 100000)
-PL_TEST_INTERVAL (V_NAME (sinh), -0x1p-51, -0x1.62e42fefa39fp+9, 100000)
-PL_TEST_INTERVAL (V_NAME (sinh), 0x1.62e42fefa39fp+9, inf, 1000)
-PL_TEST_INTERVAL (V_NAME (sinh), -0x1.62e42fefa39fp+9, -inf, 1000)
+PL_TEST_EXPECT_FENV (V_NAME (sinh), WANT_SIMD_EXCEPT)
+PL_TEST_INTERVAL (V_NAME (sinh), 0, TinyBound, 1000)
+PL_TEST_INTERVAL (V_NAME (sinh), -0, -TinyBound, 1000)
+PL_TEST_INTERVAL (V_NAME (sinh), TinyBound, BigBound, 500000)
+PL_TEST_INTERVAL (V_NAME (sinh), -TinyBound, -BigBound, 500000)
+PL_TEST_INTERVAL (V_NAME (sinh), BigBound, inf, 1000)
+PL_TEST_INTERVAL (V_NAME (sinh), -BigBound, -inf, 1000)
 #endif
diff --git a/pl/math/v_sinhf_2u3.c b/pl/math/v_sinhf_2u3.c
index ce2fe0e..a54c178 100644
--- a/pl/math/v_sinhf_2u3.c
+++ b/pl/math/v_sinhf_2u3.c
@@ -5,17 +5,25 @@
  */
 
 #include "v_math.h"
-#include "mathlib.h"
 #include "pl_sig.h"
 #include "pl_test.h"
 
 #if V_SUPPORTED
 
+#include "v_expm1f_inline.h"
+
 #define AbsMask 0x7fffffff
 #define Half 0x3f000000
-#define Expm1OFlowLimit                                                        \
-  0x42b17218 /* 0x1.62e43p+6, 2^7*ln2, minimum value for which expm1f          \
-		overflows.  */
+#define BigBound                                                               \
+  0x42b0c0a7 /* 0x1.61814ep+6, above which expm1f helper overflows.  */
+#define TinyBound                                                              \
+  0x2fb504f4 /* 0x1.6a09e8p-32, below which expm1f underflows.  */
+
+static NOINLINE VPCS_ATTR v_f32_t
+special_case (v_f32_t x)
+{
+  return v_call_f32 (sinhf, x, x, v_u32 (-1));
+}
 
 /* Approximation for vector single-precision sinh(x) using expm1.
    sinh(x) = (exp(x) - exp(-x)) / 2.
@@ -29,28 +37,32 @@ VPCS_ATTR v_f32_t V_NAME (sinhf) (v_f32_t x)
   v_u32_t sign = ix & ~AbsMask;
   v_f32_t halfsign = v_as_f32_u32 (sign | Half);
 
-  v_u32_t special = v_cond_u32 (iax >= Expm1OFlowLimit);
+#if WANT_SIMD_EXCEPT
+  v_u32_t special = v_cond_u32 ((iax - TinyBound) >= (BigBound - TinyBound));
+#else
+  v_u32_t special = v_cond_u32 (iax >= BigBound);
+#endif
+
   /* Fall back to the scalar variant for all lanes if any of them should trigger
      an exception.  */
   if (unlikely (v_any_u32 (special)))
-    return v_call_f32 (sinhf, x, x, v_u32 (-1));
+    return special_case (x);
 
   /* Up to the point that expm1f overflows, we can use it to calculate sinhf
      using a slight rearrangement of the definition of asinh. This allows us to
      retain acceptable accuracy for very small inputs.  */
-  v_f32_t t = V_NAME (expm1f) (ax);
+  v_f32_t t = expm1f_inline (ax);
   return (t + t / (t + 1)) * halfsign;
 }
 VPCS_ALIAS
 
 PL_SIG (V, F, 1, sinh, -10.0, 10.0)
 PL_TEST_ULP (V_NAME (sinhf), 1.76)
-/* TODO: reinstate PL_TEST_EXPECT_FENV here once fp exceptions are triggered
-   correctly.  */
-PL_TEST_INTERVAL (V_NAME (sinhf), 0, 0x1.62e43p+6, 100000)
-PL_TEST_INTERVAL (V_NAME (sinhf), -0, -0x1.62e43p+6, 100000)
-PL_TEST_INTERVAL (V_NAME (sinhf), 0x1.62e43p+6, 0x1.65a9fap+6, 100)
-PL_TEST_INTERVAL (V_NAME (sinhf), -0x1.62e43p+6, -0x1.65a9fap+6, 100)
-PL_TEST_INTERVAL (V_NAME (sinhf), 0x1.65a9fap+6, inf, 100)
-PL_TEST_INTERVAL (V_NAME (sinhf), -0x1.65a9fap+6, -inf, 100)
+PL_TEST_EXPECT_FENV (V_NAME (sinhf), WANT_SIMD_EXCEPT)
+PL_TEST_INTERVAL (V_NAME (sinhf), 0, TinyBound, 1000)
+PL_TEST_INTERVAL (V_NAME (sinhf), -0, -TinyBound, 1000)
+PL_TEST_INTERVAL (V_NAME (sinhf), TinyBound, BigBound, 100000)
+PL_TEST_INTERVAL (V_NAME (sinhf), -TinyBound, -BigBound, 100000)
+PL_TEST_INTERVAL (V_NAME (sinhf), BigBound, inf, 1000)
+PL_TEST_INTERVAL (V_NAME (sinhf), -BigBound, -inf, 1000)
 #endif
diff --git a/pl/math/v_tanhf_2u6.c b/pl/math/v_tanhf_2u6.c
index dedc085..0e7ff69 100644
--- a/pl/math/v_tanhf_2u6.c
+++ b/pl/math/v_tanhf_2u6.c
@@ -5,51 +5,17 @@
  */
 
 #include "v_math.h"
-#include "estrinf.h"
-#include "mathlib.h"
 #include "pl_sig.h"
 #include "pl_test.h"
 
 #if V_SUPPORTED
 
+#include "v_expm1f_inline.h"
+
 #define BoringBound                                                            \
   0x41102cb3 /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for        \
 		negative).  */
 #define AbsMask 0x7fffffff
-#define One 0x3f800000
-
-#define Shift v_f32 (0x1.8p23f)
-#define InvLn2 v_f32 (0x1.715476p+0f)
-#define MLn2hi v_f32 (-0x1.62e4p-1f)
-#define MLn2lo v_f32 (-0x1.7f7d1cp-20f)
-
-#define C(i) v_f32 (__expm1f_poly[i])
-
-static inline v_f32_t
-expm1f_inline (v_f32_t x)
-{
-  /* Helper routine for calculating exp(x) - 1.
-     Copied from v_expm1f_1u6.c, with all special-case handling removed, as
-     special, tiny and large values are all dealt with in the main tanhf
-     routine.  */
-
-  /* Reduce argument: f in [-ln2/2, ln2/2], i is exact.  */
-  v_f32_t j = v_fma_f32 (InvLn2, x, Shift) - Shift;
-  v_s32_t i = v_to_s32_f32 (j);
-  v_f32_t f = v_fma_f32 (j, MLn2hi, x);
-  f = v_fma_f32 (j, MLn2lo, f);
-
-  /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).
-     Uses Estrin scheme, where the main __v_expm1f routine uses Horner.  */
-  v_f32_t f2 = f * f;
-  v_f32_t p = ESTRIN_4 (f, f2, f2 * f2, C);
-  p = v_fma_f32 (f2, p, f);
-
-  /* t = 2^i.  */
-  v_f32_t t = v_as_f32_u32 (v_as_u32_s32 (i << 23) + One);
-  /* expm1(x) ~= p * t + (t - 1).  */
-  return v_fma_f32 (p, t, t - 1);
-}
 
 static NOINLINE v_f32_t
 special_case (v_f32_t x, v_f32_t y, v_u32_t special)
author	Joe Ramsay <Joe.Ramsay@arm.com>	2022-12-22 16:20:22 +0000
committer	Joe Ramsay <joe.ramsay@arm.com>	2022-12-22 16:20:22 +0000
commit	0a9270a27f48bea87c5bd3f0f9c759da66fb45a3 (patch)
tree	7a9eb5555187c92f1e1a49bd155241a1b41f4764
parent	3bfa7bd49c5576d5b1f9e6a79e3d3a15fe3823bc (diff)
download	arm-optimized-routines-0a9270a27f48bea87c5bd3f0f9c759da66fb45a3.tar.gz