From 8a644bf15812edaba38b41ca142e8e7e328e7918 Mon Sep 17 00:00:00 2001
From: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Wed, 30 Nov 2022 09:42:41 +0000
Subject: pl/math: Add scalar and vector/Neon tanhf

Both routines use simplified inline versions of expm1f, and are
accurate to 2.6 ULP.
---
 pl/math/v_tanhf_2u6.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 93 insertions(+)
 create mode 100644 pl/math/v_tanhf_2u6.c

(limited to 'pl/math/v_tanhf_2u6.c')

diff --git a/pl/math/v_tanhf_2u6.c b/pl/math/v_tanhf_2u6.c
new file mode 100644
index 0000000..571fd8b
--- /dev/null
+++ b/pl/math/v_tanhf_2u6.c
@@ -0,0 +1,93 @@
+/*
+ * Single-precision vector tanh(x) function.
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "mathlib.h"
+
+#if V_SUPPORTED
+
+#define BoringBound                                                            \
+  0x41102cb3 /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for        \
+		negative).  */
+#define AbsMask 0x7fffffff
+#define One 0x3f800000
+
+#define Shift v_f32 (0x1.8p23f)
+#define InvLn2 v_f32 (0x1.715476p+0f)
+#define MLn2hi v_f32 (-0x1.62e4p-1f)
+#define MLn2lo v_f32 (-0x1.7f7d1cp-20f)
+
+#define C(i) v_f32 (__expm1f_poly[i])
+
+static inline v_f32_t
+expm1f_inline (v_f32_t x)
+{
+  /* Helper routine for calculating exp(x) - 1.
+     Copied from v_expm1f_1u6.c, with all special-case handling removed, as
+     special, tiny and large values are all dealt with in the main tanhf
+     routine.  */
+
+  /* Reduce argument: f in [-ln2/2, ln2/2], i is exact.  */
+  v_f32_t j = v_fma_f32 (InvLn2, x, Shift) - Shift;
+  v_s32_t i = v_to_s32_f32 (j);
+  v_f32_t f = v_fma_f32 (j, MLn2hi, x);
+  f = v_fma_f32 (j, MLn2lo, f);
+
+  /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).
+     Uses Estrin scheme, where the main __v_expm1f routine uses Horner.  */
+  v_f32_t f2 = f * f;
+  v_f32_t p_01 = v_fma_f32 (f, C (1), C (0));
+  v_f32_t p_23 = v_fma_f32 (f, C (3), C (2));
+  v_f32_t p = v_fma_f32 (f2, p_23, p_01);
+  p = v_fma_f32 (f2 * f2, C (4), p);
+  p = v_fma_f32 (f2, p, f);
+
+  /* t = 2^i.  */
+  v_f32_t t = v_as_f32_u32 (v_as_u32_s32 (i << 23) + One);
+  /* expm1(x) ~= p * t + (t - 1).  */
+  return v_fma_f32 (p, t, t - 1);
+}
+
+static NOINLINE v_f32_t
+special_case (v_f32_t x, v_f32_t y, v_u32_t special)
+{
+  return v_call_f32 (tanhf, x, y, special);
+}
+
+/* Approximation for single-precision vector tanh(x), using a simplified version
+   of expm1f. The maximum error is 2.58 ULP:
+   __v_tanhf(0x1.fa5eep-5) got 0x1.f9ba02p-5
+			  want 0x1.f9ba08p-5.  */
+VPCS_ATTR v_f32_t V_NAME (tanhf) (v_f32_t x)
+{
+  v_u32_t ix = v_as_u32_f32 (x);
+  v_u32_t iax = ix & AbsMask;
+  v_u32_t sign = ix & ~AbsMask;
+  v_u32_t is_boring = v_cond_u32 (iax > BoringBound);
+  v_f32_t boring = v_as_f32_u32 (sign | One);
+
+#if WANT_ERRNO
+  /* If errno needs to be set properly, set all special and boring lanes to 1,
+     which will trigger no exceptions, and fix them up later.  */
+  v_u32_t special = v_cond_u32 ((iax > 0x7f800000) | (iax < 0x34000000));
+  ix = v_sel_u32 (is_boring, v_u32 (One), ix);
+  if (unlikely (v_any_u32 (special)))
+    ix = v_sel_u32 (special, v_u32 (One), ix);
+#else
+  v_u32_t special = v_cond_u32 ((iax > 0x7f800000) | (iax == 0));
+#endif
+
+  /* tanh(x) = (e^2x - 1) / (e^2x + 1).  */
+  v_f32_t q = expm1f_inline (2 * v_as_f32_u32 (ix));
+  v_f32_t y = q / (q + 2);
+  y = v_sel_f32 (is_boring, boring, y);
+  if (unlikely (v_any_u32 (special)))
+    return special_case (x, y, special);
+  return y;
+}
+VPCS_ALIAS
+
+#endif
-- 
cgit v1.2.3


From bc7cc9d2a762a26b2fcbf150b3fc9c6993ffa16c Mon Sep 17 00:00:00 2001
From: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Fri, 9 Dec 2022 12:19:38 +0000
Subject: pl/math: Add polynomial helpers

Add macros for simplifying polynomial evaluation using either Horner,
pairwise Horner or Estrin. Several routines have been modified to use
the new helpers. Readability is improved slightly, and we expect that
this will make prototyping new routines simpler.
---
 pl/math/v_tanhf_2u6.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'pl/math/v_tanhf_2u6.c')

diff --git a/pl/math/v_tanhf_2u6.c b/pl/math/v_tanhf_2u6.c
index 571fd8b..67e4520 100644
--- a/pl/math/v_tanhf_2u6.c
+++ b/pl/math/v_tanhf_2u6.c
@@ -6,6 +6,7 @@
 
 #include "v_math.h"
 #include "mathlib.h"
+#include "estrinf.h"
 
 #if V_SUPPORTED
 
@@ -39,10 +40,7 @@ expm1f_inline (v_f32_t x)
   /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).
      Uses Estrin scheme, where the main __v_expm1f routine uses Horner.  */
   v_f32_t f2 = f * f;
-  v_f32_t p_01 = v_fma_f32 (f, C (1), C (0));
-  v_f32_t p_23 = v_fma_f32 (f, C (3), C (2));
-  v_f32_t p = v_fma_f32 (f2, p_23, p_01);
-  p = v_fma_f32 (f2 * f2, C (4), p);
+  v_f32_t p = ESTRIN_4 (f, f2, f2 * f2, C);
   p = v_fma_f32 (f2, p, f);
 
   /* t = 2^i.  */
-- 
cgit v1.2.3


From 1bca1a541cce13c352296acd5dfa16160fc27bc9 Mon Sep 17 00:00:00 2001
From: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Thu, 15 Dec 2022 13:27:31 +0000
Subject: pl/math: Auto-generate mathbench and ulp headers

Instead of maintaining three separate lists of routines, which
are cumbersome and prone to merge conflicts, we provide a new
macro, PL_SIG, which by some preprocessor machinery outputs the
lists in the required format (macro formats have been changed
very slightly to make the generation simpler). Only routines with
simple signatures are handled - binary functions still need
mathbench wrappers defined manually. Likewise, routines with
non-standard references (i.e. powi/powk) still need entries and
wrappers defined manually.
---
 pl/math/v_tanhf_2u6.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'pl/math/v_tanhf_2u6.c')

diff --git a/pl/math/v_tanhf_2u6.c b/pl/math/v_tanhf_2u6.c
index 67e4520..1196c4a 100644
--- a/pl/math/v_tanhf_2u6.c
+++ b/pl/math/v_tanhf_2u6.c
@@ -5,8 +5,9 @@
  */
 
 #include "v_math.h"
-#include "mathlib.h"
 #include "estrinf.h"
+#include "mathlib.h"
+#include "pl_sig.h"
 
 #if V_SUPPORTED
 
@@ -88,4 +89,5 @@ VPCS_ATTR v_f32_t V_NAME (tanhf) (v_f32_t x)
 }
 VPCS_ALIAS
 
+PL_SIG (V, F, 1, tanh, -10.0, 10.0)
 #endif
-- 
cgit v1.2.3


From ecb1c6f6ea7872645cb4c26514d5f64815b61a1b Mon Sep 17 00:00:00 2001
From: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Thu, 15 Dec 2022 13:27:39 +0000
Subject: pl/math: Move ULP limits to routine source files

Introduces a new set of macros and Make rules for mechanically
generating a list of ULP limits for each routine, to be consumed
by runulp.sh. This removes the need to maintain long lists of
thresholds in runulp.sh.
---
 pl/math/v_tanhf_2u6.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'pl/math/v_tanhf_2u6.c')

diff --git a/pl/math/v_tanhf_2u6.c b/pl/math/v_tanhf_2u6.c
index 1196c4a..bb86794 100644
--- a/pl/math/v_tanhf_2u6.c
+++ b/pl/math/v_tanhf_2u6.c
@@ -8,6 +8,7 @@
 #include "estrinf.h"
 #include "mathlib.h"
 #include "pl_sig.h"
+#include "pl_test.h"
 
 #if V_SUPPORTED
 
@@ -90,4 +91,5 @@ VPCS_ATTR v_f32_t V_NAME (tanhf) (v_f32_t x)
 VPCS_ALIAS
 
 PL_SIG (V, F, 1, tanh, -10.0, 10.0)
+PL_TEST_ULP (V_NAME (tanhf), 2.09)
 #endif
-- 
cgit v1.2.3


From d748e1520dd2ff5ad3574bd0827cdd882bf6bed8 Mon Sep 17 00:00:00 2001
From: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Thu, 15 Dec 2022 13:27:57 +0000
Subject: pl/math: Move fenv expectations out of runulp.sh

Introduces a new macro, similar to how ULP thresholds are now
handled, that emits a list of routines which are expected to
correctly trigger fenv exceptions, to be consumed by runulp.sh.
All scalar routines are expected to do so. A small number of Neon
routines are also expected to, dependent on WANT_ERRNO.
---
 pl/math/v_tanhf_2u6.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'pl/math/v_tanhf_2u6.c')

diff --git a/pl/math/v_tanhf_2u6.c b/pl/math/v_tanhf_2u6.c
index bb86794..ae87f50 100644
--- a/pl/math/v_tanhf_2u6.c
+++ b/pl/math/v_tanhf_2u6.c
@@ -92,4 +92,5 @@ VPCS_ALIAS
 
 PL_SIG (V, F, 1, tanh, -10.0, 10.0)
 PL_TEST_ULP (V_NAME (tanhf), 2.09)
+PL_TEST_EXPECT_FENV (V_NAME (tanhf), WANT_ERRNO)
 #endif
-- 
cgit v1.2.3


From 202e46317ee8983516b6413066a57bd624ffa044 Mon Sep 17 00:00:00 2001
From: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Thu, 15 Dec 2022 13:28:06 +0000
Subject: pl/math: Move test intervals to routine source files

To conclude the work on simplifying the runulp.sh script, a new macro
has been introduced to specify, in the routine source, the intervals
in which a routine should be tested. These intervals are eventually
consumed by runulp.sh.
---
 pl/math/v_tanhf_2u6.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'pl/math/v_tanhf_2u6.c')

diff --git a/pl/math/v_tanhf_2u6.c b/pl/math/v_tanhf_2u6.c
index ae87f50..c10be40 100644
--- a/pl/math/v_tanhf_2u6.c
+++ b/pl/math/v_tanhf_2u6.c
@@ -93,4 +93,10 @@ VPCS_ALIAS
 PL_SIG (V, F, 1, tanh, -10.0, 10.0)
 PL_TEST_ULP (V_NAME (tanhf), 2.09)
 PL_TEST_EXPECT_FENV (V_NAME (tanhf), WANT_ERRNO)
+PL_TEST_INTERVAL (V_NAME (tanhf), 0, 0x1p-23, 1000)
+PL_TEST_INTERVAL (V_NAME (tanhf), -0, -0x1p-23, 1000)
+PL_TEST_INTERVAL (V_NAME (tanhf), 0x1p-23, 0x1.205966p+3, 100000)
+PL_TEST_INTERVAL (V_NAME (tanhf), -0x1p-23, -0x1.205966p+3, 100000)
+PL_TEST_INTERVAL (V_NAME (tanhf), 0x1.205966p+3, inf, 100)
+PL_TEST_INTERVAL (V_NAME (tanhf), -0x1.205966p+3, -inf, 100)
 #endif
-- 
cgit v1.2.3


From d05594e6718e6d86959c823bea4f019dea878bcb Mon Sep 17 00:00:00 2001
From: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Mon, 19 Dec 2022 12:34:51 +0000
Subject: pl/math: Replace WANT_ERRNO with WANT_SIMD_EXCEPT for Neon fenv

We were previously misusing the WANT_ERRNO build flag. This is now
replaced everywhere appropriate with WANT_SIMD_EXCEPT. A small number
of vector routines get fp exceptions right with no modification - the
tests have been updated to track this.
---
 pl/math/v_tanhf_2u6.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'pl/math/v_tanhf_2u6.c')

diff --git a/pl/math/v_tanhf_2u6.c b/pl/math/v_tanhf_2u6.c
index c10be40..dedc085 100644
--- a/pl/math/v_tanhf_2u6.c
+++ b/pl/math/v_tanhf_2u6.c
@@ -69,9 +69,9 @@ VPCS_ATTR v_f32_t V_NAME (tanhf) (v_f32_t x)
   v_u32_t is_boring = v_cond_u32 (iax > BoringBound);
   v_f32_t boring = v_as_f32_u32 (sign | One);
 
-#if WANT_ERRNO
-  /* If errno needs to be set properly, set all special and boring lanes to 1,
-     which will trigger no exceptions, and fix them up later.  */
+#if WANT_SIMD_EXCEPT
+  /* If fp exceptions are to be triggered properly, set all special and boring
+     lanes to 1, which will trigger no exceptions, and fix them up later.  */
   v_u32_t special = v_cond_u32 ((iax > 0x7f800000) | (iax < 0x34000000));
   ix = v_sel_u32 (is_boring, v_u32 (One), ix);
   if (unlikely (v_any_u32 (special)))
@@ -92,7 +92,7 @@ VPCS_ALIAS
 
 PL_SIG (V, F, 1, tanh, -10.0, 10.0)
 PL_TEST_ULP (V_NAME (tanhf), 2.09)
-PL_TEST_EXPECT_FENV (V_NAME (tanhf), WANT_ERRNO)
+PL_TEST_EXPECT_FENV (V_NAME (tanhf), WANT_SIMD_EXCEPT)
 PL_TEST_INTERVAL (V_NAME (tanhf), 0, 0x1p-23, 1000)
 PL_TEST_INTERVAL (V_NAME (tanhf), -0, -0x1p-23, 1000)
 PL_TEST_INTERVAL (V_NAME (tanhf), 0x1p-23, 0x1.205966p+3, 100000)
-- 
cgit v1.2.3


From 0a9270a27f48bea87c5bd3f0f9c759da66fb45a3 Mon Sep 17 00:00:00 2001
From: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Thu, 22 Dec 2022 16:20:22 +0000
Subject: pl/math: Fix fp exceptions in Neon sinhf and sinh

Both routines previously relied on the vector expm1(f) routine exposed
by the library, which depended on WANT_SIMD_EXCEPT for its fenv
behaviour; however, both routines were expected to always trigger fp
exceptions correctly. To remedy this, both routines now use an inlined
helper for expm1 (reused from vector tanhf in the case of sinhf), and
special-case small input as well as large when WANT_SIMD_EXCEPT is
enabled.
---
 pl/math/v_tanhf_2u6.c | 38 ++------------------------------------
 1 file changed, 2 insertions(+), 36 deletions(-)

(limited to 'pl/math/v_tanhf_2u6.c')

diff --git a/pl/math/v_tanhf_2u6.c b/pl/math/v_tanhf_2u6.c
index dedc085..0e7ff69 100644
--- a/pl/math/v_tanhf_2u6.c
+++ b/pl/math/v_tanhf_2u6.c
@@ -5,51 +5,17 @@
  */
 
 #include "v_math.h"
-#include "estrinf.h"
-#include "mathlib.h"
 #include "pl_sig.h"
 #include "pl_test.h"
 
 #if V_SUPPORTED
 
+#include "v_expm1f_inline.h"
+
 #define BoringBound                                                            \
   0x41102cb3 /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for        \
 		negative).  */
 #define AbsMask 0x7fffffff
-#define One 0x3f800000
-
-#define Shift v_f32 (0x1.8p23f)
-#define InvLn2 v_f32 (0x1.715476p+0f)
-#define MLn2hi v_f32 (-0x1.62e4p-1f)
-#define MLn2lo v_f32 (-0x1.7f7d1cp-20f)
-
-#define C(i) v_f32 (__expm1f_poly[i])
-
-static inline v_f32_t
-expm1f_inline (v_f32_t x)
-{
-  /* Helper routine for calculating exp(x) - 1.
-     Copied from v_expm1f_1u6.c, with all special-case handling removed, as
-     special, tiny and large values are all dealt with in the main tanhf
-     routine.  */
-
-  /* Reduce argument: f in [-ln2/2, ln2/2], i is exact.  */
-  v_f32_t j = v_fma_f32 (InvLn2, x, Shift) - Shift;
-  v_s32_t i = v_to_s32_f32 (j);
-  v_f32_t f = v_fma_f32 (j, MLn2hi, x);
-  f = v_fma_f32 (j, MLn2lo, f);
-
-  /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).
-     Uses Estrin scheme, where the main __v_expm1f routine uses Horner.  */
-  v_f32_t f2 = f * f;
-  v_f32_t p = ESTRIN_4 (f, f2, f2 * f2, C);
-  p = v_fma_f32 (f2, p, f);
-
-  /* t = 2^i.  */
-  v_f32_t t = v_as_f32_u32 (v_as_u32_s32 (i << 23) + One);
-  /* expm1(x) ~= p * t + (t - 1).  */
-  return v_fma_f32 (p, t, t - 1);
-}
 
 static NOINLINE v_f32_t
 special_case (v_f32_t x, v_f32_t y, v_u32_t special)
-- 
cgit v1.2.3


From f0f80b8a19b2593491847ed87456694d789f6f80 Mon Sep 17 00:00:00 2001
From: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Fri, 6 Jan 2023 09:10:57 +0000
Subject: pl/math: Update copyright years

All files in pl/math updated to 2023.
---
 pl/math/v_tanhf_2u6.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'pl/math/v_tanhf_2u6.c')

diff --git a/pl/math/v_tanhf_2u6.c b/pl/math/v_tanhf_2u6.c
index 0e7ff69..3616611 100644
--- a/pl/math/v_tanhf_2u6.c
+++ b/pl/math/v_tanhf_2u6.c
@@ -1,6 +1,7 @@
 /*
  * Single-precision vector tanh(x) function.
- * Copyright (c) 2022, Arm Limited.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
-- 
cgit v1.2.3