From 47eb0a883fb82fcd394353920e2cca4d0a0ffe9d Mon Sep 17 00:00:00 2001
From: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Fri, 17 Jun 2022 11:09:34 +0100
Subject: pl/math: Add vector/Neon atan2f

Successfully ran tests and benchmarks. New routine is accurate to 3 ulps.
---
 pl/math/v_atan2f_3u.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)
 create mode 100644 pl/math/v_atan2f_3u.c

(limited to 'pl/math/v_atan2f_3u.c')

diff --git a/pl/math/v_atan2f_3u.c b/pl/math/v_atan2f_3u.c
new file mode 100644
index 0000000..4212351
--- /dev/null
+++ b/pl/math/v_atan2f_3u.c
@@ -0,0 +1,78 @@
+/*
+ * Single-precision vector atan2(x) function.
+ *
+ * Copyright (c) 2021-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#if V_SUPPORTED
+
+#include "atanf_common.h"
+
+/* Useful constants.  */
+#define PiOver2 v_f32 (0x1.921fb6p+0f)
+#define SignMask v_u32 (0x80000000)
+
+/* Special cases i.e. 0, infinity and nan (fall back to scalar calls).  */
+VPCS_ATTR
+__attribute__ ((noinline)) static v_f32_t
+specialcase (v_f32_t y, v_f32_t x, v_f32_t ret, v_u32_t cmp)
+{
+  return v_call2_f32 (atan2f, y, x, ret, cmp);
+}
+
+/* Returns 1 if i is the bits of 0, inf or nan (2 * i drops the sign; - 1 wraps 0).  */
+static inline v_u32_t
+zeroinfnan (v_u32_t i)
+{
+  return v_cond_u32 (2 * i - 1 >= v_u32 (2 * 0x7f800000lu - 1));
+}
+
+/* Fast implementation of vector atan2f. Maximum observed error is
+   2.95 ULP in [0x1.9300d6p+6 0x1.93c0c6p+6] x [0x1.8c2dbp+6 0x1.8cea6p+6]:
+   v_atan2(0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1
+				       want 0x1.967f00p-1.  */
+VPCS_ATTR
+v_f32_t V_NAME (atan2f) (v_f32_t y, v_f32_t x)
+{
+  v_u32_t ix = v_as_u32_f32 (x);
+  v_u32_t iy = v_as_u32_f32 (y);
+
+  v_u32_t special_cases = zeroinfnan (ix) | zeroinfnan (iy);
+
+  v_u32_t sign_x = ix & SignMask;
+  v_u32_t sign_y = iy & SignMask;
+  v_u32_t sign_xy = sign_x ^ sign_y;
+
+  v_f32_t ax = v_abs_f32 (x);
+  v_f32_t ay = v_abs_f32 (y);
+
+  v_u32_t pred_xlt0 = x < 0.0f;
+  v_u32_t pred_aygtax = ay > ax;
+
+  /* Set up z = n / d for eval_poly; |z| <= 1 for non-special inputs.  */
+  v_f32_t n = v_sel_f32 (pred_aygtax, -ax, ay);
+  v_f32_t d = v_sel_f32 (pred_aygtax, ay, ax);
+  v_f32_t z = v_div_f32 (n, d);
+
+  /* Work out the correct shift: -2, -1, 0 or 1 times pi/2.  */
+  v_f32_t shift = v_sel_f32 (pred_xlt0, v_f32 (-2.0f), v_f32 (0.0f));
+  shift = v_sel_f32 (pred_aygtax, shift + 1.0f, shift);
+  shift *= PiOver2;
+
+  v_f32_t ret = eval_poly (z, z, shift);
+
+  /* Account for the signs of x and y (sign_xy is sign_x ^ sign_y).  */
+  ret = v_as_f32_u32 (v_as_u32_f32 (ret) ^ sign_xy);
+
+  if (unlikely (v_any_u32 (special_cases)))
+    {
+      return specialcase (y, x, ret, special_cases);
+    }
+
+  return ret;
+}
+VPCS_ALIAS
+
+#endif
-- 
cgit v1.2.3


From 3d1a87e2fe152dc52d4a624425f5b2349a4088b0 Mon Sep 17 00:00:00 2001
From: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Mon, 15 Aug 2022 11:19:25 +0100
Subject: pl/math: Audit Neon special-case handlers

Prevent inlining in most cases - change to use AOR style (NOINLINE).
---
 pl/math/v_atan2f_3u.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'pl/math/v_atan2f_3u.c')

diff --git a/pl/math/v_atan2f_3u.c b/pl/math/v_atan2f_3u.c
index 4212351..dc0fbca 100644
--- a/pl/math/v_atan2f_3u.c
+++ b/pl/math/v_atan2f_3u.c
@@ -16,7 +16,7 @@
 
 /* Special cases i.e. 0, infinity and nan (fall back to scalar calls).  */
 VPCS_ATTR
-__attribute__ ((noinline)) static v_f32_t
+NOINLINE static v_f32_t
 specialcase (v_f32_t y, v_f32_t x, v_f32_t ret, v_u32_t cmp)
 {
   return v_call2_f32 (atan2f, y, x, ret, cmp);
-- 
cgit v1.2.3


From 1bca1a541cce13c352296acd5dfa16160fc27bc9 Mon Sep 17 00:00:00 2001
From: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Thu, 15 Dec 2022 13:27:31 +0000
Subject: pl/math: Auto-generate mathbench and ulp headers

Instead of maintaining three separate lists of routines, which
are cumbersome and prone to merge conflicts, we provide a new
macro, PL_SIG, which by some preprocessor machinery outputs the
lists in the required format (macro formats have been changed
very slightly to make the generation simpler). Only routines with
simple signatures are handled - binary functions still need
mathbench wrappers defined manually. Routines with non-standard
references (i.e. powi/powk) likewise still need entries and
wrappers defined manually.
---
 pl/math/v_atan2f_3u.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'pl/math/v_atan2f_3u.c')

diff --git a/pl/math/v_atan2f_3u.c b/pl/math/v_atan2f_3u.c
index dc0fbca..8c2c8f2 100644
--- a/pl/math/v_atan2f_3u.c
+++ b/pl/math/v_atan2f_3u.c
@@ -6,6 +6,8 @@
  */
 
 #include "v_math.h"
+#include "pl_sig.h"
+
 #if V_SUPPORTED
 
 #include "atanf_common.h"
@@ -75,4 +77,6 @@ v_f32_t V_NAME (atan2f) (v_f32_t y, v_f32_t x)
 }
 VPCS_ALIAS
 
+/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h.  */
+PL_SIG (V, F, 2, atan2)
 #endif
-- 
cgit v1.2.3


From ecb1c6f6ea7872645cb4c26514d5f64815b61a1b Mon Sep 17 00:00:00 2001
From: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Thu, 15 Dec 2022 13:27:39 +0000
Subject: pl/math: Move ULP limits to routine source files

Introduces a new set of macros and Make rules for mechanically
generating a list of ULP limits for each routine, to be consumed
by runulp.sh. This removes the need to maintain long lists of
thresholds in runulp.sh.
---
 pl/math/v_atan2f_3u.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'pl/math/v_atan2f_3u.c')

diff --git a/pl/math/v_atan2f_3u.c b/pl/math/v_atan2f_3u.c
index 8c2c8f2..3d8f9fc 100644
--- a/pl/math/v_atan2f_3u.c
+++ b/pl/math/v_atan2f_3u.c
@@ -7,6 +7,7 @@
 
 #include "v_math.h"
 #include "pl_sig.h"
+#include "pl_test.h"
 
 #if V_SUPPORTED
 
@@ -79,4 +80,5 @@ VPCS_ALIAS
 
 /* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h.  */
 PL_SIG (V, F, 2, atan2)
+PL_TEST_ULP (V_NAME (atan2f), 2.46)
 #endif
-- 
cgit v1.2.3


From 202e46317ee8983516b6413066a57bd624ffa044 Mon Sep 17 00:00:00 2001
From: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Thu, 15 Dec 2022 13:28:06 +0000
Subject: pl/math: Move test intervals to routine source files

To conclude the work on simplifying the runulp.sh script, a new macro
has been introduced to specify the intervals in which a routine should
be tested in the routine source. This is eventually consumed by
runulp.sh.
---
 pl/math/v_atan2f_3u.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'pl/math/v_atan2f_3u.c')

diff --git a/pl/math/v_atan2f_3u.c b/pl/math/v_atan2f_3u.c
index 3d8f9fc..abf8f5e 100644
--- a/pl/math/v_atan2f_3u.c
+++ b/pl/math/v_atan2f_3u.c
@@ -81,4 +81,9 @@ VPCS_ALIAS
 /* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h.  */
 PL_SIG (V, F, 2, atan2)
 PL_TEST_ULP (V_NAME (atan2f), 2.46)
+PL_TEST_INTERVAL (V_NAME (atan2f), -10.0, 10.0, 50000)
+PL_TEST_INTERVAL (V_NAME (atan2f), -1.0, 1.0, 40000)
+PL_TEST_INTERVAL (V_NAME (atan2f), 0.0, 1.0, 40000)
+PL_TEST_INTERVAL (V_NAME (atan2f), 1.0, 100.0, 40000)
+PL_TEST_INTERVAL (V_NAME (atan2f), 1e6, 1e32, 40000)
 #endif
-- 
cgit v1.2.3


From f0f80b8a19b2593491847ed87456694d789f6f80 Mon Sep 17 00:00:00 2001
From: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Fri, 6 Jan 2023 09:10:57 +0000
Subject: pl/math: Update copyright years

All files in pl/math updated to 2023.
---
 pl/math/v_atan2f_3u.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'pl/math/v_atan2f_3u.c')

diff --git a/pl/math/v_atan2f_3u.c b/pl/math/v_atan2f_3u.c
index abf8f5e..5d1e6ca 100644
--- a/pl/math/v_atan2f_3u.c
+++ b/pl/math/v_atan2f_3u.c
@@ -1,7 +1,7 @@
 /*
  * Single-precision vector atan2(x) function.
  *
- * Copyright (c) 2021-2022, Arm Limited.
+ * Copyright (c) 2021-2023, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
-- 
cgit v1.2.3