4 files changed, 117 insertions, 8 deletions
diff --git a/pl/math/atan_2u5.c b/pl/math/atan_2u5.c
new file mode 100644
index 0000000..99fea0f
--- /dev/null
+++ b/pl/math/atan_2u5.c
@@ -0,0 +1,73 @@
+/*
+ * Double-precision atan(x) function.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "pl_sig.h"
+#include "pl_test.h"
+#include "atan_common.h"
+
+#define AbsMask 0x7fffffffffffffff
+#define PiOver2 0x1.921fb54442d18p+0
+#define TinyBound 0x3e1 /* top12(asuint64(0x1p-30)).  */
+#define BigBound 0x434	/* top12(asuint64(0x1p53)).  */
+#define OneTop 0x3ff
+
+/* Fast implementation of double-precision atan.
+   Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using
+   z=1/x and shift = pi/2. Maximum observed error is 2.27 ulps:
+   atan(0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1
+			     want 0x1.9225645bdd7c3p-1.  */
+double
+atan (double x)
+{
+  uint64_t ix = asuint64 (x);
+  uint64_t sign = ix & ~AbsMask;
+  uint64_t ia = ix & AbsMask;
+  uint32_t ia12 = ia >> 52;
+
+  if (unlikely (ia12 >= BigBound || ia12 < TinyBound))
+    {
+      if (ia12 < TinyBound)
+	/* Avoid underflow by returning x.  */
+	return x;
+      if (ia > 0x7ff0000000000000)
+	/* Propagate NaN.  */
+	return __math_invalid (x);
+      /* atan(x) rounds to PiOver2 for large x.  */
+      return asdouble (asuint64 (PiOver2) ^ sign);
+    }
+
+  double z, az, shift;
+  if (ia12 >= OneTop)
+    {
+      /* For x > 1, use atan(x) = pi / 2 + atan(-1 / x).  */
+      z = -1.0 / x;
+      shift = PiOver2;
+      /* Use absolute value only when needed (odd powers of z).  */
+      az = -fabs (z);
+    }
+  else
+    {
+      /* For x < 1, approximate atan(x) directly.  */
+      z = x;
+      shift = 0;
+      az = asdouble (ia);
+    }
+
+  /* Calculate polynomial, shift + z + z^3 * P(z^2).  */
+  double y = eval_poly (z, az, shift);
+  /* Copy sign.  */
+  return asdouble (asuint64 (y) ^ sign);
+}
+
+PL_SIG (S, D, 1, atan, -10.0, 10.0)
+PL_TEST_ULP (atan, 1.78)
+PL_TEST_INTERVAL (atan, 0, 0x1p-30, 10000)
+PL_TEST_INTERVAL (atan, -0, -0x1p-30, 1000)
+PL_TEST_INTERVAL (atan, 0x1p-30, 0x1p53, 900000)
+PL_TEST_INTERVAL (atan, -0x1p-30, -0x1p53, 90000)
+PL_TEST_INTERVAL (atan, 0x1p53, inf, 10000)
+PL_TEST_INTERVAL (atan, -0x1p53, -inf, 1000)
diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h
index 44cbc73..05fa306 100644
--- a/pl/math/include/mathlib.h
+++ b/pl/math/include/mathlib.h
@@ -27,6 +27,7 @@ float tanhf (float);
 
 double acosh (double);
 double asinh (double);
+double atan (double);
 double atan2 (double, double);
 double cbrt (double);
 double cosh (double);
diff --git a/pl/math/test/testcases/directed/atan.tst b/pl/math/test/testcases/directed/atan.tst
new file mode 100644
index 0000000..5716276
--- /dev/null
+++ b/pl/math/test/testcases/directed/atan.tst
@@ -0,0 +1,22 @@
+; atan.tst
+;
+; Copyright 1999-2022, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=atan op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
+func=atan op1=fff80000.00000001 result=7ff80000.00000001 errno=0
+func=atan op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=atan op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=atan op1=7ff00000.00000000 result=3ff921fb.54442d18.469 errno=0
+func=atan op1=fff00000.00000000 result=bff921fb.54442d18.469 errno=0
+func=atan op1=00000000.00000000 result=00000000.00000000 errno=0
+func=atan op1=80000000.00000000 result=80000000.00000000 errno=0
+; Inconsistent behavior was detected for the following 2 cases.
+; No exception is raised with certain versions of glibc. Functions
+; approximated by x near zero may not generate/implement flops and
+; thus may not raise exceptions.
+func=atan op1=00000000.00000001 result=00000000.00000001 errno=0 maybestatus=ux
+func=atan op1=80000000.00000001 result=80000000.00000001 errno=0 maybestatus=ux
+
+func=atan op1=3ff00000.00000000 result=3fe921fb.54442d18.469 errno=0
+func=atan op1=bff00000.00000000 result=bfe921fb.54442d18.469 errno=0
diff --git a/pl/math/v_atan_2u5.c b/pl/math/v_atan_2u5.c
index 43b4abd..92479ab 100644
--- a/pl/math/v_atan_2u5.c
+++ b/pl/math/v_atan_2u5.c
@@ -15,6 +15,8 @@
 
 #define PiOver2 v_f64 (0x1.921fb54442d18p+0)
 #define AbsMask v_u64 (0x7fffffffffffffff)
+#define TinyBound 0x3e1 /* top12(asuint64(0x1p-30)).  */
+#define BigBound 0x434	/* top12(asuint64(0x1p53)).  */
 
 /* Fast implementation of vector atan.
    Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using
@@ -24,11 +26,20 @@
 VPCS_ATTR
 v_f64_t V_NAME (atan) (v_f64_t x)
 {
-  /* No need to trigger special case. Small cases, infs and nans
-     are supported by our approximation technique.  */
+  /* Small cases, infs and nans are supported by our approximation technique,
+     but do not set fenv flags correctly. Only trigger special case if we need
+     fenv.  */
   v_u64_t ix = v_as_u64_f64 (x);
   v_u64_t sign = ix & ~AbsMask;
 
+#if WANT_SIMD_EXCEPT
+  v_u64_t ia12 = (ix >> 52) & 0x7ff;
+  v_u64_t special = v_cond_u64 (ia12 - TinyBound > BigBound - TinyBound);
+  /* If any lane is special, fall back to the scalar routine for all lanes.  */
+  if (unlikely (v_any_u64 (special)))
+    return v_call_f64 (atan, x, v_f64 (0), v_u64 (-1));
+#endif
+
   /* Argument reduction:
      y := arctan(x) for x < 1
      y := pi/2 + arctan(-1/x) for x > 1
@@ -46,16 +57,18 @@ v_f64_t V_NAME (atan) (v_f64_t x)
 
   /* y = atan(x) if x>0, -atan(-x) otherwise.  */
   y = v_as_f64_u64 (v_as_u64_f64 (y) ^ sign);
-
   return y;
 }
 VPCS_ALIAS
 
 PL_SIG (V, D, 1, atan, -10.0, 10.0)
 PL_TEST_ULP (V_NAME (atan), 1.78)
-PL_TEST_INTERVAL (V_NAME (atan), -10.0, 10.0, 50000)
-PL_TEST_INTERVAL (V_NAME (atan), -1.0, 1.0, 40000)
-PL_TEST_INTERVAL (V_NAME (atan), 0.0, 1.0, 40000)
-PL_TEST_INTERVAL (V_NAME (atan), 1.0, 100.0, 40000)
-PL_TEST_INTERVAL (V_NAME (atan), 1e6, 1e32, 40000)
+PL_TEST_EXPECT_FENV (V_NAME (atan), WANT_SIMD_EXCEPT)
+PL_TEST_INTERVAL (V_NAME (atan), 0, 0x1p-30, 10000)
+PL_TEST_INTERVAL (V_NAME (atan), -0, -0x1p-30, 1000)
+PL_TEST_INTERVAL (V_NAME (atan), 0x1p-30, 0x1p53, 900000)
+PL_TEST_INTERVAL (V_NAME (atan), -0x1p-30, -0x1p53, 90000)
+PL_TEST_INTERVAL (V_NAME (atan), 0x1p53, inf, 10000)
+PL_TEST_INTERVAL (V_NAME (atan), -0x1p53, -inf, 1000)
+
 #endif