pl/math: Add scalar atanf and set fenv in Neon atanf

The simplest way to set fenv in Neon atanf is by using a scalar fallback to under/overflow cases, however this routine did not have a scalar counterpart so we add a new one, based on the same algorithm and polynomial as the vector variants, and accurate to 2.9 ULP. This is now used as the fallback for all lanes, when any lane of the Neon input is special.
author: Joe Ramsay <Joe.Ramsay@arm.com> 2022-12-20 09:26:57 +0000
committer: Joe Ramsay <joe.ramsay@arm.com> 2022-12-20 09:26:57 +0000
commit: f312cb80e4a306d8a127a1dd78b5ee0a1ee89732 (patch)
tree: f3a011d4cbbe8faa565d49184329063820648ef7
parent: 04e91eca36b0a7dbbab78bf9401c978ab1b08b67 (diff)
download: arm-optimized-routines-f312cb80e4a306d8a127a1dd78b5ee0a1ee89732.tar.gz
4 files changed, 129 insertions, 7 deletions
diff --git a/pl/math/atanf_2u9.c b/pl/math/atanf_2u9.c
new file mode 100644
index 0000000..d7071be
--- /dev/null
+++ b/pl/math/atanf_2u9.c
@@ -0,0 +1,76 @@
+/*
+ * Single-precision atan(x) function.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "atanf_common.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define PiOver2 0x1.921fb6p+0f
+#define AbsMask 0x7fffffff
+#define TinyBound 0x30800000 /* asuint(0x1p-30).  */
+#define BigBound 0x4e800000  /* asuint(0x1p30).  */
+#define One 0x3f800000
+
+/* Approximation of single-precision atan(x) based on
+   atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1]
+   using z=-1/x and shift = pi/2.
+   Maximum error is 2.88 ulps:
+   atanf(0x1.0565ccp+0) got 0x1.97771p-1
+		       want 0x1.97770ap-1.  */
+float
+atanf (float x)
+{
+  uint32_t ix = asuint (x);
+  uint32_t sign = ix & ~AbsMask;
+  uint32_t ia = ix & AbsMask;
+
+  if (unlikely (ia < TinyBound))
+    /* Avoid underflow by returning x.  */
+    return x;
+
+  if (unlikely (ia > BigBound))
+    {
+      if (ia > 0x7f800000)
+	/* Propagate NaN.  */
+	return __math_invalidf (x);
+      /* atan(x) rounds to PiOver2 for large x.  */
+      return asfloat (asuint (PiOver2) ^ sign);
+    }
+
+  float z, az, shift;
+  if (ia > One)
+    {
+      /* For x > 1, use atan(x) = pi / 2 + atan(-1 / x).  */
+      z = -1.0f / x;
+      shift = PiOver2;
+      /* Use absolute value only when needed (odd powers of z).  */
+      az = -fabsf (z);
+    }
+  else
+    {
+      /* For x < 1, approximate atan(x) directly.  */
+      z = x;
+      az = asfloat (ia);
+      shift = 0;
+    }
+
+  /* Calculate polynomial, shift + z + z^3 * P(z^2).  */
+  float y = eval_poly (z, az, shift);
+  /* Copy sign.  */
+  return asfloat (asuint (y) ^ sign);
+}
+
+PL_SIG (S, F, 1, atan, -10.0, 10.0)
+PL_TEST_ULP (atanf, 2.38)
+PL_TEST_INTERVAL (atanf, 0, 0x1p-30, 5000)
+PL_TEST_INTERVAL (atanf, -0, -0x1p-30, 5000)
+PL_TEST_INTERVAL (atanf, 0x1p-30, 1, 40000)
+PL_TEST_INTERVAL (atanf, -0x1p-30, -1, 40000)
+PL_TEST_INTERVAL (atanf, 1, 0x1p30, 40000)
+PL_TEST_INTERVAL (atanf, -1, -0x1p30, 40000)
+PL_TEST_INTERVAL (atanf, 0x1p30, inf, 1000)
+PL_TEST_INTERVAL (atanf, -0x1p30, -inf, 1000)
diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h
index 12d72fe..44cbc73 100644
--- a/pl/math/include/mathlib.h
+++ b/pl/math/include/mathlib.h
@@ -12,6 +12,7 @@
 float acoshf (float);
 float asinhf (float);
 float atan2f (float, float);
+float atanf (float);
 float atanhf (float);
 float cbrtf (float);
 float coshf (float);
diff --git a/pl/math/test/testcases/directed/atanf.tst b/pl/math/test/testcases/directed/atanf.tst
new file mode 100644
index 0000000..8661527
--- /dev/null
+++ b/pl/math/test/testcases/directed/atanf.tst
@@ -0,0 +1,22 @@
+; atanf.tst
+;
+; Copyright 2007-2022, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=atanf op1=7fc00001 result=7fc00001 errno=0
+func=atanf op1=ffc00001 result=7fc00001 errno=0
+func=atanf op1=7f800001 result=7fc00001 errno=0 status=i
+func=atanf op1=ff800001 result=7fc00001 errno=0 status=i
+func=atanf op1=7f800000 result=3fc90fda.a22 errno=0
+func=atanf op1=ff800000 result=bfc90fda.a22 errno=0
+func=atanf op1=00000000 result=00000000 errno=0
+func=atanf op1=80000000 result=80000000 errno=0
+; Inconsistent behavior was detected for the following 2 cases.
+; No exception is raised with certain versions of glibc. Functions
+; approximated by x near zero may not generate/implement flops and
+; thus may not raise exceptions.
+func=atanf op1=00000001 result=00000001 errno=0 maybestatus=ux
+func=atanf op1=80000001 result=80000001 errno=0 maybestatus=ux
+
+func=atanf op1=3f800000 result=3f490fda.a22 errno=0
+func=atanf op1=bf800000 result=bf490fda.a22 errno=0
diff --git a/pl/math/v_atanf_3u.c b/pl/math/v_atanf_3u.c
index 3cb51b1..c61f8f8 100644
--- a/pl/math/v_atanf_3u.c
+++ b/pl/math/v_atanf_3u.c
@@ -15,6 +15,16 @@
 
 #define PiOver2 v_f32 (0x1.921fb6p+0f)
 #define AbsMask v_u32 (0x7fffffff)
+#define TinyBound 0x308 /* top12(asuint(0x1p-30)).  */
+#define BigBound 0x4e8	/* top12(asuint(0x1p30)).  */
+
+#if WANT_SIMD_EXCEPT
+static NOINLINE v_f32_t
+specialcase (v_f32_t x, v_f32_t y, v_u32_t special)
+{
+  return v_call_f32 (atanf, x, y, special);
+}
+#endif
 
 /* Fast implementation of vector atanf based on
    atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1]
@@ -23,11 +33,20 @@
 VPCS_ATTR
 v_f32_t V_NAME (atanf) (v_f32_t x)
 {
-  /* No need to trigger special case. Small cases, infs and nans
-     are supported by our approximation technique.  */
+  /* Small cases, infs and nans are supported by our approximation technique,
+     but do not set fenv flags correctly. Only trigger special case if we need
+     fenv.  */
   v_u32_t ix = v_as_u32_f32 (x);
   v_u32_t sign = ix & ~AbsMask;
 
+#if WANT_SIMD_EXCEPT
+  v_u32_t ia12 = (ix >> 20) & 0x7ff;
+  v_u32_t special = v_cond_u32 (ia12 - TinyBound > BigBound - TinyBound);
+  /* If any lane is special, fall back to the scalar routine for all lanes.  */
+  if (unlikely (v_any_u32 (special)))
+    return specialcase (x, x, v_u32 (-1));
+#endif
+
   /* Argument reduction:
      y := arctan(x) for x < 1
      y := pi/2 + arctan(-1/x) for x > 1
@@ -52,9 +71,13 @@ VPCS_ALIAS
 
 PL_SIG (V, F, 1, atan, -10.0, 10.0)
 PL_TEST_ULP (V_NAME (atanf), 2.5)
-PL_TEST_INTERVAL (V_NAME (atanf), -10.0, 10.0, 50000)
-PL_TEST_INTERVAL (V_NAME (atanf), -1.0, 1.0, 40000)
-PL_TEST_INTERVAL (V_NAME (atanf), 0.0, 1.0, 40000)
-PL_TEST_INTERVAL (V_NAME (atanf), 1.0, 100.0, 40000)
-PL_TEST_INTERVAL (V_NAME (atanf), 1e6, 1e32, 40000)
+PL_TEST_EXPECT_FENV (V_NAME (atanf), WANT_SIMD_EXCEPT)
+PL_TEST_INTERVAL (V_NAME (atanf), 0, 0x1p-30, 5000)
+PL_TEST_INTERVAL (V_NAME (atanf), -0, -0x1p-30, 5000)
+PL_TEST_INTERVAL (V_NAME (atanf), 0x1p-30, 1, 40000)
+PL_TEST_INTERVAL (V_NAME (atanf), -0x1p-30, -1, 40000)
+PL_TEST_INTERVAL (V_NAME (atanf), 1, 0x1p30, 40000)
+PL_TEST_INTERVAL (V_NAME (atanf), -1, -0x1p30, 40000)
+PL_TEST_INTERVAL (V_NAME (atanf), 0x1p30, inf, 1000)
+PL_TEST_INTERVAL (V_NAME (atanf), -0x1p30, -inf, 1000)
 #endif
author	Joe Ramsay <Joe.Ramsay@arm.com>	2022-12-20 09:26:57 +0000
committer	Joe Ramsay <joe.ramsay@arm.com>	2022-12-20 09:26:57 +0000
commit	f312cb80e4a306d8a127a1dd78b5ee0a1ee89732 (patch)
tree	f3a011d4cbbe8faa565d49184329063820648ef7
parent	04e91eca36b0a7dbbab78bf9401c978ab1b08b67 (diff)
download	arm-optimized-routines-f312cb80e4a306d8a127a1dd78b5ee0a1ee89732.tar.gz