diff options
-rw-r--r--  pl/math/include/mathlib.h        |  4 ++++
-rw-r--r--  pl/math/s_atan2f_3u.c            |  6 ++++++
-rw-r--r--  pl/math/test/mathbench_funcs.h   |  5 +++++
-rw-r--r--  pl/math/test/mathbench_wrappers.h | 24 ++++++++++++++++++++
-rwxr-xr-x  pl/math/test/runulp.sh           | 13 +++++++++++
-rw-r--r--  pl/math/test/ulp_funcs.h         |  4 ++++
-rw-r--r--  pl/math/test/ulp_wrappers.h      |  3 +++
-rw-r--r--  pl/math/v_atan2f_3u.c            | 78 ++++++++++++++++++++++++++++
-rw-r--r--  pl/math/v_math.h                 | 10 ++++++++
-rw-r--r--  pl/math/vn_atan2f_3u.c           | 12 ++++++++++
10 files changed, 159 insertions(+), 0 deletions(-)
diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index cda3371..baee70f 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -17,6 +17,7 @@ float log10f (float); double atan2 (double, double); double log10 (double); +float __s_atan2f (float, float); float __s_erfcf (float); float __s_erff (float); float __s_log10f (float); @@ -40,6 +41,7 @@ typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t; /* Vector functions following the base PCS. */ __f64x2_t __v_atan (__f64x2_t); +__f32x4_t __v_atan2f (__f32x4_t, __f32x4_t); __f64x2_t __v_atan2 (__f64x2_t, __f64x2_t); __f32x4_t __v_erff (__f32x4_t); __f64x2_t __v_erf (__f64x2_t); @@ -53,6 +55,7 @@ __f64x2_t __v_log10 (__f64x2_t); /* Vector functions following the vector PCS. */ __vpcs __f64x2_t __vn_atan (__f64x2_t); +__vpcs __f32x4_t __vn_atan2f (__f32x4_t, __f32x4_t); __vpcs __f64x2_t __vn_atan2 (__f64x2_t, __f64x2_t); __vpcs __f32x4_t __vn_erff (__f32x4_t); __vpcs __f64x2_t __vn_erf (__f64x2_t); @@ -63,6 +66,7 @@ __vpcs __f64x2_t __vn_log10 (__f64x2_t); /* Vector functions following the vector PCS using ABI names. */ __vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t); __vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t); __vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_erf (__f64x2_t); diff --git a/pl/math/s_atan2f_3u.c b/pl/math/s_atan2f_3u.c new file mode 100644 index 0000000..5002d32 --- /dev/null +++ b/pl/math/s_atan2f_3u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2021-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_atan2f_3u.c" diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index f2befc8..f1455aa 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -18,6 +18,7 @@ D (log10, 0.01, 11.1) #if WANT_VMATH D (__s_atan, -10.0, 10.0) +{"__s_atan2f", 'f', 0, -10.0, 10.0, {.f = __s_atan2f_wrap}}, {"__s_atan2", 'd', 0, -10.0, 10.0, {.d = __s_atan2_wrap}}, F (__s_erff, -4.0, 4.0) D (__s_erf, -6.0, 6.0) @@ -27,6 +28,7 @@ F (__s_log10f, 0.01, 11.1) D (__s_log10, 0.01, 11.1) #if __aarch64__ VD (__v_atan, -10.0, 10.0) +{"__v_atan2f", 'f', 'v', -10.0, 10.0, {.vf = __v_atan2f_wrap}}, {"__v_atan2", 'd', 'v', -10.0, 10.0, {.vd = __v_atan2_wrap}}, VF (__v_erff, -4.0, 4.0) VD (__v_erf, -6.0, 6.0) @@ -38,6 +40,9 @@ VF (__v_log10f, 0.01, 11.1) VND (__vn_atan, -10.0, 10.0) VND (_ZGVnN2v_atan, -10.0, 10.0) +{"__vn_atan2f", 'f', 'n', -10.0, 10.0, {.vnf = __vn_atan2f_wrap}}, +{"_ZGVnN4vv_atan2f", 'f', 'n', -10.0, 10.0, {.vnf = _Z_atan2f_wrap}}, + {"__vn_atan2", 'd', 'n', -10.0, 10.0, {.vnd = __vn_atan2_wrap}}, {"_ZGVnN2vv_atan2", 'd', 'n', -10.0, 10.0, {.vnd = _Z_atan2_wrap}}, diff --git a/pl/math/test/mathbench_wrappers.h b/pl/math/test/mathbench_wrappers.h index 57c0c8f..37edeae 100644 --- a/pl/math/test/mathbench_wrappers.h +++ b/pl/math/test/mathbench_wrappers.h @@ -26,12 +26,24 @@ __s_atan2_wrap (double x) return __s_atan2 (5.0, x); } +static float +__s_atan2f_wrap (float x) +{ + return __s_atan2f (5.0f, x); +} + static v_double __v_atan2_wrap (v_double x) { return __v_atan2 (v_double_dup (5.0), x); } +static v_float +__v_atan2f_wrap (v_float x) +{ + return __v_atan2f (v_float_dup (5.0f), x); +} + #ifdef __vpcs __vpcs static v_double @@ -40,12 +52,24 @@ __vn_atan2_wrap (v_double x) return __vn_atan2 (v_double_dup (5.0), x); } +__vpcs static v_float +__vn_atan2f_wrap (v_float x) +{ + return __vn_atan2f (v_float_dup (5.0f), x); +} + __vpcs 
static v_double _Z_atan2_wrap (v_double x) { return _ZGVnN2vv_atan2 (v_double_dup (5.0), x); } +__vpcs static v_float +_Z_atan2f_wrap (v_float x) +{ + return _ZGVnN4vv_atan2f (v_float_dup (5.0f), x); +} + #endif // __vpcs #endif // __arch64__ #endif // WANT_VMATH diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 7ad2318..66ac3bf 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -161,6 +161,14 @@ range_atan=' 1e6 1e32 40000 ' +range_atan2f=' + -10.0 10.0 50000 + -1.0 1.0 40000 + 0.0 1.0 40000 + 1.0 100.0 40000 + 1e6 1e32 40000 +' + # error limits L_erfc=3.7 L_erfcf=1.0 @@ -170,6 +178,7 @@ L_erf=1.76 L_erff=1.5 L_atan2=2.9 L_atan=3.0 +L_atan2f=3.0 while read G F R do @@ -209,6 +218,10 @@ log10 __v_log10 $runv log10 __vn_log10 $runvn log10 _ZGVnN2v_log10 $runvn +atan2f __s_atan2f $runs +atan2f __v_atan2f $runv +atan2f __vn_atan2f $runvn +atan2f _ZGVnN4vv_atan2f $runvn erff __s_erff $runs erff __v_erff $runv erff __vn_erff $runvn diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 244c96b..5bc7411 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -13,6 +13,7 @@ D1 (erfc) D1 (log10) #if WANT_VMATH F (__s_atan, __s_atan, atanl, mpfr_atan, 1, 0, d1, 0) +F (__s_atan2f, __s_atan2f, atan2, mpfr_atan2, 2, 1, f2, 0) F (__s_atan2, __s_atan2, atan2l, mpfr_atan2, 2, 0, d2, 0) F (__s_erff, __s_erff, erf, mpfr_erf, 1, 1, f1, 0) F (__s_erf, __s_erf, erfl, mpfr_erf, 1, 0, d1, 0) @@ -22,6 +23,7 @@ F (__s_log10f, __s_log10f, log10, mpfr_log10, 1, 1, f1, 0) F (__s_log10, __s_log10, log10l, mpfr_log10, 1, 0, d1, 0) #if __aarch64__ F (__v_atan, v_atan, atanl, mpfr_atan, 1, 0, d1, 1) +F (__v_atan2f, v_atan2f, atan2, mpfr_atan2, 2, 1, f2, 1) F (__v_atan2, v_atan2, atan2l, mpfr_atan2, 2, 0, d2, 1) F (__v_erff, v_erff, erf, mpfr_erf, 1, 1, f1, 1) F (__v_erf, v_erf, erfl, mpfr_erf, 1, 0, d1, 1) @@ -31,6 +33,7 @@ F (__v_log10f, v_log10f, log10, mpfr_log10, 1, 1, f1, 1) F (__v_log10, v_log10, log10l, mpfr_log10, 1, 0, 
d1, 1) #ifdef __vpcs F (__vn_atan, vn_atan, atanl, mpfr_atan, 1, 0, d1, 1) +F (__vn_atan2f, vn_atan2f, atan2, mpfr_atan2, 2, 1, f2, 1) F (__vn_atan2, vn_atan2, atan2l, mpfr_atan2, 2, 0, d2, 1) F (__vn_erff, vn_erff, erf, mpfr_erf, 1, 1, f1, 1) F (__vn_erf, vn_erf, erfl, mpfr_erf, 1, 0, d1, 1) @@ -39,6 +42,7 @@ F (__vn_erfc, vn_erfc, erfcl, mpfr_erfc, 1, 0, d1, 1) F (__vn_log10f, vn_log10f, log10, mpfr_log10, 1, 1, f1, 1) F (__vn_log10, vn_log10, log10l, mpfr_log10, 1, 0, d1, 1) F (_ZGVnN2v_atan, Z_atan, atanl, mpfr_atan, 1, 0, d1, 1) +F (_ZGVnN4vv_atan2f, Z_atan2f, atan2, mpfr_atan2, 2, 1, f2, 1) F (_ZGVnN2vv_atan2, Z_atan2, atan2l, mpfr_atan2, 2, 0, d2, 1) F (_ZGVnN4v_erff, Z_erff, erf, mpfr_erf, 1, 1, f1, 1) F (_ZGVnN2v_erf, Z_erf, erfl, mpfr_erf, 1, 0, d1, 1) diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index f1bfdf2..0603e45 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -14,6 +14,7 @@ static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y, /* Wrappers for vector functions. 
*/ #if __aarch64__ && WANT_VMATH +static float v_atan2f(float x, float y) { return __v_atan2f(argf(x), argf(y))[0]; } static float v_erff(float x) { return __v_erff(argf(x))[0]; } static float v_erfcf(float x) { return __v_erfcf(argf(x))[0]; } static float v_log10f(float x) { return __v_log10f(argf(x))[0]; } @@ -23,6 +24,7 @@ static double v_erf(double x) { return __v_erf(argd(x))[0]; } static double v_erfc(double x) { return __v_erfc(argd(x))[0]; } static double v_log10(double x) { return __v_log10(argd(x))[0]; } #ifdef __vpcs +static float vn_atan2f(float x, float y) { return __vn_atan2f(argf(x), argf(y))[0]; } static float vn_erff(float x) { return __vn_erff(argf(x))[0]; } static float vn_erfcf(float x) { return __vn_erfcf(argf(x))[0]; } static float vn_log10f(float x) { return __vn_log10f(argf(x))[0]; } @@ -32,6 +34,7 @@ static double vn_erf(double x) { return __vn_erf(argd(x))[0]; } static double vn_erfc(double x) { return __vn_erfc(argd(x))[0]; } static double vn_log10(double x) { return __vn_log10(argd(x))[0]; } +static float Z_atan2f(float x, float y) { return _ZGVnN4vv_atan2f(argf(x), argf(y))[0]; } static float Z_erff(float x) { return _ZGVnN4v_erff(argf(x))[0]; } static float Z_erfcf(float x) { return _ZGVnN4v_erfcf(argf(x))[0]; } static float Z_log10f(float x) { return _ZGVnN4v_log10f(argf(x))[0]; } diff --git a/pl/math/v_atan2f_3u.c b/pl/math/v_atan2f_3u.c new file mode 100644 index 0000000..4212351 --- /dev/null +++ b/pl/math/v_atan2f_3u.c @@ -0,0 +1,78 @@ +/* + * Single-precision vector atan2(x) function. + * + * Copyright (c) 2021-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#if V_SUPPORTED + +#include "atanf_common.h" + +/* Useful constants. */ +#define PiOver2 v_f32 (0x1.921fb6p+0f) +#define SignMask v_u32 (0x80000000) + +/* Special cases i.e. 0, infinity and nan (fall back to scalar calls). 
*/ +VPCS_ATTR +__attribute__ ((noinline)) static v_f32_t +specialcase (v_f32_t y, v_f32_t x, v_f32_t ret, v_u32_t cmp) +{ + return v_call2_f32 (atan2f, y, x, ret, cmp); +} + +/* Returns 1 if input is the bit representation of 0, infinity or nan. */ +static inline v_u32_t +zeroinfnan (v_u32_t i) +{ + return v_cond_u32 (2 * i - 1 >= v_u32 (2 * 0x7f800000lu - 1)); +} + +/* Fast implementation of vector atan2f. Maximum observed error is + 2.95 ULP in [0x1.9300d6p+6 0x1.93c0c6p+6] x [0x1.8c2dbp+6 0x1.8cea6p+6]: + v_atan2(0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1 + want 0x1.967f00p-1. */ +VPCS_ATTR +v_f32_t V_NAME (atan2f) (v_f32_t y, v_f32_t x) +{ + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t iy = v_as_u32_f32 (y); + + v_u32_t special_cases = zeroinfnan (ix) | zeroinfnan (iy); + + v_u32_t sign_x = ix & SignMask; + v_u32_t sign_y = iy & SignMask; + v_u32_t sign_xy = sign_x ^ sign_y; + + v_f32_t ax = v_abs_f32 (x); + v_f32_t ay = v_abs_f32 (y); + + v_u32_t pred_xlt0 = x < 0.0f; + v_u32_t pred_aygtax = ay > ax; + + /* Set up z for call to atanf. */ + v_f32_t n = v_sel_f32 (pred_aygtax, -ax, ay); + v_f32_t d = v_sel_f32 (pred_aygtax, ay, ax); + v_f32_t z = v_div_f32 (n, d); + + /* Work out the correct shift. */ + v_f32_t shift = v_sel_f32 (pred_xlt0, v_f32 (-2.0f), v_f32 (0.0f)); + shift = v_sel_f32 (pred_aygtax, shift + 1.0f, shift); + shift *= PiOver2; + + v_f32_t ret = eval_poly (z, z, shift); + + /* Account for the sign of y. 
*/ + ret = v_as_f32_u32 (v_as_u32_f32 (ret) ^ sign_xy); + + if (unlikely (v_any_u32 (special_cases))) + { + return specialcase (y, x, ret, special_cases); + } + + return ret; +} +VPCS_ALIAS + +#endif diff --git a/pl/math/v_math.h b/pl/math/v_math.h index 3733557..ddc5dab 100644 --- a/pl/math/v_math.h +++ b/pl/math/v_math.h @@ -184,6 +184,11 @@ v_calt_f32 (v_f32_t x, v_f32_t y) return fabsf (x) < fabsf (y); } static inline v_f32_t +v_div_f32 (v_f32_t x, v_f32_t y) +{ + return x / y; +} +static inline v_f32_t v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z) { return __builtin_fmaf (x, y, z); @@ -509,6 +514,11 @@ v_calt_f32 (v_f32_t x, v_f32_t y) return vcaltq_f32 (x, y); } static inline v_f32_t +v_div_f32 (v_f32_t x, v_f32_t y) +{ + return vdivq_f32 (x, y); +} +static inline v_f32_t v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z) { return vfmaq_f32 (z, x, y); diff --git a/pl/math/vn_atan2f_3u.c b/pl/math/vn_atan2f_3u.c new file mode 100644 index 0000000..23aad38 --- /dev/null +++ b/pl/math/vn_atan2f_3u.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_atan2f. + * + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_atan2f, _ZGVnN4vv_atan2f) +#include "v_atan2f_3u.c" +#endif |