aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--pl/math/include/mathlib.h4
-rw-r--r--pl/math/s_atan2f_3u.c6
-rw-r--r--pl/math/test/mathbench_funcs.h5
-rw-r--r--pl/math/test/mathbench_wrappers.h24
-rwxr-xr-xpl/math/test/runulp.sh13
-rw-r--r--pl/math/test/ulp_funcs.h4
-rw-r--r--pl/math/test/ulp_wrappers.h3
-rw-r--r--pl/math/v_atan2f_3u.c78
-rw-r--r--pl/math/v_math.h10
-rw-r--r--pl/math/vn_atan2f_3u.c12
10 files changed, 159 insertions, 0 deletions
diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h
index cda3371..baee70f 100644
--- a/pl/math/include/mathlib.h
+++ b/pl/math/include/mathlib.h
@@ -17,6 +17,7 @@ float log10f (float);
double atan2 (double, double);
double log10 (double);
+float __s_atan2f (float, float);
float __s_erfcf (float);
float __s_erff (float);
float __s_log10f (float);
@@ -40,6 +41,7 @@ typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t;
/* Vector functions following the base PCS. */
__f64x2_t __v_atan (__f64x2_t);
+__f32x4_t __v_atan2f (__f32x4_t, __f32x4_t);
__f64x2_t __v_atan2 (__f64x2_t, __f64x2_t);
__f32x4_t __v_erff (__f32x4_t);
__f64x2_t __v_erf (__f64x2_t);
@@ -53,6 +55,7 @@ __f64x2_t __v_log10 (__f64x2_t);
/* Vector functions following the vector PCS. */
__vpcs __f64x2_t __vn_atan (__f64x2_t);
+__vpcs __f32x4_t __vn_atan2f (__f32x4_t, __f32x4_t);
__vpcs __f64x2_t __vn_atan2 (__f64x2_t, __f64x2_t);
__vpcs __f32x4_t __vn_erff (__f32x4_t);
__vpcs __f64x2_t __vn_erf (__f64x2_t);
@@ -63,6 +66,7 @@ __vpcs __f64x2_t __vn_log10 (__f64x2_t);
/* Vector functions following the vector PCS using ABI names. */
__vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t);
+__vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t);
__vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t);
__vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t);
__vpcs __f64x2_t _ZGVnN2v_erf (__f64x2_t);
diff --git a/pl/math/s_atan2f_3u.c b/pl/math/s_atan2f_3u.c
new file mode 100644
index 0000000..5002d32
--- /dev/null
+++ b/pl/math/s_atan2f_3u.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2021-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_atan2f_3u.c"
diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h
index f2befc8..f1455aa 100644
--- a/pl/math/test/mathbench_funcs.h
+++ b/pl/math/test/mathbench_funcs.h
@@ -18,6 +18,7 @@ D (log10, 0.01, 11.1)
#if WANT_VMATH
D (__s_atan, -10.0, 10.0)
+{"__s_atan2f", 'f', 0, -10.0, 10.0, {.f = __s_atan2f_wrap}},
{"__s_atan2", 'd', 0, -10.0, 10.0, {.d = __s_atan2_wrap}},
F (__s_erff, -4.0, 4.0)
D (__s_erf, -6.0, 6.0)
@@ -27,6 +28,7 @@ F (__s_log10f, 0.01, 11.1)
D (__s_log10, 0.01, 11.1)
#if __aarch64__
VD (__v_atan, -10.0, 10.0)
+{"__v_atan2f", 'f', 'v', -10.0, 10.0, {.vf = __v_atan2f_wrap}},
{"__v_atan2", 'd', 'v', -10.0, 10.0, {.vd = __v_atan2_wrap}},
VF (__v_erff, -4.0, 4.0)
VD (__v_erf, -6.0, 6.0)
@@ -38,6 +40,9 @@ VF (__v_log10f, 0.01, 11.1)
VND (__vn_atan, -10.0, 10.0)
VND (_ZGVnN2v_atan, -10.0, 10.0)
+{"__vn_atan2f", 'f', 'n', -10.0, 10.0, {.vnf = __vn_atan2f_wrap}},
+{"_ZGVnN4vv_atan2f", 'f', 'n', -10.0, 10.0, {.vnf = _Z_atan2f_wrap}},
+
{"__vn_atan2", 'd', 'n', -10.0, 10.0, {.vnd = __vn_atan2_wrap}},
{"_ZGVnN2vv_atan2", 'd', 'n', -10.0, 10.0, {.vnd = _Z_atan2_wrap}},
diff --git a/pl/math/test/mathbench_wrappers.h b/pl/math/test/mathbench_wrappers.h
index 57c0c8f..37edeae 100644
--- a/pl/math/test/mathbench_wrappers.h
+++ b/pl/math/test/mathbench_wrappers.h
@@ -26,12 +26,24 @@ __s_atan2_wrap (double x)
return __s_atan2 (5.0, x);
}
+static float
+__s_atan2f_wrap (float x)
+{
+ return __s_atan2f (5.0f, x);
+}
+
static v_double
__v_atan2_wrap (v_double x)
{
return __v_atan2 (v_double_dup (5.0), x);
}
+static v_float
+__v_atan2f_wrap (v_float x)
+{
+ return __v_atan2f (v_float_dup (5.0f), x);
+}
+
#ifdef __vpcs
__vpcs static v_double
@@ -40,12 +52,24 @@ __vn_atan2_wrap (v_double x)
return __vn_atan2 (v_double_dup (5.0), x);
}
+__vpcs static v_float
+__vn_atan2f_wrap (v_float x)
+{
+ return __vn_atan2f (v_float_dup (5.0f), x);
+}
+
__vpcs static v_double
_Z_atan2_wrap (v_double x)
{
return _ZGVnN2vv_atan2 (v_double_dup (5.0), x);
}
+__vpcs static v_float
+_Z_atan2f_wrap (v_float x)
+{
+ return _ZGVnN4vv_atan2f (v_float_dup (5.0f), x);
+}
+
#endif // __vpcs
#endif // __arch64__
#endif // WANT_VMATH
diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh
index 7ad2318..66ac3bf 100755
--- a/pl/math/test/runulp.sh
+++ b/pl/math/test/runulp.sh
@@ -161,6 +161,14 @@ range_atan='
1e6 1e32 40000
'
+range_atan2f='
+ -10.0 10.0 50000
+ -1.0 1.0 40000
+ 0.0 1.0 40000
+ 1.0 100.0 40000
+ 1e6 1e32 40000
+'
+
# error limits
L_erfc=3.7
L_erfcf=1.0
@@ -170,6 +178,7 @@ L_erf=1.76
L_erff=1.5
L_atan2=2.9
L_atan=3.0
+L_atan2f=3.0
while read G F R
do
@@ -209,6 +218,10 @@ log10 __v_log10 $runv
log10 __vn_log10 $runvn
log10 _ZGVnN2v_log10 $runvn
+atan2f __s_atan2f $runs
+atan2f __v_atan2f $runv
+atan2f __vn_atan2f $runvn
+atan2f _ZGVnN4vv_atan2f $runvn
erff __s_erff $runs
erff __v_erff $runv
erff __vn_erff $runvn
diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h
index 244c96b..5bc7411 100644
--- a/pl/math/test/ulp_funcs.h
+++ b/pl/math/test/ulp_funcs.h
@@ -13,6 +13,7 @@ D1 (erfc)
D1 (log10)
#if WANT_VMATH
F (__s_atan, __s_atan, atanl, mpfr_atan, 1, 0, d1, 0)
+F (__s_atan2f, __s_atan2f, atan2, mpfr_atan2, 2, 1, f2, 0)
F (__s_atan2, __s_atan2, atan2l, mpfr_atan2, 2, 0, d2, 0)
F (__s_erff, __s_erff, erf, mpfr_erf, 1, 1, f1, 0)
F (__s_erf, __s_erf, erfl, mpfr_erf, 1, 0, d1, 0)
@@ -22,6 +23,7 @@ F (__s_log10f, __s_log10f, log10, mpfr_log10, 1, 1, f1, 0)
F (__s_log10, __s_log10, log10l, mpfr_log10, 1, 0, d1, 0)
#if __aarch64__
F (__v_atan, v_atan, atanl, mpfr_atan, 1, 0, d1, 1)
+F (__v_atan2f, v_atan2f, atan2, mpfr_atan2, 2, 1, f2, 1)
F (__v_atan2, v_atan2, atan2l, mpfr_atan2, 2, 0, d2, 1)
F (__v_erff, v_erff, erf, mpfr_erf, 1, 1, f1, 1)
F (__v_erf, v_erf, erfl, mpfr_erf, 1, 0, d1, 1)
@@ -31,6 +33,7 @@ F (__v_log10f, v_log10f, log10, mpfr_log10, 1, 1, f1, 1)
F (__v_log10, v_log10, log10l, mpfr_log10, 1, 0, d1, 1)
#ifdef __vpcs
F (__vn_atan, vn_atan, atanl, mpfr_atan, 1, 0, d1, 1)
+F (__vn_atan2f, vn_atan2f, atan2, mpfr_atan2, 2, 1, f2, 1)
F (__vn_atan2, vn_atan2, atan2l, mpfr_atan2, 2, 0, d2, 1)
F (__vn_erff, vn_erff, erf, mpfr_erf, 1, 1, f1, 1)
F (__vn_erf, vn_erf, erfl, mpfr_erf, 1, 0, d1, 1)
@@ -39,6 +42,7 @@ F (__vn_erfc, vn_erfc, erfcl, mpfr_erfc, 1, 0, d1, 1)
F (__vn_log10f, vn_log10f, log10, mpfr_log10, 1, 1, f1, 1)
F (__vn_log10, vn_log10, log10l, mpfr_log10, 1, 0, d1, 1)
F (_ZGVnN2v_atan, Z_atan, atanl, mpfr_atan, 1, 0, d1, 1)
+F (_ZGVnN4vv_atan2f, Z_atan2f, atan2, mpfr_atan2, 2, 1, f2, 1)
F (_ZGVnN2vv_atan2, Z_atan2, atan2l, mpfr_atan2, 2, 0, d2, 1)
F (_ZGVnN4v_erff, Z_erff, erf, mpfr_erf, 1, 1, f1, 1)
F (_ZGVnN2v_erf, Z_erf, erfl, mpfr_erf, 1, 0, d1, 1)
diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h
index f1bfdf2..0603e45 100644
--- a/pl/math/test/ulp_wrappers.h
+++ b/pl/math/test/ulp_wrappers.h
@@ -14,6 +14,7 @@ static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y,
/* Wrappers for vector functions. */
#if __aarch64__ && WANT_VMATH
+static float v_atan2f(float x, float y) { return __v_atan2f(argf(x), argf(y))[0]; }
static float v_erff(float x) { return __v_erff(argf(x))[0]; }
static float v_erfcf(float x) { return __v_erfcf(argf(x))[0]; }
static float v_log10f(float x) { return __v_log10f(argf(x))[0]; }
@@ -23,6 +24,7 @@ static double v_erf(double x) { return __v_erf(argd(x))[0]; }
static double v_erfc(double x) { return __v_erfc(argd(x))[0]; }
static double v_log10(double x) { return __v_log10(argd(x))[0]; }
#ifdef __vpcs
+static float vn_atan2f(float x, float y) { return __vn_atan2f(argf(x), argf(y))[0]; }
static float vn_erff(float x) { return __vn_erff(argf(x))[0]; }
static float vn_erfcf(float x) { return __vn_erfcf(argf(x))[0]; }
static float vn_log10f(float x) { return __vn_log10f(argf(x))[0]; }
@@ -32,6 +34,7 @@ static double vn_erf(double x) { return __vn_erf(argd(x))[0]; }
static double vn_erfc(double x) { return __vn_erfc(argd(x))[0]; }
static double vn_log10(double x) { return __vn_log10(argd(x))[0]; }
+static float Z_atan2f(float x, float y) { return _ZGVnN4vv_atan2f(argf(x), argf(y))[0]; }
static float Z_erff(float x) { return _ZGVnN4v_erff(argf(x))[0]; }
static float Z_erfcf(float x) { return _ZGVnN4v_erfcf(argf(x))[0]; }
static float Z_log10f(float x) { return _ZGVnN4v_log10f(argf(x))[0]; }
diff --git a/pl/math/v_atan2f_3u.c b/pl/math/v_atan2f_3u.c
new file mode 100644
index 0000000..4212351
--- /dev/null
+++ b/pl/math/v_atan2f_3u.c
@@ -0,0 +1,78 @@
+/*
+ * Single-precision vector atan2(x) function.
+ *
+ * Copyright (c) 2021-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#if V_SUPPORTED
+
+#include "atanf_common.h"
+
+/* Useful constants. */
+#define PiOver2 v_f32 (0x1.921fb6p+0f)
+#define SignMask v_u32 (0x80000000)
+
+/* Special cases i.e. 0, infinity and nan (fall back to scalar calls). */
+VPCS_ATTR
+__attribute__ ((noinline)) static v_f32_t
+specialcase (v_f32_t y, v_f32_t x, v_f32_t ret, v_u32_t cmp)
+{
+ return v_call2_f32 (atan2f, y, x, ret, cmp);
+}
+
+/* Returns 1 if input is the bit representation of 0, infinity or nan. */
+static inline v_u32_t
+zeroinfnan (v_u32_t i)
+{
+ return v_cond_u32 (2 * i - 1 >= v_u32 (2 * 0x7f800000lu - 1));
+}
+
+/* Fast implementation of vector atan2f. Maximum observed error is
+ 2.95 ULP in [0x1.9300d6p+6 0x1.93c0c6p+6] x [0x1.8c2dbp+6 0x1.8cea6p+6]:
+ v_atan2(0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1
+ want 0x1.967f00p-1. */
+VPCS_ATTR
+v_f32_t V_NAME (atan2f) (v_f32_t y, v_f32_t x)
+{
+ v_u32_t ix = v_as_u32_f32 (x);
+ v_u32_t iy = v_as_u32_f32 (y);
+
+ v_u32_t special_cases = zeroinfnan (ix) | zeroinfnan (iy);
+
+ v_u32_t sign_x = ix & SignMask;
+ v_u32_t sign_y = iy & SignMask;
+ v_u32_t sign_xy = sign_x ^ sign_y;
+
+ v_f32_t ax = v_abs_f32 (x);
+ v_f32_t ay = v_abs_f32 (y);
+
+ v_u32_t pred_xlt0 = x < 0.0f;
+ v_u32_t pred_aygtax = ay > ax;
+
+ /* Set up z for call to atanf. */
+ v_f32_t n = v_sel_f32 (pred_aygtax, -ax, ay);
+ v_f32_t d = v_sel_f32 (pred_aygtax, ay, ax);
+ v_f32_t z = v_div_f32 (n, d);
+
+ /* Work out the correct shift. */
+ v_f32_t shift = v_sel_f32 (pred_xlt0, v_f32 (-2.0f), v_f32 (0.0f));
+ shift = v_sel_f32 (pred_aygtax, shift + 1.0f, shift);
+ shift *= PiOver2;
+
+ v_f32_t ret = eval_poly (z, z, shift);
+
+ /* Account for the sign of y. */
+ ret = v_as_f32_u32 (v_as_u32_f32 (ret) ^ sign_xy);
+
+ if (unlikely (v_any_u32 (special_cases)))
+ {
+ return specialcase (y, x, ret, special_cases);
+ }
+
+ return ret;
+}
+VPCS_ALIAS
+
+#endif
diff --git a/pl/math/v_math.h b/pl/math/v_math.h
index 3733557..ddc5dab 100644
--- a/pl/math/v_math.h
+++ b/pl/math/v_math.h
@@ -184,6 +184,11 @@ v_calt_f32 (v_f32_t x, v_f32_t y)
return fabsf (x) < fabsf (y);
}
static inline v_f32_t
+v_div_f32 (v_f32_t x, v_f32_t y)
+{
+ return x / y;
+}
+static inline v_f32_t
v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z)
{
return __builtin_fmaf (x, y, z);
@@ -509,6 +514,11 @@ v_calt_f32 (v_f32_t x, v_f32_t y)
return vcaltq_f32 (x, y);
}
static inline v_f32_t
+v_div_f32 (v_f32_t x, v_f32_t y)
+{
+ return vdivq_f32 (x, y);
+}
+static inline v_f32_t
v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z)
{
return vfmaq_f32 (z, x, y);
diff --git a/pl/math/vn_atan2f_3u.c b/pl/math/vn_atan2f_3u.c
new file mode 100644
index 0000000..23aad38
--- /dev/null
+++ b/pl/math/vn_atan2f_3u.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_atan2f.
+ *
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS strong_alias (__vn_atan2f, _ZGVnN4vv_atan2f)
+#include "v_atan2f_3u.c"
+#endif