aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJoe Ramsay <Joe.Ramsay@arm.com>2022-06-17 11:09:50 +0100
committerJoe Ramsay <joe.ramsay@arm.com>2022-06-17 11:09:50 +0100
commit570d607f2d53dc1d416ec8500487a9e261b15bb9 (patch)
treebeb3f104d160ad85c1b080241036cd46c3d8f48e
parent47eb0a883fb82fcd394353920e2cca4d0a0ffe9d (diff)
downloadarm-optimized-routines-570d607f2d53dc1d416ec8500487a9e261b15bb9.tar.gz
pl/math: Add vector/Neon atanf
Successfully ran tests and benchmarks. New routine is accurate to 3 ulps.
-rw-r--r--pl/math/include/mathlib.h4
-rw-r--r--pl/math/s_atanf_3u.c6
-rw-r--r--pl/math/test/mathbench_funcs.h6
-rwxr-xr-xpl/math/test/runulp.sh13
-rw-r--r--pl/math/test/ulp_funcs.h4
-rw-r--r--pl/math/test/ulp_wrappers.h3
-rw-r--r--pl/math/v_atanf_3u.c49
-rw-r--r--pl/math/vn_atanf_3u.c12
8 files changed, 97 insertions, 0 deletions
diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h
index baee70f..0b7a745 100644
--- a/pl/math/include/mathlib.h
+++ b/pl/math/include/mathlib.h
@@ -17,6 +17,7 @@ float log10f (float);
double atan2 (double, double);
double log10 (double);
+float __s_atanf (float);
float __s_atan2f (float, float);
float __s_erfcf (float);
float __s_erff (float);
@@ -40,6 +41,7 @@ typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t;
#endif
/* Vector functions following the base PCS. */
+__f32x4_t __v_atanf (__f32x4_t);
__f64x2_t __v_atan (__f64x2_t);
__f32x4_t __v_atan2f (__f32x4_t, __f32x4_t);
__f64x2_t __v_atan2 (__f64x2_t, __f64x2_t);
@@ -54,6 +56,7 @@ __f64x2_t __v_log10 (__f64x2_t);
#define __vpcs __attribute__((__aarch64_vector_pcs__))
/* Vector functions following the vector PCS. */
+__vpcs __f32x4_t __vn_atanf (__f32x4_t);
__vpcs __f64x2_t __vn_atan (__f64x2_t);
__vpcs __f32x4_t __vn_atan2f (__f32x4_t, __f32x4_t);
__vpcs __f64x2_t __vn_atan2 (__f64x2_t, __f64x2_t);
@@ -65,6 +68,7 @@ __vpcs __f32x4_t __vn_log10f (__f32x4_t);
__vpcs __f64x2_t __vn_log10 (__f64x2_t);
/* Vector functions following the vector PCS using ABI names. */
+__vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t);
__vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t);
__vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t);
__vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t);
diff --git a/pl/math/s_atanf_3u.c b/pl/math/s_atanf_3u.c
new file mode 100644
index 0000000..4e8a2f7
--- /dev/null
+++ b/pl/math/s_atanf_3u.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2021-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_atanf_3u.c"
diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h
index f1455aa..0f8e0ca 100644
--- a/pl/math/test/mathbench_funcs.h
+++ b/pl/math/test/mathbench_funcs.h
@@ -5,6 +5,7 @@
* Copyright (c) 2022, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
+F (atanf, -10.0, 10.0)
{"atan2f", 'f', 0, -10.0, 10.0, {.f = atan2f_wrap}},
F (erfcf, -4.0, 10.0)
F (erff, -4.0, 4.0)
@@ -17,6 +18,7 @@ D (erfc, -6.0, 28.0)
D (log10, 0.01, 11.1)
#if WANT_VMATH
+F (__s_atanf, -10.0, 10.0)
D (__s_atan, -10.0, 10.0)
{"__s_atan2f", 'f', 0, -10.0, 10.0, {.f = __s_atan2f_wrap}},
{"__s_atan2", 'd', 0, -10.0, 10.0, {.d = __s_atan2_wrap}},
@@ -27,6 +29,7 @@ D (__s_erfc, -6.0, 28.0)
F (__s_log10f, 0.01, 11.1)
D (__s_log10, 0.01, 11.1)
#if __aarch64__
+VF (__v_atanf, -10.0, 10.0)
VD (__v_atan, -10.0, 10.0)
{"__v_atan2f", 'f', 'v', -10.0, 10.0, {.vf = __v_atan2f_wrap}},
{"__v_atan2", 'd', 'v', -10.0, 10.0, {.vd = __v_atan2_wrap}},
@@ -37,6 +40,9 @@ VD (__v_erfc, -6.0, 28.0)
VD (__v_log10, 0.01, 11.1)
VF (__v_log10f, 0.01, 11.1)
#ifdef __vpcs
+VNF (__vn_atanf, -10.0, 10.0)
+VNF (_ZGVnN4v_atanf, -10.0, 10.0)
+
VND (__vn_atan, -10.0, 10.0)
VND (_ZGVnN2v_atan, -10.0, 10.0)
diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh
index 66ac3bf..674d718 100755
--- a/pl/math/test/runulp.sh
+++ b/pl/math/test/runulp.sh
@@ -169,6 +169,14 @@ range_atan2f='
1e6 1e32 40000
'
+range_atanf='
+ -10.0 10.0 50000
+ -1.0 1.0 40000
+ 0.0 1.0 40000
+ 1.0 100.0 40000
+ 1e6 1e32 40000
+'
+
# error limits
L_erfc=3.7
L_erfcf=1.0
@@ -179,6 +187,7 @@ L_erff=1.5
L_atan2=2.9
L_atan=3.0
L_atan2f=3.0
+L_atanf=3.0
while read G F R
do
@@ -218,6 +227,10 @@ log10 __v_log10 $runv
log10 __vn_log10 $runvn
log10 _ZGVnN2v_log10 $runvn
+atanf __s_atanf $runs
+atanf __v_atanf $runv
+atanf __vn_atanf $runvn
+atanf _ZGVnN4v_atanf $runvn
atan2f __s_atan2f $runs
atan2f __v_atan2f $runv
atan2f __vn_atan2f $runvn
diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h
index 5bc7411..40fae51 100644
--- a/pl/math/test/ulp_funcs.h
+++ b/pl/math/test/ulp_funcs.h
@@ -12,6 +12,7 @@ D2 (atan2)
D1 (erfc)
D1 (log10)
#if WANT_VMATH
+F (__s_atanf, __s_atanf, atan, mpfr_atan, 1, 1, f1, 0)
F (__s_atan, __s_atan, atanl, mpfr_atan, 1, 0, d1, 0)
F (__s_atan2f, __s_atan2f, atan2, mpfr_atan2, 2, 1, f2, 0)
F (__s_atan2, __s_atan2, atan2l, mpfr_atan2, 2, 0, d2, 0)
@@ -22,6 +23,7 @@ F (__s_erfc, __s_erfc, erfcl, mpfr_erfc, 1, 0, d1, 0)
F (__s_log10f, __s_log10f, log10, mpfr_log10, 1, 1, f1, 0)
F (__s_log10, __s_log10, log10l, mpfr_log10, 1, 0, d1, 0)
#if __aarch64__
+F (__v_atanf, v_atanf, atan, mpfr_atan, 1, 1, f1, 1)
F (__v_atan, v_atan, atanl, mpfr_atan, 1, 0, d1, 1)
F (__v_atan2f, v_atan2f, atan2, mpfr_atan2, 2, 1, f2, 1)
F (__v_atan2, v_atan2, atan2l, mpfr_atan2, 2, 0, d2, 1)
@@ -32,6 +34,7 @@ F (__v_erfc, v_erfc, erfcl, mpfr_erfc, 1, 0, d1, 1)
F (__v_log10f, v_log10f, log10, mpfr_log10, 1, 1, f1, 1)
F (__v_log10, v_log10, log10l, mpfr_log10, 1, 0, d1, 1)
#ifdef __vpcs
+F (__vn_atanf, vn_atanf, atan, mpfr_atan, 1, 1, f1, 1)
F (__vn_atan, vn_atan, atanl, mpfr_atan, 1, 0, d1, 1)
F (__vn_atan2f, vn_atan2f, atan2, mpfr_atan2, 2, 1, f2, 1)
F (__vn_atan2, vn_atan2, atan2l, mpfr_atan2, 2, 0, d2, 1)
@@ -41,6 +44,7 @@ F (__vn_erfcf, vn_erfcf, erfc, mpfr_erfc, 1, 1, f1, 1)
F (__vn_erfc, vn_erfc, erfcl, mpfr_erfc, 1, 0, d1, 1)
F (__vn_log10f, vn_log10f, log10, mpfr_log10, 1, 1, f1, 1)
F (__vn_log10, vn_log10, log10l, mpfr_log10, 1, 0, d1, 1)
+F (_ZGVnN4v_atanf, Z_atanf, atan, mpfr_atan, 1, 1, f1, 1)
F (_ZGVnN2v_atan, Z_atan, atanl, mpfr_atan, 1, 0, d1, 1)
F (_ZGVnN4vv_atan2f, Z_atan2f, atan2, mpfr_atan2, 2, 1, f2, 1)
F (_ZGVnN2vv_atan2, Z_atan2, atan2l, mpfr_atan2, 2, 0, d2, 1)
diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h
index 0603e45..fa4ba4c 100644
--- a/pl/math/test/ulp_wrappers.h
+++ b/pl/math/test/ulp_wrappers.h
@@ -14,6 +14,7 @@ static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y,
/* Wrappers for vector functions. */
#if __aarch64__ && WANT_VMATH
+static float v_atanf(float x) { return __v_atanf(argf(x))[0]; }
static float v_atan2f(float x, float y) { return __v_atan2f(argf(x), argf(y))[0]; }
static float v_erff(float x) { return __v_erff(argf(x))[0]; }
static float v_erfcf(float x) { return __v_erfcf(argf(x))[0]; }
@@ -24,6 +25,7 @@ static double v_erf(double x) { return __v_erf(argd(x))[0]; }
static double v_erfc(double x) { return __v_erfc(argd(x))[0]; }
static double v_log10(double x) { return __v_log10(argd(x))[0]; }
#ifdef __vpcs
+static float vn_atanf(float x) { return __vn_atanf(argf(x))[0]; }
static float vn_atan2f(float x, float y) { return __vn_atan2f(argf(x), argf(y))[0]; }
static float vn_erff(float x) { return __vn_erff(argf(x))[0]; }
static float vn_erfcf(float x) { return __vn_erfcf(argf(x))[0]; }
@@ -34,6 +36,7 @@ static double vn_erf(double x) { return __vn_erf(argd(x))[0]; }
static double vn_erfc(double x) { return __vn_erfc(argd(x))[0]; }
static double vn_log10(double x) { return __vn_log10(argd(x))[0]; }
+static float Z_atanf(float x) { return _ZGVnN4v_atanf(argf(x))[0]; }
static float Z_atan2f(float x, float y) { return _ZGVnN4vv_atan2f(argf(x), argf(y))[0]; }
static float Z_erff(float x) { return _ZGVnN4v_erff(argf(x))[0]; }
static float Z_erfcf(float x) { return _ZGVnN4v_erfcf(argf(x))[0]; }
diff --git a/pl/math/v_atanf_3u.c b/pl/math/v_atanf_3u.c
new file mode 100644
index 0000000..7c84244
--- /dev/null
+++ b/pl/math/v_atanf_3u.c
@@ -0,0 +1,49 @@
+/*
+ * Single-precision vector atan(x) function.
+ *
+ * Copyright (c) 2021-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#if V_SUPPORTED
+
+#include "atanf_common.h"
+
+#define PiOver2 v_f32 (0x1.921fb6p+0f)
+#define AbsMask v_u32 (0x7fffffff)
+
+/* Fast implementation of vector atanf based on
+ atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1]
+ using z=-1/x and shift = pi/2. Maximum observed error is 2.9 ulps:
+ v_atanf(0x1.0468f6p+0) got 0x1.967f06p-1 want 0x1.967fp-1. */
+VPCS_ATTR
+v_f32_t V_NAME (atanf) (v_f32_t x)
+{
+ /* No need to trigger special case. Small cases, infs and nans
+ are supported by our approximation technique. */
+ v_u32_t ix = v_as_u32_f32 (x);
+ v_u32_t sign = ix & ~AbsMask;
+
+ /* Argument reduction:
+ y := arctan(x) for x < 1
+ y := pi/2 + arctan(-1/x) for x > 1
+ Hence, use z=-1/a if x>=1, otherwise z=a. */
+ v_u32_t red = v_cagt_f32 (x, v_f32 (1.0));
+ /* Avoid dependency in abs(x) in division (and comparison). */
+ v_f32_t z = v_sel_f32 (red, v_div_f32 (v_f32 (-1.0f), x), x);
+ v_f32_t shift = v_sel_f32 (red, PiOver2, v_f32 (0.0f));
+ /* Use absolute value only when needed (odd powers of z). */
+ v_f32_t az = v_abs_f32 (z);
+ az = v_sel_f32 (red, -az, az);
+
+ /* Calculate the polynomial approximation. */
+ v_f32_t y = eval_poly (z, az, shift);
+
+ /* y = atan(x) if x>0, -atan(-x) otherwise. */
+ y = v_as_f32_u32 (v_as_u32_f32 (y) ^ sign);
+
+ return y;
+}
+VPCS_ALIAS
+#endif
diff --git a/pl/math/vn_atanf_3u.c b/pl/math/vn_atanf_3u.c
new file mode 100644
index 0000000..17ba6b8
--- /dev/null
+++ b/pl/math/vn_atanf_3u.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_atanf.
+ *
+ * Copyright (c) 2021-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS strong_alias (__vn_atanf, _ZGVnN4v_atanf)
+#include "v_atanf_3u.c"
+#endif