diff options
author | Joe Ramsay <Joe.Ramsay@arm.com> | 2022-06-17 11:09:50 +0100 |
---|---|---|
committer | Joe Ramsay <joe.ramsay@arm.com> | 2022-06-17 11:09:50 +0100 |
commit | 570d607f2d53dc1d416ec8500487a9e261b15bb9 (patch) | |
tree | beb3f104d160ad85c1b080241036cd46c3d8f48e | |
parent | 47eb0a883fb82fcd394353920e2cca4d0a0ffe9d (diff) | |
download | arm-optimized-routines-570d607f2d53dc1d416ec8500487a9e261b15bb9.tar.gz |
pl/math: Add vector/Neon atanf
Successfully ran tests and benchmarks. New routine is accurate to 3 ulps.
-rw-r--r-- | pl/math/include/mathlib.h | 4 | ||||
-rw-r--r-- | pl/math/s_atanf_3u.c | 6 | ||||
-rw-r--r-- | pl/math/test/mathbench_funcs.h | 6 | ||||
-rwxr-xr-x | pl/math/test/runulp.sh | 13 | ||||
-rw-r--r-- | pl/math/test/ulp_funcs.h | 4 | ||||
-rw-r--r-- | pl/math/test/ulp_wrappers.h | 3 | ||||
-rw-r--r-- | pl/math/v_atanf_3u.c | 49 | ||||
-rw-r--r-- | pl/math/vn_atanf_3u.c | 12 |
8 files changed, 97 insertions(+), 0 deletions(-)
diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index baee70f..0b7a745 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -17,6 +17,7 @@ float log10f (float); double atan2 (double, double); double log10 (double); +float __s_atanf (float); float __s_atan2f (float, float); float __s_erfcf (float); float __s_erff (float); @@ -40,6 +41,7 @@ typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t; #endif /* Vector functions following the base PCS. */ +__f32x4_t __v_atanf (__f32x4_t); __f64x2_t __v_atan (__f64x2_t); __f32x4_t __v_atan2f (__f32x4_t, __f32x4_t); __f64x2_t __v_atan2 (__f64x2_t, __f64x2_t); @@ -54,6 +56,7 @@ __f64x2_t __v_log10 (__f64x2_t); #define __vpcs __attribute__((__aarch64_vector_pcs__)) /* Vector functions following the vector PCS. */ +__vpcs __f32x4_t __vn_atanf (__f32x4_t); __vpcs __f64x2_t __vn_atan (__f64x2_t); __vpcs __f32x4_t __vn_atan2f (__f32x4_t, __f32x4_t); __vpcs __f64x2_t __vn_atan2 (__f64x2_t, __f64x2_t); @@ -65,6 +68,7 @@ __vpcs __f32x4_t __vn_log10f (__f32x4_t); __vpcs __f64x2_t __vn_log10 (__f64x2_t); /* Vector functions following the vector PCS using ABI names. */ +__vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t); __vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t); __vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t); diff --git a/pl/math/s_atanf_3u.c b/pl/math/s_atanf_3u.c new file mode 100644 index 0000000..4e8a2f7 --- /dev/null +++ b/pl/math/s_atanf_3u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2021-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_atanf_3u.c" diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index f1455aa..0f8e0ca 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -5,6 +5,7 @@ * Copyright (c) 2022, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +F (atanf, -10.0, 10.0) {"atan2f", 'f', 0, -10.0, 10.0, {.f = atan2f_wrap}}, F (erfcf, -4.0, 10.0) F (erff, -4.0, 4.0) @@ -17,6 +18,7 @@ D (erfc, -6.0, 28.0) D (log10, 0.01, 11.1) #if WANT_VMATH +F (__s_atanf, -10.0, 10.0) D (__s_atan, -10.0, 10.0) {"__s_atan2f", 'f', 0, -10.0, 10.0, {.f = __s_atan2f_wrap}}, {"__s_atan2", 'd', 0, -10.0, 10.0, {.d = __s_atan2_wrap}}, @@ -27,6 +29,7 @@ D (__s_erfc, -6.0, 28.0) F (__s_log10f, 0.01, 11.1) D (__s_log10, 0.01, 11.1) #if __aarch64__ +VF (__v_atanf, -10.0, 10.0) VD (__v_atan, -10.0, 10.0) {"__v_atan2f", 'f', 'v', -10.0, 10.0, {.vf = __v_atan2f_wrap}}, {"__v_atan2", 'd', 'v', -10.0, 10.0, {.vd = __v_atan2_wrap}}, @@ -37,6 +40,9 @@ VD (__v_erfc, -6.0, 28.0) VD (__v_log10, 0.01, 11.1) VF (__v_log10f, 0.01, 11.1) #ifdef __vpcs +VNF (__vn_atanf, -10.0, 10.0) +VNF (_ZGVnN4v_atanf, -10.0, 10.0) + VND (__vn_atan, -10.0, 10.0) VND (_ZGVnN2v_atan, -10.0, 10.0) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 66ac3bf..674d718 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -169,6 +169,14 @@ range_atan2f=' 1e6 1e32 40000 ' +range_atanf=' + -10.0 10.0 50000 + -1.0 1.0 40000 + 0.0 1.0 40000 + 1.0 100.0 40000 + 1e6 1e32 40000 +' + # error limits L_erfc=3.7 L_erfcf=1.0 @@ -179,6 +187,7 @@ L_erff=1.5 L_atan2=2.9 L_atan=3.0 L_atan2f=3.0 +L_atanf=3.0 while read G F R do @@ -218,6 +227,10 @@ log10 __v_log10 $runv log10 __vn_log10 $runvn log10 _ZGVnN2v_log10 $runvn +atanf __s_atanf $runs +atanf __v_atanf $runv +atanf __vn_atanf $runvn +atanf _ZGVnN4v_atanf $runvn atan2f __s_atan2f $runs atan2f __v_atan2f $runv atan2f __vn_atan2f $runvn diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 5bc7411..40fae51 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -12,6 +12,7 @@ D2 (atan2) D1 (erfc) D1 (log10) #if WANT_VMATH +F (__s_atanf, __s_atanf, atan, mpfr_atan, 1, 1, f1, 0) F (__s_atan, __s_atan, atanl, 
mpfr_atan, 1, 0, d1, 0) F (__s_atan2f, __s_atan2f, atan2, mpfr_atan2, 2, 1, f2, 0) F (__s_atan2, __s_atan2, atan2l, mpfr_atan2, 2, 0, d2, 0) @@ -22,6 +23,7 @@ F (__s_erfc, __s_erfc, erfcl, mpfr_erfc, 1, 0, d1, 0) F (__s_log10f, __s_log10f, log10, mpfr_log10, 1, 1, f1, 0) F (__s_log10, __s_log10, log10l, mpfr_log10, 1, 0, d1, 0) #if __aarch64__ +F (__v_atanf, v_atanf, atan, mpfr_atan, 1, 1, f1, 1) F (__v_atan, v_atan, atanl, mpfr_atan, 1, 0, d1, 1) F (__v_atan2f, v_atan2f, atan2, mpfr_atan2, 2, 1, f2, 1) F (__v_atan2, v_atan2, atan2l, mpfr_atan2, 2, 0, d2, 1) @@ -32,6 +34,7 @@ F (__v_erfc, v_erfc, erfcl, mpfr_erfc, 1, 0, d1, 1) F (__v_log10f, v_log10f, log10, mpfr_log10, 1, 1, f1, 1) F (__v_log10, v_log10, log10l, mpfr_log10, 1, 0, d1, 1) #ifdef __vpcs +F (__vn_atanf, vn_atanf, atan, mpfr_atan, 1, 1, f1, 1) F (__vn_atan, vn_atan, atanl, mpfr_atan, 1, 0, d1, 1) F (__vn_atan2f, vn_atan2f, atan2, mpfr_atan2, 2, 1, f2, 1) F (__vn_atan2, vn_atan2, atan2l, mpfr_atan2, 2, 0, d2, 1) @@ -41,6 +44,7 @@ F (__vn_erfcf, vn_erfcf, erfc, mpfr_erfc, 1, 1, f1, 1) F (__vn_erfc, vn_erfc, erfcl, mpfr_erfc, 1, 0, d1, 1) F (__vn_log10f, vn_log10f, log10, mpfr_log10, 1, 1, f1, 1) F (__vn_log10, vn_log10, log10l, mpfr_log10, 1, 0, d1, 1) +F (_ZGVnN4v_atanf, Z_atanf, atan, mpfr_atan, 1, 1, f1, 1) F (_ZGVnN2v_atan, Z_atan, atanl, mpfr_atan, 1, 0, d1, 1) F (_ZGVnN4vv_atan2f, Z_atan2f, atan2, mpfr_atan2, 2, 1, f2, 1) F (_ZGVnN2vv_atan2, Z_atan2, atan2l, mpfr_atan2, 2, 0, d2, 1) diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index 0603e45..fa4ba4c 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -14,6 +14,7 @@ static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y, /* Wrappers for vector functions. 
*/ #if __aarch64__ && WANT_VMATH +static float v_atanf(float x) { return __v_atanf(argf(x))[0]; } static float v_atan2f(float x, float y) { return __v_atan2f(argf(x), argf(y))[0]; } static float v_erff(float x) { return __v_erff(argf(x))[0]; } static float v_erfcf(float x) { return __v_erfcf(argf(x))[0]; } @@ -24,6 +25,7 @@ static double v_erf(double x) { return __v_erf(argd(x))[0]; } static double v_erfc(double x) { return __v_erfc(argd(x))[0]; } static double v_log10(double x) { return __v_log10(argd(x))[0]; } #ifdef __vpcs +static float vn_atanf(float x) { return __vn_atanf(argf(x))[0]; } static float vn_atan2f(float x, float y) { return __vn_atan2f(argf(x), argf(y))[0]; } static float vn_erff(float x) { return __vn_erff(argf(x))[0]; } static float vn_erfcf(float x) { return __vn_erfcf(argf(x))[0]; } @@ -34,6 +36,7 @@ static double vn_erf(double x) { return __vn_erf(argd(x))[0]; } static double vn_erfc(double x) { return __vn_erfc(argd(x))[0]; } static double vn_log10(double x) { return __vn_log10(argd(x))[0]; } +static float Z_atanf(float x) { return _ZGVnN4v_atanf(argf(x))[0]; } static float Z_atan2f(float x, float y) { return _ZGVnN4vv_atan2f(argf(x), argf(y))[0]; } static float Z_erff(float x) { return _ZGVnN4v_erff(argf(x))[0]; } static float Z_erfcf(float x) { return _ZGVnN4v_erfcf(argf(x))[0]; } diff --git a/pl/math/v_atanf_3u.c b/pl/math/v_atanf_3u.c new file mode 100644 index 0000000..7c84244 --- /dev/null +++ b/pl/math/v_atanf_3u.c @@ -0,0 +1,49 @@ +/* + * Single-precision vector atan(x) function. + * + * Copyright (c) 2021-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#if V_SUPPORTED + +#include "atanf_common.h" + +#define PiOver2 v_f32 (0x1.921fb6p+0f) +#define AbsMask v_u32 (0x7fffffff) + +/* Fast implementation of vector atanf based on + atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] + using z=-1/x and shift = pi/2. 
Maximum observed error is 2.9ulps: + v_atanf(0x1.0468f6p+0) got 0x1.967f06p-1 want 0x1.967fp-1. */ +VPCS_ATTR +v_f32_t V_NAME (atanf) (v_f32_t x) +{ + /* No need to trigger special case. Small cases, infs and nans + are supported by our approximation technique. */ + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t sign = ix & ~AbsMask; + + /* Argument reduction: + y := arctan(x) for x < 1 + y := pi/2 + arctan(-1/x) for x > 1 + Hence, use z=-1/a if x>=1, otherwise z=a. */ + v_u32_t red = v_cagt_f32 (x, v_f32 (1.0)); + /* Avoid dependency in abs(x) in division (and comparison). */ + v_f32_t z = v_sel_f32 (red, v_div_f32 (v_f32 (-1.0f), x), x); + v_f32_t shift = v_sel_f32 (red, PiOver2, v_f32 (0.0f)); + /* Use absolute value only when needed (odd powers of z). */ + v_f32_t az = v_abs_f32 (z); + az = v_sel_f32 (red, -az, az); + + /* Calculate the polynomial approximation. */ + v_f32_t y = eval_poly (z, az, shift); + + /* y = atan(x) if x>0, -atan(-x) otherwise. */ + y = v_as_f32_u32 (v_as_u32_f32 (y) ^ sign); + + return y; +} +VPCS_ALIAS +#endif diff --git a/pl/math/vn_atanf_3u.c b/pl/math/vn_atanf_3u.c new file mode 100644 index 0000000..17ba6b8 --- /dev/null +++ b/pl/math/vn_atanf_3u.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_atanf. + * + * Copyright (c) 2021-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_atanf, _ZGVnN4v_atanf) +#include "v_atanf_3u.c" +#endif |