diff options
-rw-r--r--  pl/math/include/mathlib.h        |  4 ++++
-rw-r--r--  pl/math/s_atan2f_3u.c            |  6 ++++++
-rw-r--r--  pl/math/test/mathbench_funcs.h   |  5 +++++
-rw-r--r--  pl/math/test/mathbench_wrappers.h | 24 ++++++++++++++++++++
-rwxr-xr-x  pl/math/test/runulp.sh           | 13 +++++++++++
-rw-r--r--  pl/math/test/ulp_funcs.h         |  4 ++++
-rw-r--r--  pl/math/test/ulp_wrappers.h      |  3 +++
-rw-r--r--  pl/math/v_atan2f_3u.c            | 78 ++++++++++++++++++++++++++++
-rw-r--r--  pl/math/v_math.h                 | 10 ++++++++
-rw-r--r--  pl/math/vn_atan2f_3u.c           | 12 ++++++++++
10 files changed, 159 insertions(+), 0 deletions(-)
diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index cda3371..baee70f 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -17,6 +17,7 @@ float log10f (float); double atan2 (double, double); double log10 (double); +float __s_atan2f (float, float); float __s_erfcf (float); float __s_erff (float); float __s_log10f (float); @@ -40,6 +41,7 @@ typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t; /* Vector functions following the base PCS. */ __f64x2_t __v_atan (__f64x2_t); +__f32x4_t __v_atan2f (__f32x4_t, __f32x4_t); __f64x2_t __v_atan2 (__f64x2_t, __f64x2_t); __f32x4_t __v_erff (__f32x4_t); __f64x2_t __v_erf (__f64x2_t); @@ -53,6 +55,7 @@ __f64x2_t __v_log10 (__f64x2_t); /* Vector functions following the vector PCS. */ __vpcs __f64x2_t __vn_atan (__f64x2_t); +__vpcs __f32x4_t __vn_atan2f (__f32x4_t, __f32x4_t); __vpcs __f64x2_t __vn_atan2 (__f64x2_t, __f64x2_t); __vpcs __f32x4_t __vn_erff (__f32x4_t); __vpcs __f64x2_t __vn_erf (__f64x2_t); @@ -63,6 +66,7 @@ __vpcs __f64x2_t __vn_log10 (__f64x2_t); /* Vector functions following the vector PCS using ABI names. */ __vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t); __vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t); __vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_erf (__f64x2_t); diff --git a/pl/math/s_atan2f_3u.c b/pl/math/s_atan2f_3u.c new file mode 100644 index 0000000..5002d32 --- /dev/null +++ b/pl/math/s_atan2f_3u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2021-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_atan2f_3u.c" diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index f2befc8..f1455aa 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -18,6 +18,7 @@ D (log10, 0.01, 11.1) #if WANT_VMATH D (__s_atan, -10.0, 10.0) +{"__s_atan2f", 'f', 0, -10.0, 10.0, {.f = __s_atan2f_wrap}}, {"__s_atan2", 'd', 0, -10.0, 10.0, {.d = __s_atan2_wrap}}, F (__s_erff, -4.0, 4.0) D (__s_erf, -6.0, 6.0) @@ -27,6 +28,7 @@ F (__s_log10f, 0.01, 11.1) D (__s_log10, 0.01, 11.1) #if __aarch64__ VD (__v_atan, -10.0, 10.0) +{"__v_atan2f", 'f', 'v', -10.0, 10.0, {.vf = __v_atan2f_wrap}}, {"__v_atan2", 'd', 'v', -10.0, 10.0, {.vd = __v_atan2_wrap}}, VF (__v_erff, -4.0, 4.0) VD (__v_erf, -6.0, 6.0) @@ -38,6 +40,9 @@ VF (__v_log10f, 0.01, 11.1) VND (__vn_atan, -10.0, 10.0) VND (_ZGVnN2v_atan, -10.0, 10.0) +{"__vn_atan2f", 'f', 'n', -10.0, 10.0, {.vnf = __vn_atan2f_wrap}}, +{"_ZGVnN4vv_atan2f", 'f', 'n', -10.0, 10.0, {.vnf = _Z_atan2f_wrap}}, + {"__vn_atan2", 'd', 'n', -10.0, 10.0, {.vnd = __vn_atan2_wrap}}, {"_ZGVnN2vv_atan2", 'd', 'n', -10.0, 10.0, {.vnd = _Z_atan2_wrap}}, diff --git a/pl/math/test/mathbench_wrappers.h b/pl/math/test/mathbench_wrappers.h index 57c0c8f..37edeae 100644 --- a/pl/math/test/mathbench_wrappers.h +++ b/pl/math/test/mathbench_wrappers.h @@ -26,12 +26,24 @@ __s_atan2_wrap (double x) return __s_atan2 (5.0, x); } +static float +__s_atan2f_wrap (float x) +{ + return __s_atan2f (5.0f, x); +} + static v_double __v_atan2_wrap (v_double x) { return __v_atan2 (v_double_dup (5.0), x); } +static v_float +__v_atan2f_wrap (v_float x) +{ + return __v_atan2f (v_float_dup (5.0f), x); +} + #ifdef __vpcs __vpcs static v_double @@ -40,12 +52,24 @@ __vn_atan2_wrap (v_double x) return __vn_atan2 (v_double_dup (5.0), x); } +__vpcs static v_float +__vn_atan2f_wrap (v_float x) +{ + return __vn_atan2f (v_float_dup (5.0f), x); +} + __vpcs 
static v_double _Z_atan2_wrap (v_double x) { return _ZGVnN2vv_atan2 (v_double_dup (5.0), x); } +__vpcs static v_float +_Z_atan2f_wrap (v_float x) +{ + return _ZGVnN4vv_atan2f (v_float_dup (5.0f), x); +} + #endif // __vpcs #endif // __arch64__ #endif // WANT_VMATH diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 7ad2318..66ac3bf 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -161,6 +161,14 @@ range_atan=' 1e6 1e32 40000 ' +range_atan2f=' + -10.0 10.0 50000 + -1.0 1.0 40000 + 0.0 1.0 40000 + 1.0 100.0 40000 + 1e6 1e32 40000 +' + # error limits L_erfc=3.7 L_erfcf=1.0 @@ -170,6 +178,7 @@ L_erf=1.76 L_erff=1.5 L_atan2=2.9 L_atan=3.0 +L_atan2f=3.0 while read G F R do @@ -209,6 +218,10 @@ log10 __v_log10 $runv log10 __vn_log10 $runvn log10 _ZGVnN2v_log10 $runvn +atan2f __s_atan2f $runs +atan2f __v_atan2f $runv +atan2f __vn_atan2f $runvn +atan2f _ZGVnN4vv_atan2f $runvn erff __s_erff $runs erff __v_erff $runv erff __vn_erff $runvn diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 244c96b..5bc7411 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -13,6 +13,7 @@ D1 (erfc) D1 (log10) #if WANT_VMATH F (__s_atan, __s_atan, atanl, mpfr_atan, 1, 0, d1, 0) +F (__s_atan2f, __s_atan2f, atan2, mpfr_atan2, 2, 1, f2, 0) F (__s_atan2, __s_atan2, atan2l, mpfr_atan2, 2, 0, d2, 0) F (__s_erff, __s_erff, erf, mpfr_erf, 1, 1, f1, 0) F (__s_erf, __s_erf, erfl, mpfr_erf, 1, 0, d1, 0) @@ -22,6 +23,7 @@ F (__s_log10f, __s_log10f, log10, mpfr_log10, 1, 1, f1, 0) F (__s_log10, __s_log10, log10l, mpfr_log10, 1, 0, d1, 0) #if __aarch64__ F (__v_atan, v_atan, atanl, mpfr_atan, 1, 0, d1, 1) +F (__v_atan2f, v_atan2f, atan2, mpfr_atan2, 2, 1, f2, 1) F (__v_atan2, v_atan2, atan2l, mpfr_atan2, 2, 0, d2, 1) F (__v_erff, v_erff, erf, mpfr_erf, 1, 1, f1, 1) F (__v_erf, v_erf, erfl, mpfr_erf, 1, 0, d1, 1) @@ -31,6 +33,7 @@ F (__v_log10f, v_log10f, log10, mpfr_log10, 1, 1, f1, 1) F (__v_log10, v_log10, log10l, mpfr_log10, 1, 0, 
d1, 1) #ifdef __vpcs F (__vn_atan, vn_atan, atanl, mpfr_atan, 1, 0, d1, 1) +F (__vn_atan2f, vn_atan2f, atan2, mpfr_atan2, 2, 1, f2, 1) F (__vn_atan2, vn_atan2, atan2l, mpfr_atan2, 2, 0, d2, 1) F (__vn_erff, vn_erff, erf, mpfr_erf, 1, 1, f1, 1) F (__vn_erf, vn_erf, erfl, mpfr_erf, 1, 0, d1, 1) @@ -39,6 +42,7 @@ F (__vn_erfc, vn_erfc, erfcl, mpfr_erfc, 1, 0, d1, 1) F (__vn_log10f, vn_log10f, log10, mpfr_log10, 1, 1, f1, 1) F (__vn_log10, vn_log10, log10l, mpfr_log10, 1, 0, d1, 1) F (_ZGVnN2v_atan, Z_atan, atanl, mpfr_atan, 1, 0, d1, 1) +F (_ZGVnN4vv_atan2f, Z_atan2f, atan2, mpfr_atan2, 2, 1, f2, 1) F (_ZGVnN2vv_atan2, Z_atan2, atan2l, mpfr_atan2, 2, 0, d2, 1) F (_ZGVnN4v_erff, Z_erff, erf, mpfr_erf, 1, 1, f1, 1) F (_ZGVnN2v_erf, Z_erf, erfl, mpfr_erf, 1, 0, d1, 1) diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index f1bfdf2..0603e45 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -14,6 +14,7 @@ static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y, /* Wrappers for vector functions. 
*/ #if __aarch64__ && WANT_VMATH +static float v_atan2f(float x, float y) { return __v_atan2f(argf(x), argf(y))[0]; } static float v_erff(float x) { return __v_erff(argf(x))[0]; } static float v_erfcf(float x) { return __v_erfcf(argf(x))[0]; } static float v_log10f(float x) { return __v_log10f(argf(x))[0]; } @@ -23,6 +24,7 @@ static double v_erf(double x) { return __v_erf(argd(x))[0]; } static double v_erfc(double x) { return __v_erfc(argd(x))[0]; } static double v_log10(double x) { return __v_log10(argd(x))[0]; } #ifdef __vpcs +static float vn_atan2f(float x, float y) { return __vn_atan2f(argf(x), argf(y))[0]; } static float vn_erff(float x) { return __vn_erff(argf(x))[0]; } static float vn_erfcf(float x) { return __vn_erfcf(argf(x))[0]; } static float vn_log10f(float x) { return __vn_log10f(argf(x))[0]; } @@ -32,6 +34,7 @@ static double vn_erf(double x) { return __vn_erf(argd(x))[0]; } static double vn_erfc(double x) { return __vn_erfc(argd(x))[0]; } static double vn_log10(double x) { return __vn_log10(argd(x))[0]; } +static float Z_atan2f(float x, float y) { return _ZGVnN4vv_atan2f(argf(x), argf(y))[0]; } static float Z_erff(float x) { return _ZGVnN4v_erff(argf(x))[0]; } static float Z_erfcf(float x) { return _ZGVnN4v_erfcf(argf(x))[0]; } static float Z_log10f(float x) { return _ZGVnN4v_log10f(argf(x))[0]; } diff --git a/pl/math/v_atan2f_3u.c b/pl/math/v_atan2f_3u.c new file mode 100644 index 0000000..4212351 --- /dev/null +++ b/pl/math/v_atan2f_3u.c @@ -0,0 +1,78 @@ +/* + * Single-precision vector atan2(x) function. + * + * Copyright (c) 2021-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#if V_SUPPORTED + +#include "atanf_common.h" + +/* Useful constants. */ +#define PiOver2 v_f32 (0x1.921fb6p+0f) +#define SignMask v_u32 (0x80000000) + +/* Special cases i.e. 0, infinity and nan (fall back to scalar calls). 
*/ +VPCS_ATTR +__attribute__ ((noinline)) static v_f32_t +specialcase (v_f32_t y, v_f32_t x, v_f32_t ret, v_u32_t cmp) +{ + return v_call2_f32 (atan2f, y, x, ret, cmp); +} + +/* Returns 1 if input is the bit representation of 0, infinity or nan. */ +static inline v_u32_t +zeroinfnan (v_u32_t i) +{ + return v_cond_u32 (2 * i - 1 >= v_u32 (2 * 0x7f800000lu - 1)); +} + +/* Fast implementation of vector atan2f. Maximum observed error is + 2.95 ULP in [0x1.9300d6p+6 0x1.93c0c6p+6] x [0x1.8c2dbp+6 0x1.8cea6p+6]: + v_atan2(0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1 + want 0x1.967f00p-1. */ +VPCS_ATTR +v_f32_t V_NAME (atan2f) (v_f32_t y, v_f32_t x) +{ + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t iy = v_as_u32_f32 (y); + + v_u32_t special_cases = zeroinfnan (ix) | zeroinfnan (iy); + + v_u32_t sign_x = ix & SignMask; + v_u32_t sign_y = iy & SignMask; + v_u32_t sign_xy = sign_x ^ sign_y; + + v_f32_t ax = v_abs_f32 (x); + v_f32_t ay = v_abs_f32 (y); + + v_u32_t pred_xlt0 = x < 0.0f; + v_u32_t pred_aygtax = ay > ax; + + /* Set up z for call to atanf. */ + v_f32_t n = v_sel_f32 (pred_aygtax, -ax, ay); + v_f32_t d = v_sel_f32 (pred_aygtax, ay, ax); + v_f32_t z = v_div_f32 (n, d); + + /* Work out the correct shift. */ + v_f32_t shift = v_sel_f32 (pred_xlt0, v_f32 (-2.0f), v_f32 (0.0f)); + shift = v_sel_f32 (pred_aygtax, shift + 1.0f, shift); + shift *= PiOver2; + + v_f32_t ret = eval_poly (z, z, shift); + + /* Account for the sign of y. 
*/ + ret = v_as_f32_u32 (v_as_u32_f32 (ret) ^ sign_xy); + + if (unlikely (v_any_u32 (special_cases))) + { + return specialcase (y, x, ret, special_cases); + } + + return ret; +} +VPCS_ALIAS + +#endif diff --git a/pl/math/v_math.h b/pl/math/v_math.h index 3733557..ddc5dab 100644 --- a/pl/math/v_math.h +++ b/pl/math/v_math.h @@ -184,6 +184,11 @@ v_calt_f32 (v_f32_t x, v_f32_t y) return fabsf (x) < fabsf (y); } static inline v_f32_t +v_div_f32 (v_f32_t x, v_f32_t y) +{ + return x / y; +} +static inline v_f32_t v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z) { return __builtin_fmaf (x, y, z); @@ -509,6 +514,11 @@ v_calt_f32 (v_f32_t x, v_f32_t y) return vcaltq_f32 (x, y); } static inline v_f32_t +v_div_f32 (v_f32_t x, v_f32_t y) +{ + return vdivq_f32 (x, y); +} +static inline v_f32_t v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z) { return vfmaq_f32 (z, x, y); diff --git a/pl/math/vn_atan2f_3u.c b/pl/math/vn_atan2f_3u.c new file mode 100644 index 0000000..23aad38 --- /dev/null +++ b/pl/math/vn_atan2f_3u.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_atan2f. + * + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_atan2f, _ZGVnN4vv_atan2f) +#include "v_atan2f_3u.c" +#endif |