From c5cba8528da13fe0d647dbd0f80d0cf21434b224 Mon Sep 17 00:00:00 2001 From: Szabolcs Nagy Date: Fri, 9 Aug 2019 15:39:09 +0100 Subject: Add vector sinf and cosf The polynomials were produced by searching the coefficient space using heuristics and ideas from https://arxiv.org/abs/1508.03211 The worst-case error is 1.886 ulp. Large inputs (|x| >= 2^20) and other special cases fall back to scalar sinf and cosf. --- math/include/mathlib.h | 8 ++++++ math/s_cosf.c | 6 ++++ math/s_sinf.c | 6 ++++ math/test/mathbench.c | 8 ++++++ math/test/runulp.sh | 19 +++++++++++++ math/test/ulp.c | 14 ++++++++++ math/v_cosf.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++ math/v_sinf.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++ math/vn_cosf.c | 12 ++++++++ math/vn_sinf.c | 12 ++++++++ 10 files changed, 236 insertions(+) create mode 100644 math/s_cosf.c create mode 100644 math/s_sinf.c create mode 100644 math/v_cosf.c create mode 100644 math/v_sinf.c create mode 100644 math/vn_cosf.c create mode 100644 math/vn_sinf.c diff --git a/math/include/mathlib.h b/math/include/mathlib.h index bacd2b6..405cf4a 100644 --- a/math/include/mathlib.h +++ b/math/include/mathlib.h @@ -24,6 +24,8 @@ double log2 (double); double pow (double, double); /* Scalar functions using the vector algorithm with identical result. */ +float __s_sinf (float); +float __s_cosf (float); float __s_expf (float); float __s_expf_1u (float); float __s_logf (float); @@ -41,6 +43,8 @@ typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t; #endif /* Vector functions following the base PCS. */ +__f32x4_t __v_sinf (__f32x4_t); +__f32x4_t __v_cosf (__f32x4_t); __f32x4_t __v_expf (__f32x4_t); __f32x4_t __v_expf_1u (__f32x4_t); __f32x4_t __v_logf (__f32x4_t); @@ -50,12 +54,16 @@ __f64x2_t __v_exp (__f64x2_t); #define __vpcs __attribute__((__aarch64_vector_pcs__)) /* Vector functions following the vector PCS. 
*/ +__vpcs __f32x4_t __vn_sinf (__f32x4_t); +__vpcs __f32x4_t __vn_cosf (__f32x4_t); __vpcs __f32x4_t __vn_expf (__f32x4_t); __vpcs __f32x4_t __vn_expf_1u (__f32x4_t); __vpcs __f32x4_t __vn_logf (__f32x4_t); __vpcs __f64x2_t __vn_exp (__f64x2_t); /* Vector functions following the vector PCS using ABI names. */ +__vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t); +__vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t); diff --git a/math/s_cosf.c b/math/s_cosf.c new file mode 100644 index 0000000..914c02e --- /dev/null +++ b/math/s_cosf.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#define SCALAR 1 +#include "v_cosf.c" diff --git a/math/s_sinf.c b/math/s_sinf.c new file mode 100644 index 0000000..68ca908 --- /dev/null +++ b/math/s_sinf.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#define SCALAR 1 +#include "v_sinf.c" diff --git a/math/test/mathbench.c b/math/test/mathbench.c index e4d4a8b..7544a7e 100644 --- a/math/test/mathbench.c +++ b/math/test/mathbench.c @@ -216,12 +216,14 @@ F (sinf, -3.1, 3.1) F (sinf, 3.3, 33.3) F (sinf, 100, 1000) F (sinf, 1e6, 1e32) +F (__s_sinf, -3.1, 3.1) F (cosf, 0.1, 0.7) F (cosf, 0.8, 3.1) F (cosf, -3.1, 3.1) F (cosf, 3.3, 33.3) F (cosf, 100, 1000) F (cosf, 1e6, 1e32) +F (__s_cosf, -3.1, 3.1) #if __aarch64__ VD (__v_dummy, 1.0, 2.0) VD (__v_exp, -9.9, 9.9) @@ -229,6 +231,8 @@ VF (__v_dummyf, 1.0, 2.0) VF (__v_expf, -9.9, 9.9) VF (__v_expf_1u, -9.9, 9.9) VF (__v_logf, 0.01, 11.1) +VF (__v_sinf, -3.1, 3.1) +VF (__v_cosf, -3.1, 3.1) #ifdef __vpcs VND (__vn_dummy, 1.0, 2.0) VND (__vn_exp, -9.9, 9.9) @@ -239,6 +243,10 @@ VNF (_ZGVnN4v_expf, -9.9, 9.9) VNF (__vn_expf_1u, -9.9, 9.9) VNF (__vn_logf, 0.01, 11.1) VNF (_ZGVnN4v_logf, 0.01, 11.1) +VNF (__vn_sinf, -3.1, 3.1) +VNF (_ZGVnN4v_sinf, -3.1, 3.1) +VNF (__vn_cosf, -3.1, 3.1) 
+VNF (_ZGVnN4v_cosf, -3.1, 3.1) #endif #endif {0}, diff --git a/math/test/runulp.sh b/math/test/runulp.sh index 81716fa..efa9269 100755 --- a/math/test/runulp.sh +++ b/math/test/runulp.sh @@ -121,11 +121,20 @@ range_logf=' 0x1p-4 0x1p4 500000 ' +range_sinf=' + 0 0xffff0000 10000 + 0x1p-4 0x1p4 300000 +-0x1p-9 -0x1p9 300000 +' +range_cosf="$range_sinf" + # error limits L_exp=1.9 L_expf=1.49 L_expf_1u=0.4 L_logf=2.9 +L_sinf=1.4 +L_cosf=1.4 # group symbol run echo " @@ -148,6 +157,16 @@ logf __v_logf $runv logf __vn_logf $runvn logf _ZGVnN4v_logf $runvn +sinf __s_sinf 1 +sinf __v_sinf $runv +sinf __vn_sinf $runvn +sinf _ZGVnN4v_sinf $runvn + +cosf __s_cosf 1 +cosf __v_cosf $runv +cosf __vn_cosf $runvn +cosf _ZGVnN4v_cosf $runvn + " | while read G F R do [ "$R" = 1 ] || continue diff --git a/math/test/ulp.c b/math/test/ulp.c index 2ffba9b..4eb9d85 100644 --- a/math/test/ulp.c +++ b/math/test/ulp.c @@ -223,15 +223,21 @@ static const double dv[2] = {1.0, -INFINITY}; static inline v_float argf(float x) { return (v_float){x,x,x,fv[secondcall]}; } static inline v_double argd(double x) { return (v_double){x,dv[secondcall]}; } +static float v_sinf(float x) { return __v_sinf(argf(x))[0]; } +static float v_cosf(float x) { return __v_cosf(argf(x))[0]; } static float v_expf_1u(float x) { return __v_expf_1u(argf(x))[0]; } static float v_expf(float x) { return __v_expf(argf(x))[0]; } static float v_logf(float x) { return __v_logf(argf(x))[0]; } static double v_exp(double x) { return __v_exp(argd(x))[0]; } #ifdef __vpcs +static float vn_sinf(float x) { return __vn_sinf(argf(x))[0]; } +static float vn_cosf(float x) { return __vn_cosf(argf(x))[0]; } static float vn_expf_1u(float x) { return __vn_expf_1u(argf(x))[0]; } static float vn_expf(float x) { return __vn_expf(argf(x))[0]; } static float vn_logf(float x) { return __vn_logf(argf(x))[0]; } static double vn_exp(double x) { return __vn_exp(argd(x))[0]; } +static float Z_sinf(float x) { return _ZGVnN4v_sinf(argf(x))[0]; } +static 
float Z_cosf(float x) { return _ZGVnN4v_cosf(argf(x))[0]; } static float Z_expf(float x) { return _ZGVnN4v_expf(argf(x))[0]; } static float Z_logf(float x) { return _ZGVnN4v_logf(argf(x))[0]; } static double Z_exp(double x) { return _ZGVnN2v_exp(argd(x))[0]; } @@ -293,20 +299,28 @@ static const struct fun fun[] = { D1 (log) D1 (log2) D2 (pow) + F (__s_sinf, __s_sinf, sin, mpfr_sin, 1, 1, f1, 0) + F (__s_cosf, __s_cosf, cos, mpfr_cos, 1, 1, f1, 0) F (__s_expf_1u, __s_expf_1u, exp, mpfr_exp, 1, 1, f1, 0) F (__s_expf, __s_expf, exp, mpfr_exp, 1, 1, f1, 0) F (__s_logf, __s_logf, log, mpfr_log, 1, 1, f1, 0) F (__s_exp, __s_exp, expl, mpfr_exp, 1, 0, d1, 0) #if __aarch64__ + F (__v_sinf, v_sinf, sin, mpfr_sin, 1, 1, f1, 1) + F (__v_cosf, v_cosf, cos, mpfr_cos, 1, 1, f1, 1) F (__v_expf_1u, v_expf_1u, exp, mpfr_exp, 1, 1, f1, 1) F (__v_expf, v_expf, exp, mpfr_exp, 1, 1, f1, 1) F (__v_logf, v_logf, log, mpfr_log, 1, 1, f1, 1) F (__v_exp, v_exp, expl, mpfr_exp, 1, 0, d1, 1) #ifdef __vpcs + F (__vn_sinf, vn_sinf, sin, mpfr_sin, 1, 1, f1, 1) + F (__vn_cosf, vn_cosf, cos, mpfr_cos, 1, 1, f1, 1) F (__vn_expf_1u, vn_expf_1u, exp, mpfr_exp, 1, 1, f1, 1) F (__vn_expf, vn_expf, exp, mpfr_exp, 1, 1, f1, 1) F (__vn_logf, vn_logf, log, mpfr_log, 1, 1, f1, 1) F (__vn_exp, vn_exp, expl, mpfr_exp, 1, 0, d1, 1) + F (_ZGVnN4v_sinf, Z_sinf, sin, mpfr_sin, 1, 1, f1, 1) + F (_ZGVnN4v_cosf, Z_cosf, cos, mpfr_cos, 1, 1, f1, 1) F (_ZGVnN4v_expf, Z_expf, exp, mpfr_exp, 1, 1, f1, 1) F (_ZGVnN4v_logf, Z_logf, log, mpfr_log, 1, 1, f1, 1) F (_ZGVnN2v_exp, Z_exp, expl, mpfr_exp, 1, 0, d1, 1) diff --git a/math/v_cosf.c b/math/v_cosf.c new file mode 100644 index 0000000..150294b --- /dev/null +++ b/math/v_cosf.c @@ -0,0 +1,76 @@ +/* + * Single-precision vector cos function. + * + * Copyright (c) 2019, Arm Limited. 
+ * SPDX-License-Identifier: MIT + */ + +#include "mathlib.h" +#include "v_math.h" +#if V_SUPPORTED + +static const float Poly[] = { + /* 1.886 ulp error */ + 0x1.5b2e76p-19f, + -0x1.9f42eap-13f, + 0x1.110df4p-7f, + -0x1.555548p-3f, +}; +#define Pi1 v_f32 (0x1.921fb6p+1f) +#define Pi2 v_f32 (-0x1.777a5cp-24f) +#define Pi3 v_f32 (-0x1.ee59dap-49f) +#define A3 v_f32 (Poly[3]) +#define A5 v_f32 (Poly[2]) +#define A7 v_f32 (Poly[1]) +#define A9 v_f32 (Poly[0]) +#define RangeVal v_f32 (0x1p20f) +#define InvPi v_f32 (0x1.45f306p-2f) +#define Shift v_f32 (0x1.8p+23f) +#define AbsMask v_u32 (0x7fffffff) +#define HalfPi v_f32 (0x1.921fb6p0f) + +VPCS_ATTR +static v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) +{ + /* Fall back to scalar code. */ + return v_call_f32 (cosf, x, y, cmp); +} + +VPCS_ATTR +v_f32_t +V_NAME(cosf) (v_f32_t x) +{ + v_f32_t n, r, r2, y; + v_u32_t odd, cmp; + + r = v_as_f32_u32 (v_as_u32_f32 (x) & AbsMask); + cmp = v_cond_u32 (v_as_u32_f32 (r) >= v_as_u32_f32 (RangeVal)); + + /* n = rint((|x|+pi/2)/pi) - 0.5 */ + n = v_fma_f32 (InvPi, r + HalfPi, Shift); + odd = v_as_u32_f32 (n) << 31; + n -= Shift; + n -= v_f32 (0.5f); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */ + r = v_fma_f32 (-Pi1, n, r); + r = v_fma_f32 (-Pi2, n, r); + r = v_fma_f32 (-Pi3, n, r); + + /* y = sin(r) */ + r2 = r * r; + y = v_fma_f32 (A9, r2, A7); + y = v_fma_f32 (y, r2, A5); + y = v_fma_f32 (y, r2, A3); + y = v_fma_f32 (y * r2, r, r); + + /* sign fix */ + y = v_as_f32_u32 (v_as_u32_f32 (y) ^ odd); + + if (unlikely (v_any_u32 (cmp))) + return specialcase (x, y, cmp); + return y; +} +VPCS_ALIAS +#endif diff --git a/math/v_sinf.c b/math/v_sinf.c new file mode 100644 index 0000000..e66bfce --- /dev/null +++ b/math/v_sinf.c @@ -0,0 +1,75 @@ +/* + * Single-precision vector sin function. + * + * Copyright (c) 2019, Arm Limited. 
+ * SPDX-License-Identifier: MIT + */ + +#include "mathlib.h" +#include "v_math.h" +#if V_SUPPORTED + +static const float Poly[] = { + /* 1.886 ulp error */ + 0x1.5b2e76p-19f, + -0x1.9f42eap-13f, + 0x1.110df4p-7f, + -0x1.555548p-3f, +}; +#define Pi1 v_f32 (0x1.921fb6p+1f) +#define Pi2 v_f32 (-0x1.777a5cp-24f) +#define Pi3 v_f32 (-0x1.ee59dap-49f) +#define A3 v_f32 (Poly[3]) +#define A5 v_f32 (Poly[2]) +#define A7 v_f32 (Poly[1]) +#define A9 v_f32 (Poly[0]) +#define RangeVal v_f32 (0x1p20f) +#define InvPi v_f32 (0x1.45f306p-2f) +#define Shift v_f32 (0x1.8p+23f) +#define AbsMask v_u32 (0x7fffffff) + +VPCS_ATTR +static v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) +{ + /* Fall back to scalar code. */ + return v_call_f32 (sinf, x, y, cmp); +} + +VPCS_ATTR +v_f32_t +V_NAME(sinf) (v_f32_t x) +{ + v_f32_t n, r, r2, y; + v_u32_t sign, odd, cmp; + + r = v_as_f32_u32 (v_as_u32_f32 (x) & AbsMask); + sign = v_as_u32_f32 (x) & ~AbsMask; + cmp = v_cond_u32 (v_as_u32_f32 (r) >= v_as_u32_f32 (RangeVal)); + + /* n = rint(|x|/pi) */ + n = v_fma_f32 (InvPi, r, Shift); + odd = v_as_u32_f32 (n) << 31; + n -= Shift; + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */ + r = v_fma_f32 (-Pi1, n, r); + r = v_fma_f32 (-Pi2, n, r); + r = v_fma_f32 (-Pi3, n, r); + + /* y = sin(r) */ + r2 = r * r; + y = v_fma_f32 (A9, r2, A7); + y = v_fma_f32 (y, r2, A5); + y = v_fma_f32 (y, r2, A3); + y = v_fma_f32 (y * r2, r, r); + + /* sign fix */ + y = v_as_f32_u32 (v_as_u32_f32 (y) ^ sign ^ odd); + + if (unlikely (v_any_u32 (cmp))) + return specialcase (x, y, cmp); + return y; +} +VPCS_ALIAS +#endif diff --git a/math/vn_cosf.c b/math/vn_cosf.c new file mode 100644 index 0000000..6321d46 --- /dev/null +++ b/math/vn_cosf.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_cosf. + * + * Copyright (c) 2019, Arm Limited. 
+ * SPDX-License-Identifier: MIT + */ +#include "mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_cosf, _ZGVnN4v_cosf) +#include "v_cosf.c" +#endif diff --git a/math/vn_sinf.c b/math/vn_sinf.c new file mode 100644 index 0000000..1214e1a --- /dev/null +++ b/math/vn_sinf.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_sinf. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#include "mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_sinf, _ZGVnN4v_sinf) +#include "v_sinf.c" +#endif -- cgit v1.2.3