From c5cba8528da13fe0d647dbd0f80d0cf21434b224 Mon Sep 17 00:00:00 2001
From: Szabolcs Nagy <szabolcs.nagy@arm.com>
Date: Fri, 9 Aug 2019 15:39:09 +0100
Subject: Add vector sinf and cosf

The polynomials were produced by searching the coefficient space using
heuristics and ideas from https://arxiv.org/abs/1508.03211

The worst-case error is 1.886 ulp.  Large inputs (|x| >= 2^20) and other
special cases (inf, nan) fall back to scalar sinf and cosf.
---
 math/include/mathlib.h |  8 ++++++
 math/s_cosf.c          |  6 ++++
 math/s_sinf.c          |  6 ++++
 math/test/mathbench.c  |  8 ++++++
 math/test/runulp.sh    | 19 +++++++++++++
 math/test/ulp.c        | 14 ++++++++++
 math/v_cosf.c          | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++
 math/v_sinf.c          | 75 +++++++++++++++++++++++++++++++++++++++++++++++++
 math/vn_cosf.c         | 12 ++++++++
 math/vn_sinf.c         | 12 ++++++++
 10 files changed, 236 insertions(+)
 create mode 100644 math/s_cosf.c
 create mode 100644 math/s_sinf.c
 create mode 100644 math/v_cosf.c
 create mode 100644 math/v_sinf.c
 create mode 100644 math/vn_cosf.c
 create mode 100644 math/vn_sinf.c

diff --git a/math/include/mathlib.h b/math/include/mathlib.h
index bacd2b6..405cf4a 100644
--- a/math/include/mathlib.h
+++ b/math/include/mathlib.h
@@ -24,6 +24,8 @@ double log2 (double);
 double pow (double, double);
 
 /* Scalar functions using the vector algorithm with identical result.  */
+float __s_sinf (float);
+float __s_cosf (float);
 float __s_expf (float);
 float __s_expf_1u (float);
 float __s_logf (float);
@@ -41,6 +43,8 @@ typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t;
 #endif
 
 /* Vector functions following the base PCS.  */
+__f32x4_t __v_sinf (__f32x4_t);
+__f32x4_t __v_cosf (__f32x4_t);
 __f32x4_t __v_expf (__f32x4_t);
 __f32x4_t __v_expf_1u (__f32x4_t);
 __f32x4_t __v_logf (__f32x4_t);
@@ -50,12 +54,16 @@ __f64x2_t __v_exp (__f64x2_t);
 #define __vpcs __attribute__((__aarch64_vector_pcs__))
 
 /* Vector functions following the vector PCS.  */
+__vpcs __f32x4_t __vn_sinf (__f32x4_t);
+__vpcs __f32x4_t __vn_cosf (__f32x4_t);
 __vpcs __f32x4_t __vn_expf (__f32x4_t);
 __vpcs __f32x4_t __vn_expf_1u (__f32x4_t);
 __vpcs __f32x4_t __vn_logf (__f32x4_t);
 __vpcs __f64x2_t __vn_exp (__f64x2_t);
 
 /* Vector functions following the vector PCS using ABI names.  */
+__vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
 __vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t);
 __vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t);
 __vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t);
diff --git a/math/s_cosf.c b/math/s_cosf.c
new file mode 100644
index 0000000..914c02e
--- /dev/null
+++ b/math/s_cosf.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#define SCALAR 1
+#include "v_cosf.c"
diff --git a/math/s_sinf.c b/math/s_sinf.c
new file mode 100644
index 0000000..68ca908
--- /dev/null
+++ b/math/s_sinf.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#define SCALAR 1
+#include "v_sinf.c"
diff --git a/math/test/mathbench.c b/math/test/mathbench.c
index e4d4a8b..7544a7e 100644
--- a/math/test/mathbench.c
+++ b/math/test/mathbench.c
@@ -216,12 +216,14 @@ F (sinf, -3.1, 3.1)
 F (sinf, 3.3, 33.3)
 F (sinf, 100, 1000)
 F (sinf, 1e6, 1e32)
+F (__s_sinf, -3.1, 3.1)
 F (cosf, 0.1, 0.7)
 F (cosf, 0.8, 3.1)
 F (cosf, -3.1, 3.1)
 F (cosf, 3.3, 33.3)
 F (cosf, 100, 1000)
 F (cosf, 1e6, 1e32)
+F (__s_cosf, -3.1, 3.1)
 #if __aarch64__
 VD (__v_dummy, 1.0, 2.0)
 VD (__v_exp, -9.9, 9.9)
@@ -229,6 +231,8 @@ VF (__v_dummyf, 1.0, 2.0)
 VF (__v_expf, -9.9, 9.9)
 VF (__v_expf_1u, -9.9, 9.9)
 VF (__v_logf, 0.01, 11.1)
+VF (__v_sinf, -3.1, 3.1)
+VF (__v_cosf, -3.1, 3.1)
 #ifdef __vpcs
 VND (__vn_dummy, 1.0, 2.0)
 VND (__vn_exp, -9.9, 9.9)
@@ -239,6 +243,10 @@ VNF (_ZGVnN4v_expf, -9.9, 9.9)
 VNF (__vn_expf_1u, -9.9, 9.9)
 VNF (__vn_logf, 0.01, 11.1)
 VNF (_ZGVnN4v_logf, 0.01, 11.1)
+VNF (__vn_sinf, -3.1, 3.1)
+VNF (_ZGVnN4v_sinf, -3.1, 3.1)
+VNF (__vn_cosf, -3.1, 3.1)
+VNF (_ZGVnN4v_cosf, -3.1, 3.1)
 #endif
 #endif
 {0},
diff --git a/math/test/runulp.sh b/math/test/runulp.sh
index 81716fa..efa9269 100755
--- a/math/test/runulp.sh
+++ b/math/test/runulp.sh
@@ -121,11 +121,20 @@ range_logf='
  0x1p-4    0x1p4    500000
 '
 
+range_sinf='
+ 0    0xffff0000    10000
+ 0x1p-4    0x1p4    300000
+-0x1p-9   -0x1p9    300000
+'
+range_cosf="$range_sinf"
+
 # error limits
 L_exp=1.9
 L_expf=1.49
 L_expf_1u=0.4
 L_logf=2.9
+L_sinf=1.4
+L_cosf=1.4
 
 # group symbol run
 echo "
@@ -148,6 +157,16 @@ logf __v_logf      $runv
 logf __vn_logf     $runvn
 logf _ZGVnN4v_logf $runvn
 
+sinf __s_sinf      1
+sinf __v_sinf      $runv
+sinf __vn_sinf     $runvn
+sinf _ZGVnN4v_sinf $runvn
+
+cosf __s_cosf      1
+cosf __v_cosf      $runv
+cosf __vn_cosf     $runvn
+cosf _ZGVnN4v_cosf $runvn
+
 " | while read G F R
 do
 	[ "$R" = 1 ] || continue
diff --git a/math/test/ulp.c b/math/test/ulp.c
index 2ffba9b..4eb9d85 100644
--- a/math/test/ulp.c
+++ b/math/test/ulp.c
@@ -223,15 +223,21 @@ static const double dv[2] = {1.0, -INFINITY};
 static inline v_float argf(float x) { return (v_float){x,x,x,fv[secondcall]}; }
 static inline v_double argd(double x) { return (v_double){x,dv[secondcall]}; }
 
+static float v_sinf(float x) { return __v_sinf(argf(x))[0]; }
+static float v_cosf(float x) { return __v_cosf(argf(x))[0]; }
 static float v_expf_1u(float x) { return __v_expf_1u(argf(x))[0]; }
 static float v_expf(float x) { return __v_expf(argf(x))[0]; }
 static float v_logf(float x) { return __v_logf(argf(x))[0]; }
 static double v_exp(double x) { return __v_exp(argd(x))[0]; }
 #ifdef __vpcs
+static float vn_sinf(float x) { return __vn_sinf(argf(x))[0]; }
+static float vn_cosf(float x) { return __vn_cosf(argf(x))[0]; }
 static float vn_expf_1u(float x) { return __vn_expf_1u(argf(x))[0]; }
 static float vn_expf(float x) { return __vn_expf(argf(x))[0]; }
 static float vn_logf(float x) { return __vn_logf(argf(x))[0]; }
 static double vn_exp(double x) { return __vn_exp(argd(x))[0]; }
+static float Z_sinf(float x) { return _ZGVnN4v_sinf(argf(x))[0]; }
+static float Z_cosf(float x) { return _ZGVnN4v_cosf(argf(x))[0]; }
 static float Z_expf(float x) { return _ZGVnN4v_expf(argf(x))[0]; }
 static float Z_logf(float x) { return _ZGVnN4v_logf(argf(x))[0]; }
 static double Z_exp(double x) { return _ZGVnN2v_exp(argd(x))[0]; }
@@ -293,20 +299,28 @@ static const struct fun fun[] = {
  D1 (log)
  D1 (log2)
  D2 (pow)
+ F (__s_sinf, __s_sinf, sin, mpfr_sin, 1, 1, f1, 0)
+ F (__s_cosf, __s_cosf, cos, mpfr_cos, 1, 1, f1, 0)
  F (__s_expf_1u, __s_expf_1u, exp, mpfr_exp, 1, 1, f1, 0)
  F (__s_expf, __s_expf, exp, mpfr_exp, 1, 1, f1, 0)
  F (__s_logf, __s_logf, log, mpfr_log, 1, 1, f1, 0)
  F (__s_exp, __s_exp, expl, mpfr_exp, 1, 0, d1, 0)
 #if __aarch64__
+ F (__v_sinf, v_sinf, sin, mpfr_sin, 1, 1, f1, 1)
+ F (__v_cosf, v_cosf, cos, mpfr_cos, 1, 1, f1, 1)
  F (__v_expf_1u, v_expf_1u, exp, mpfr_exp, 1, 1, f1, 1)
  F (__v_expf, v_expf, exp, mpfr_exp, 1, 1, f1, 1)
  F (__v_logf, v_logf, log, mpfr_log, 1, 1, f1, 1)
  F (__v_exp, v_exp, expl, mpfr_exp, 1, 0, d1, 1)
 #ifdef __vpcs
+ F (__vn_sinf, vn_sinf, sin, mpfr_sin, 1, 1, f1, 1)
+ F (__vn_cosf, vn_cosf, cos, mpfr_cos, 1, 1, f1, 1)
  F (__vn_expf_1u, vn_expf_1u, exp, mpfr_exp, 1, 1, f1, 1)
  F (__vn_expf, vn_expf, exp, mpfr_exp, 1, 1, f1, 1)
  F (__vn_logf, vn_logf, log, mpfr_log, 1, 1, f1, 1)
  F (__vn_exp, vn_exp, expl, mpfr_exp, 1, 0, d1, 1)
+ F (_ZGVnN4v_sinf, Z_sinf, sin, mpfr_sin, 1, 1, f1, 1)
+ F (_ZGVnN4v_cosf, Z_cosf, cos, mpfr_cos, 1, 1, f1, 1)
  F (_ZGVnN4v_expf, Z_expf, exp, mpfr_exp, 1, 1, f1, 1)
  F (_ZGVnN4v_logf, Z_logf, log, mpfr_log, 1, 1, f1, 1)
  F (_ZGVnN2v_exp, Z_exp, expl, mpfr_exp, 1, 0, d1, 1)
diff --git a/math/v_cosf.c b/math/v_cosf.c
new file mode 100644
index 0000000..150294b
--- /dev/null
+++ b/math/v_cosf.c
@@ -0,0 +1,76 @@
+/*
+ * Single-precision vector cos function.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#if V_SUPPORTED
+
+static const float Poly[] = {
+  /* 1.886 ulp error; entries are the r^9, r^7, r^5, r^3 coefficients, in that order (see A9..A3).  */
+  0x1.5b2e76p-19f,
+  -0x1.9f42eap-13f,
+  0x1.110df4p-7f,
+  -0x1.555548p-3f,
+};
+#define Pi1 v_f32 (0x1.921fb6p+1f) /* pi ~= Pi1 + Pi2 + Pi3; subtracted piecewise in the range reduction.  */
+#define Pi2 v_f32 (-0x1.777a5cp-24f)
+#define Pi3 v_f32 (-0x1.ee59dap-49f)
+#define A3 v_f32 (Poly[3]) /* A<k> multiplies r^k in the sin(r) polynomial.  */
+#define A5 v_f32 (Poly[2])
+#define A7 v_f32 (Poly[1])
+#define A9 v_f32 (Poly[0])
+#define RangeVal v_f32 (0x1p20f) /* Lanes with |x| >= 2^20 are handed to specialcase.  */
+#define InvPi v_f32 (0x1.45f306p-2f) /* 1/pi */
+#define Shift v_f32 (0x1.8p+23f) /* 1.5 * 2^23; added and later subtracted to round to an integer.  */
+#define AbsMask v_u32 (0x7fffffff) /* Every bit except the sign bit.  */
+#define HalfPi v_f32 (0x1.921fb6p0f) /* pi/2 */
+
+VPCS_ATTR
+static v_f32_t
+specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp)
+{
+  /* Fall back to scalar cosf via v_call_f32; presumably only lanes set in cmp are recomputed from x and the rest keep y (confirm in v_math.h).  */
+  return v_call_f32 (cosf, x, y, cmp);
+}
+
+VPCS_ATTR
+v_f32_t
+V_NAME(cosf) (v_f32_t x) /* Per-lane cos(x), computed as +-sin(|x| - n*pi).  */
+{
+  v_f32_t n, r, r2, y;
+  v_u32_t odd, cmp;
+
+  r = v_as_f32_u32 (v_as_u32_f32 (x) & AbsMask); /* r = |x| */
+  cmp = v_cond_u32 (v_as_u32_f32 (r) >= v_as_u32_f32 (RangeVal)); /* bits of |x| >= bits of 2^20: large |x|, inf, nan */
+
+  /* n = rint((|x|+pi/2)/pi) - 0.5; before Shift is subtracted, bit 0 of n is the parity of the rounded value.  */
+  n = v_fma_f32 (InvPi, r + HalfPi, Shift);
+  odd = v_as_u32_f32 (n) << 31;
+  n -= Shift;
+  n -= v_f32 (0.5f);
+
+  /* r = |x| - n*pi  (range reduction into -pi/2 .. pi/2), with pi taken as Pi1 + Pi2 + Pi3 */
+  r = v_fma_f32 (-Pi1, n, r);
+  r = v_fma_f32 (-Pi2, n, r);
+  r = v_fma_f32 (-Pi3, n, r);
+
+  /* y = sin(r) ~= r + r^3 * (A3 + A5*r^2 + A7*r^4 + A9*r^6) */
+  r2 = r * r;
+  y = v_fma_f32 (A9, r2, A7);
+  y = v_fma_f32 (y, r2, A5);
+  y = v_fma_f32 (y, r2, A3);
+  y = v_fma_f32 (y * r2, r, r);
+
+  /* sign fix: cos(|x|) = -sin(r) in lanes where the rounded value was odd */
+  y = v_as_f32_u32 (v_as_u32_f32 (y) ^ odd);
+
+  if (unlikely (v_any_u32 (cmp)))
+    return specialcase (x, y, cmp);
+  return y;
+}
+VPCS_ALIAS
+#endif
diff --git a/math/v_sinf.c b/math/v_sinf.c
new file mode 100644
index 0000000..e66bfce
--- /dev/null
+++ b/math/v_sinf.c
@@ -0,0 +1,75 @@
+/*
+ * Single-precision vector sin function.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#if V_SUPPORTED
+
+static const float Poly[] = {
+  /* 1.886 ulp error; entries are the r^9, r^7, r^5, r^3 coefficients, in that order (see A9..A3).  */
+  0x1.5b2e76p-19f,
+  -0x1.9f42eap-13f,
+  0x1.110df4p-7f,
+  -0x1.555548p-3f,
+};
+#define Pi1 v_f32 (0x1.921fb6p+1f) /* pi ~= Pi1 + Pi2 + Pi3; subtracted piecewise in the range reduction.  */
+#define Pi2 v_f32 (-0x1.777a5cp-24f)
+#define Pi3 v_f32 (-0x1.ee59dap-49f)
+#define A3 v_f32 (Poly[3]) /* A<k> multiplies r^k in the sin(r) polynomial.  */
+#define A5 v_f32 (Poly[2])
+#define A7 v_f32 (Poly[1])
+#define A9 v_f32 (Poly[0])
+#define RangeVal v_f32 (0x1p20f) /* Lanes with |x| >= 2^20 are handed to specialcase.  */
+#define InvPi v_f32 (0x1.45f306p-2f) /* 1/pi */
+#define Shift v_f32 (0x1.8p+23f) /* 1.5 * 2^23; added and later subtracted to round to an integer.  */
+#define AbsMask v_u32 (0x7fffffff) /* Every bit except the sign bit.  */
+
+VPCS_ATTR
+static v_f32_t
+specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp)
+{
+  /* Fall back to scalar sinf via v_call_f32; presumably only lanes set in cmp are recomputed from x and the rest keep y (confirm in v_math.h).  */
+  return v_call_f32 (sinf, x, y, cmp);
+}
+
+VPCS_ATTR
+v_f32_t
+V_NAME(sinf) (v_f32_t x) /* Per-lane sin(x), computed as +-sin(|x| - n*pi).  */
+{
+  v_f32_t n, r, r2, y;
+  v_u32_t sign, odd, cmp;
+
+  r = v_as_f32_u32 (v_as_u32_f32 (x) & AbsMask); /* r = |x| */
+  sign = v_as_u32_f32 (x) & ~AbsMask; /* sign bit of x */
+  cmp = v_cond_u32 (v_as_u32_f32 (r) >= v_as_u32_f32 (RangeVal)); /* bits of |x| >= bits of 2^20: large |x|, inf, nan */
+
+  /* n = rint(|x|/pi); before Shift is subtracted, bit 0 of n is the parity of the rounded value.  */
+  n = v_fma_f32 (InvPi, r, Shift);
+  odd = v_as_u32_f32 (n) << 31;
+  n -= Shift;
+
+  /* r = |x| - n*pi  (range reduction into -pi/2 .. pi/2), with pi taken as Pi1 + Pi2 + Pi3 */
+  r = v_fma_f32 (-Pi1, n, r);
+  r = v_fma_f32 (-Pi2, n, r);
+  r = v_fma_f32 (-Pi3, n, r);
+
+  /* y = sin(r) ~= r + r^3 * (A3 + A5*r^2 + A7*r^4 + A9*r^6) */
+  r2 = r * r;
+  y = v_fma_f32 (A9, r2, A7);
+  y = v_fma_f32 (y, r2, A5);
+  y = v_fma_f32 (y, r2, A3);
+  y = v_fma_f32 (y * r2, r, r);
+
+  /* sign fix: sin(x) = sign(x) * (-1)^n * sin(r), applied by flipping the sign bit of y */
+  y = v_as_f32_u32 (v_as_u32_f32 (y) ^ sign ^ odd);
+
+  if (unlikely (v_any_u32 (cmp)))
+    return specialcase (x, y, cmp);
+  return y;
+}
+VPCS_ALIAS
+#endif
diff --git a/math/vn_cosf.c b/math/vn_cosf.c
new file mode 100644
index 0000000..6321d46
--- /dev/null
+++ b/math/vn_cosf.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_cosf.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#include "mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS strong_alias (__vn_cosf, _ZGVnN4v_cosf)
+#include "v_cosf.c"
+#endif
diff --git a/math/vn_sinf.c b/math/vn_sinf.c
new file mode 100644
index 0000000..1214e1a
--- /dev/null
+++ b/math/vn_sinf.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_sinf.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#include "mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS strong_alias (__vn_sinf, _ZGVnN4v_sinf)
+#include "v_sinf.c"
+#endif
-- 
cgit v1.2.3