author     Joe Ramsay <Joe.Ramsay@arm.com>    2022-07-21 13:55:41 +0100
committer  Joe Ramsay <joe.ramsay@arm.com>    2022-07-21 13:55:41 +0100
commit     a40716ffedba4fa94c1a0c3e88857740d86a00b0 (patch)
tree       4fcf4c12b8781d7160cdef97ef2e93c5db3c0bc4
parent     a790e502efe76dbad55a33655f6e3c9066e11325 (diff)
download   arm-optimized-routines-a40716ffedba4fa94c1a0c3e88857740d86a00b0.tar.gz
pl/math: Add Vector/SVE cosf.
An implementation based on SVE trigonometric instructions.
It relies on the same range reduction as Vector/Neon
cosf, with a slight modification of the shift.
The maximum measured error is 2.06 ULPs.
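For reference, the "slight modification of the shift" is an extra +1 folded into the Neon shift constant (0x1.800002p+23f = 0x1.8p23 + 1 in the new sv_cosf_2u1.c below): it leaves n = rint(|x|/(pi/2)) unchanged, while pre-setting bit 0 of the biased quadrant value q that the SVE trigonometric instructions read through its reinterpreted bits. The scalar C sketch below is illustration only (it is not part of the commit, and unlike the SVE code it uses unfused multiply-adds):

/* Scalar model of the biased-shift quadrant trick (illustration only).
   Assumes round-to-nearest and |x| below the 0x1p20f RangeVal cutoff.  */
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main (void)
{
  const float InvPio2 = 0x1.45f306p-1f; /* 2/pi.  */
  const float Shift = 0x1.800002p+23f;  /* Neon shift 0x1.8p23, plus 1.  */
  float x = 2.5f;

  float r = fabsf (x);
  float q = r * InvPio2 + Shift; /* Biased quadrant (an FMA in the SVE code).  */
  float n = q - Shift;           /* The +1 cancels: n == rintf (r * InvPio2).  */

  /* Bit 0 of q's bit pattern - what the trig instructions see via
     sv_as_u32_f32 (q) - now carries the quadrant parity plus one.  */
  uint32_t qbits;
  memcpy (&qbits, &q, sizeof (qbits));
  printf ("n = %g, rint = %g, q bit0 = %u\n", n, rintf (r * InvPio2),
          (unsigned) (qbits & 1));

  /* Three-step reduction into [-pi/4, pi/4] with the same constants.  */
  r = r + n * -0x1.921fb6p+0f;
  r = r + n * 0x1.777a5cp-25f;
  r = r + n * 0x1.ee59dap-50f;
  printf ("reduced r = %a\n", (double) r);
  return 0;
}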
-rw-r--r--  pl/math/Dir.mk                  |   2
-rw-r--r--  pl/math/include/mathlib.h       |   8
-rw-r--r--  pl/math/sv_cosf_2u1.c           |  75
-rw-r--r--  pl/math/sv_math.h               | 160
-rw-r--r--  pl/math/test/mathbench_funcs.h  |   5
-rwxr-xr-x  pl/math/test/runulp.sh          |  19
-rw-r--r--  pl/math/test/ulp_funcs.h        |   4
-rw-r--r--  pl/math/test/ulp_wrappers.h     |  19
8 files changed, 288 insertions, 4 deletions
diff --git a/pl/math/Dir.mk b/pl/math/Dir.mk
index 4a96dc6..7909ea0 100644
--- a/pl/math/Dir.mk
+++ b/pl/math/Dir.mk
@@ -128,7 +128,7 @@ check-pl/math-rtest: $(math-host-tools) $(math-tools)
 	cat $(math-rtests) | build/pl/bin/rtest | $(EMULATOR) build/pl/bin/mathtest $(math-testflags)
 
 check-pl/math-ulp: $(math-tools)
-	ULPFLAGS="$(math-ulpflags)" build/pl/bin/runulp.sh $(EMULATOR)
+	WANT_SVE_MATH=$(WANT_SVE_MATH) ULPFLAGS="$(math-ulpflags)" build/pl/bin/runulp.sh $(EMULATOR)
 
 check-pl/math: check-pl/math-test check-pl/math-rtest check-pl/math-ulp
 
diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h
index c6620c0..5ed266a 100644
--- a/pl/math/include/mathlib.h
+++ b/pl/math/include/mathlib.h
@@ -92,6 +92,14 @@ __vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t);
 __vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t);
 #endif
+
+#if WANT_SVE_MATH
+#include <arm_sve.h>
+svfloat32_t __sv_cosf_x (svfloat32_t, svbool_t);
+/* SVE ABI names. */
+svfloat32_t _ZGVsMxv_cosf (svfloat32_t, svbool_t);
+#endif
+
 #endif
 #endif
diff --git a/pl/math/sv_cosf_2u1.c b/pl/math/sv_cosf_2u1.c
new file mode 100644
index 0000000..70057ea
--- /dev/null
+++ b/pl/math/sv_cosf_2u1.c
@@ -0,0 +1,75 @@
+/*
+ * Single-precision SVE cos(x) function.
+ *
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#if SV_SUPPORTED
+
+#define NegPio2_1 (sv_f32 (-0x1.921fb6p+0f))
+#define NegPio2_2 (sv_f32 (0x1.777a5cp-25f))
+#define NegPio2_3 (sv_f32 (0x1.ee59dap-50f))
+#define RangeVal (sv_f32 (0x1p20f))
+#define InvPio2 (sv_f32 (0x1.45f306p-1f))
+/* Original shift used in Neon cosf,
+   plus a contribution to set the bit #0 of q
+   as expected by trigonometric instructions. */
+#define Shift (sv_f32 (0x1.800002p+23f))
+#define AbsMask (0x7fffffff)
+
+static NOINLINE sv_f32_t
+__sv_cosf_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp)
+{
+  return sv_call_f32 (cosf, x, y, cmp);
+}
+
+/* A fast SVE implementation of cosf based on trigonometric
+   instructions (FTMAD, FTSSEL, FTSMUL).
+   Maximum measured error: 2.06 ULPs.
+   __sv_cosf(0x1.dea2f2p+19) got 0x1.fffe7ap-6
+			     want 0x1.fffe76p-6. */
+sv_f32_t
+__sv_cosf_x (sv_f32_t x, const svbool_t pg)
+{
+  sv_f32_t n, r, r2, y;
+  svbool_t cmp;
+
+  r = sv_as_f32_u32 (svand_n_u32_x (pg, sv_as_u32_f32 (x), AbsMask));
+  cmp = svcmpge_u32 (pg, sv_as_u32_f32 (r), sv_as_u32_f32 (RangeVal));
+
+  /* n = rint(|x|/(pi/2)). */
+  sv_f32_t q = sv_fma_f32_x (pg, InvPio2, r, Shift);
+  n = svsub_f32_x (pg, q, Shift);
+
+  /* r = |x| - n*(pi/2)  (range reduction into -pi/4 .. pi/4). */
+  r = sv_fma_f32_x (pg, NegPio2_1, n, r);
+  r = sv_fma_f32_x (pg, NegPio2_2, n, r);
+  r = sv_fma_f32_x (pg, NegPio2_3, n, r);
+
+  /* Final multiplicative factor: 1.0 or x depending on bit #0 of q. */
+  sv_f32_t f = svtssel_f32 (r, sv_as_u32_f32 (q));
+
+  /* cos(r) poly approx. */
+  r2 = svtsmul_f32 (r, sv_as_u32_f32 (q));
+  y = sv_f32 (0.0f);
+  y = svtmad_f32 (y, r2, 4);
+  y = svtmad_f32 (y, r2, 3);
+  y = svtmad_f32 (y, r2, 2);
+  y = svtmad_f32 (y, r2, 1);
+  y = svtmad_f32 (y, r2, 0);
+
+  /* Apply factor. */
+  y = svmul_f32_x (pg, f, y);
+
+  /* No need to pass pg to specialcase here since cmp is a strict subset,
+     guaranteed by the cmpge above. */
+  if (unlikely (svptest_any (pg, cmp)))
+    return __sv_cosf_specialcase (x, y, cmp);
+  return y;
+}
+
+strong_alias (__sv_cosf_x, _ZGVsMxv_cosf)
+
+#endif
diff --git a/pl/math/sv_math.h b/pl/math/sv_math.h
new file mode 100644
index 0000000..14919be
--- /dev/null
+++ b/pl/math/sv_math.h
@@ -0,0 +1,160 @@
+/*
+ * Wrapper functions for SVE ACLE.
+ *
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef SV_MATH_H
+#define SV_MATH_H
+
+#ifndef WANT_VMATH
+/* Enable the build of vector math code. */
+#define WANT_VMATH 1
+#endif
+#if WANT_VMATH
+
+#if WANT_SVE_MATH
+#define SV_SUPPORTED 1
+
+#include <arm_sve.h>
+#include <stdbool.h>
+
+#include "math_config.h"
+
+typedef float f32_t;
+typedef uint32_t u32_t;
+typedef int32_t s32_t;
+typedef double f64_t;
+typedef uint64_t u64_t;
+typedef int64_t s64_t;
+
+typedef svfloat64_t sv_f64_t;
+typedef svuint64_t sv_u64_t;
+typedef svint64_t sv_s64_t;
+
+typedef svfloat32_t sv_f32_t;
+typedef svuint32_t sv_u32_t;
+typedef svint32_t sv_s32_t;
+
+/* Double precision. */
+static inline sv_s64_t
+sv_s64 (s64_t x)
+{
+  return svdup_n_s64 (x);
+}
+
+static inline sv_u64_t
+sv_u64 (u64_t x)
+{
+  return svdup_n_u64 (x);
+}
+
+static inline sv_f64_t
+sv_f64 (f64_t x)
+{
+  return svdup_n_f64 (x);
+}
+
+static inline sv_f64_t
+sv_fma_f64_x (svbool_t pg, sv_f64_t x, sv_f64_t y, sv_f64_t z)
+{
+  return svmla_f64_x (pg, z, x, y);
+}
+
+/* res = z + x * y with x scalar. */
+static inline sv_f64_t
+sv_fma_n_f64_x (svbool_t pg, f64_t x, sv_f64_t y, sv_f64_t z)
+{
+  return svmla_n_f64_x (pg, z, y, x);
+}
+
+static inline sv_u64_t
+sv_as_u64_f64 (sv_f64_t x)
+{
+  return svreinterpret_u64_f64 (x);
+}
+
+static inline sv_f64_t
+sv_as_f64_u64 (sv_u64_t x)
+{
+  return svreinterpret_f64_u64 (x);
+}
+
+static inline sv_f64_t
+sv_call_f64 (f64_t (*f) (f64_t), sv_f64_t x, sv_f64_t y, svbool_t cmp)
+{
+  svbool_t p = svpfirst (cmp, svpfalse ());
+  while (svptest_any (cmp, p))
+    {
+      f64_t elem = svclastb_n_f64 (p, 0, x);
+      elem = (*f) (elem);
+      sv_f64_t y2 = svdup_n_f64 (elem);
+      y = svsel_f64 (p, y2, y);
+      p = svpnext_b64 (cmp, p);
+    }
+  return y;
+}
+
+/* Single precision. */
+static inline sv_s32_t
+sv_s32 (s32_t x)
+{
+  return svdup_n_s32 (x);
+}
+
+static inline sv_u32_t
+sv_u32 (u32_t x)
+{
+  return svdup_n_u32 (x);
+}
+
+static inline sv_f32_t
+sv_f32 (f32_t x)
+{
+  return svdup_n_f32 (x);
+}
+
+static inline sv_f32_t
+sv_fma_f32_x (svbool_t pg, sv_f32_t x, sv_f32_t y, sv_f32_t z)
+{
+  return svmla_f32_x (pg, z, x, y);
+}
+
+/* res = z + x * y with x scalar. */
+static inline sv_f32_t
+sv_fma_n_f32_x (svbool_t pg, f32_t x, sv_f32_t y, sv_f32_t z)
+{
+  return svmla_n_f32_x (pg, z, y, x);
+}
+
+static inline sv_u32_t
+sv_as_u32_f32 (sv_f32_t x)
+{
+  return svreinterpret_u32_f32 (x);
+}
+
+static inline sv_f32_t
+sv_as_f32_u32 (sv_u32_t x)
+{
+  return svreinterpret_f32_u32 (x);
+}
+
+static inline sv_f32_t
+sv_call_f32 (f32_t (*f) (f32_t), sv_f32_t x, sv_f32_t y, svbool_t cmp)
+{
+  svbool_t p = svpfirst (cmp, svpfalse ());
+  while (svptest_any (cmp, p))
+    {
+      f32_t elem = svclastb_n_f32 (p, 0, x);
+      elem = (*f) (elem);
+      sv_f32_t y2 = svdup_n_f32 (elem);
+      y = svsel_f32 (p, y2, y);
+      p = svpnext_b32 (cmp, p);
+    }
+  return y;
+}
+
+#endif
+#endif
+#endif
diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h
index f681489..5b6a806 100644
--- a/pl/math/test/mathbench_funcs.h
+++ b/pl/math/test/mathbench_funcs.h
@@ -8,6 +8,7 @@
 F (asinhf, -10.0, 10.0)
 F (atanf, -10.0, 10.0)
 {"atan2f", 'f', 0, -10.0, 10.0, {.f = atan2f_wrap}},
+F (cosf, -3.1, 3.1)
 F (erfcf, -4.0, 10.0)
 F (erff, -4.0, 4.0)
 F (log10f, 0.01, 11.1)
@@ -85,5 +86,9 @@ VNF (__vn_log1pf, -0.9, 10.0)
 VNF (_ZGVnN4v_log1pf, -0.9, 10.0)
 #endif
 #endif
+#if WANT_SVE_MATH
+SVF (__sv_cosf_x, -3.1, 3.1)
+SVF (_ZGVsMxv_cosf, -3.1, 3.1)
+#endif
 #endif
 // clang-format on
diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh
index 900d7d2..8af5d1c 100755
--- a/pl/math/test/runulp.sh
+++ b/pl/math/test/runulp.sh
@@ -15,6 +15,9 @@ rmodes='n'
 flags="${ULPFLAGS:--q}"
 emu="$@"
 
+# Enable SVE testing
+WANT_SVE_MATH=${WANT_SVE_MATH:-0}
+
 FAIL=0
 PASS=0
 
@@ -132,6 +135,10 @@ runv=
 check __v_log10f 1 && runv=1
 runvn=
 check __vn_log10f 1 && runvn=1
+runsv=
+if [ $WANT_SVE_MATH -eq 1 ]; then
+check __sv_cosf 0 && runsv=1
+fi
 
 range_erfc='
  0 0xffff0000 10000
@@ -234,6 +241,11 @@ range_asinhf='
 -0x1p11 -inf 20000
 '
 
+range_sve_cosf='
+ 0 0xffff0000 10000
+ 0x1p-4 0x1p4 500000
+'
+
 # error limits
 L_erfc=3.7
 L_erfcf=1.0
@@ -248,6 +260,8 @@ L_atanf=3.0
 L_log1pf=2.0
 L_asinhf=2.2
 
+L_sve_cosf=1.6
+
 while read G F R
 do
 	[ "$R" = 1 ] || continue
@@ -314,6 +328,11 @@ asinhf __s_asinhf $runs
 asinhf __v_asinhf $runv
 asinhf __vn_asinhf $runvn
 asinhf _ZGVnN4v_asinhf $runvn
+
+if [ $WANT_SVE_MATH -eq 1 ]; then
+sve_cosf __sv_cosf $runsv
+sve_cosf _ZGVsMxv_cosf $runsv
+fi
 EOF
 
 [ 0 -eq $FAIL ] || {
diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h
index 3a4e6b9..d92b3b5 100644
--- a/pl/math/test/ulp_funcs.h
+++ b/pl/math/test/ulp_funcs.h
@@ -68,4 +68,8 @@ F (_ZGVnN2v_log10, Z_log10, log10l, mpfr_log10, 1, 0, d1, 1)
 F (_ZGVnN4v_log1pf, Z_log1pf, log1p, mpfr_log1p, 1, 1, f1, 1)
 #endif
 #endif
+#if WANT_SVE_MATH
+SVF1 (cos)
+ZSVF1 (cos)
+#endif
 #endif
diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h
index 06f94e5..df25bf1 100644
--- a/pl/math/test/ulp_wrappers.h
+++ b/pl/math/test/ulp_wrappers.h
@@ -6,10 +6,15 @@
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
-
 #if USE_MPFR
-static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_cos(y,x,r); return mpfr_sin(y,x,r); }
-static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y,x,r); return mpfr_cos(y,x,r); }
+static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) {
+  mpfr_cos(y, x, r);
+  return mpfr_sin(y, x, r);
+}
+static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) {
+  mpfr_sin(y, x, r);
+  return mpfr_cos(y, x, r);
+}
 #endif
 
 /* Wrappers for vector functions. */
@@ -53,5 +58,13 @@ static double Z_erf(double x) { return _ZGVnN2v_erf(argd(x))[0]; }
 static double Z_erfc(double x) { return _ZGVnN2v_erfc(argd(x))[0]; }
 static double Z_log10(double x) { return _ZGVnN2v_log10(argd(x))[0]; }
 #endif
+#if WANT_SVE_MATH
+static float sv_cosf(float x) {
+  return svretf(__sv_cosf_x(svargf(x), svptrue_b32()));
+}
+static float Z_sv_cosf(float x) {
+  return svretf(_ZGVsMxv_cosf(svargf(x), svptrue_b32()));
+}
+#endif
 #endif
 // clang-format on
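As a usage sketch (not part of the commit), the new routine can be called through the SVE ABI name declared in mathlib.h. The buffer loop, the cosf_buffer name, and the build setup (SVE-enabled toolchain, WANT_SVE_MATH defined) are assumptions for illustration:

/* Apply the SVE cosf to a buffer, predicating the tail iteration.
   Illustrative only; build with an SVE-capable compiler/target.  */
#include <arm_sve.h>
#include <stddef.h>
#include <stdint.h>

/* Prototype as declared in pl/math/include/mathlib.h.  */
svfloat32_t _ZGVsMxv_cosf (svfloat32_t, svbool_t);

void
cosf_buffer (const float *in, float *out, size_t n)
{
  for (size_t i = 0; i < n; i += svcntw ())
    {
      svbool_t pg = svwhilelt_b32_u64 (i, n);
      svfloat32_t x = svld1_f32 (pg, in + i);
      svst1_f32 (pg, out + i, _ZGVsMxv_cosf (x, pg));
    }
}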