author    Joe Ramsay <Joe.Ramsay@arm.com>  2022-07-21 13:55:41 +0100
committer Joe Ramsay <joe.ramsay@arm.com>  2022-07-21 13:55:41 +0100
commit    a40716ffedba4fa94c1a0c3e88857740d86a00b0
tree      4fcf4c12b8781d7160cdef97ef2e93c5db3c0bc4
parent    a790e502efe76dbad55a33655f6e3c9066e11325
pl/math: Add Vector/SVE cosf.
An implementation based on SVE trigonometric instructions. It relies on the same range reduction as the Vector/Neon cosf, with a slight modification of the shift. The maximum measured error is 2.06 ULP.
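For orientation, here is a minimal scalar sketch of the shift-based range reduction the kernel uses (a hypothetical helper for illustration only; the real code below operates on whole SVE vectors under a predicate):

    #include <math.h>

    /* Scalar model of the reduction in sv_cosf_2u1.c.  Adding
       Shift = 0x1.800002p+23f (= 1.5*2^23 + 1) forces the low mantissa
       bits of q to hold round(|x| * 2/pi); the extra +1 baked into the
       shift sets bit 0 of that integer, the encoding the SVE
       trigonometric instructions expect for cos.  */
    static float
    reduce (float r /* = |x| */, float *q)
    {
      const float InvPio2 = 0x1.45f306p-1f; /* 2/pi. */
      const float Shift = 0x1.800002p+23f;
      *q = fmaf (r, InvPio2, Shift);
      float n = *q - Shift; /* n = rint(|x| * 2/pi). */
      /* r - n*(pi/2), with pi/2 in three pieces to keep precision. */
      r = fmaf (n, -0x1.921fb6p+0f, r);
      r = fmaf (n, 0x1.777a5cp-25f, r);
      r = fmaf (n, 0x1.ee59dap-50f, r);
      return r; /* Reduced argument in [-pi/4, pi/4]. */
    }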
-rw-r--r--  pl/math/Dir.mk                    2
-rw-r--r--  pl/math/include/mathlib.h         8
-rw-r--r--  pl/math/sv_cosf_2u1.c            75
-rw-r--r--  pl/math/sv_math.h               160
-rw-r--r--  pl/math/test/mathbench_funcs.h    5
-rwxr-xr-x  pl/math/test/runulp.sh           19
-rw-r--r--  pl/math/test/ulp_funcs.h          4
-rw-r--r--  pl/math/test/ulp_wrappers.h      19
8 files changed, 288 insertions(+), 4 deletions(-)
diff --git a/pl/math/Dir.mk b/pl/math/Dir.mk
index 4a96dc6..7909ea0 100644
--- a/pl/math/Dir.mk
+++ b/pl/math/Dir.mk
@@ -128,7 +128,7 @@ check-pl/math-rtest: $(math-host-tools) $(math-tools)
cat $(math-rtests) | build/pl/bin/rtest | $(EMULATOR) build/pl/bin/mathtest $(math-testflags)
check-pl/math-ulp: $(math-tools)
- ULPFLAGS="$(math-ulpflags)" build/pl/bin/runulp.sh $(EMULATOR)
+ WANT_SVE_MATH=$(WANT_SVE_MATH) ULPFLAGS="$(math-ulpflags)" build/pl/bin/runulp.sh $(EMULATOR)
check-pl/math: check-pl/math-test check-pl/math-rtest check-pl/math-ulp
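Usage note (not part of the patch): with this change the SVE ULP runs are keyed off WANT_SVE_MATH, so, assuming the flag is also set when the libraries themselves are built, the new tests can be driven with something like:

    make WANT_SVE_MATH=1 check-pl/math-ulp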
diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h
index c6620c0..5ed266a 100644
--- a/pl/math/include/mathlib.h
+++ b/pl/math/include/mathlib.h
@@ -92,6 +92,14 @@ __vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t);
__vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t);
#endif
+
+#if WANT_SVE_MATH
+#include <arm_sve.h>
+svfloat32_t __sv_cosf_x (svfloat32_t, svbool_t);
+/* SVE ABI names. */
+svfloat32_t _ZGVsMxv_cosf (svfloat32_t, svbool_t);
+#endif
+
#endif
#endif
diff --git a/pl/math/sv_cosf_2u1.c b/pl/math/sv_cosf_2u1.c
new file mode 100644
index 0000000..70057ea
--- /dev/null
+++ b/pl/math/sv_cosf_2u1.c
@@ -0,0 +1,75 @@
+/*
+ * Single-precision SVE cos(x) function.
+ *
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#if SV_SUPPORTED
+
+#define NegPio2_1 (sv_f32 (-0x1.921fb6p+0f))
+#define NegPio2_2 (sv_f32 (0x1.777a5cp-25f))
+#define NegPio2_3 (sv_f32 (0x1.ee59dap-50f))
+#define RangeVal (sv_f32 (0x1p20f))
+#define InvPio2 (sv_f32 (0x1.45f306p-1f))
+/* Original shift used in Neon cosf,
+ plus a contribution to set the bit #0 of q
+ as expected by trigonometric instructions. */
+#define Shift (sv_f32 (0x1.800002p+23f))
+#define AbsMask (0x7fffffff)
+
+static NOINLINE sv_f32_t
+__sv_cosf_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp)
+{
+ return sv_call_f32 (cosf, x, y, cmp);
+}
+
+/* A fast SVE implementation of cosf based on trigonometric
+ instructions (FTMAD, FTSSEL, FTSMUL).
+ Maximum measured error: 2.06 ULPs.
+ __sv_cosf(0x1.dea2f2p+19) got 0x1.fffe7ap-6
+ want 0x1.fffe76p-6. */
+sv_f32_t
+__sv_cosf_x (sv_f32_t x, const svbool_t pg)
+{
+ sv_f32_t n, r, r2, y;
+ svbool_t cmp;
+
+ r = sv_as_f32_u32 (svand_n_u32_x (pg, sv_as_u32_f32 (x), AbsMask));
+ cmp = svcmpge_u32 (pg, sv_as_u32_f32 (r), sv_as_u32_f32 (RangeVal));
+
+ /* n = rint(|x|/(pi/2)). */
+ sv_f32_t q = sv_fma_f32_x (pg, InvPio2, r, Shift);
+ n = svsub_f32_x (pg, q, Shift);
+
+ /* r = |x| - n*(pi/2) (range reduction into -pi/4 .. pi/4). */
+ r = sv_fma_f32_x (pg, NegPio2_1, n, r);
+ r = sv_fma_f32_x (pg, NegPio2_2, n, r);
+ r = sv_fma_f32_x (pg, NegPio2_3, n, r);
+
+ /* Final multiplicative factor: 1.0 or x depending on bit #0 of q. */
+ sv_f32_t f = svtssel_f32 (r, sv_as_u32_f32 (q));
+
+ /* cos(r) poly approx. */
+ r2 = svtsmul_f32 (r, sv_as_u32_f32 (q));
+ y = sv_f32 (0.0f);
+ y = svtmad_f32 (y, r2, 4);
+ y = svtmad_f32 (y, r2, 3);
+ y = svtmad_f32 (y, r2, 2);
+ y = svtmad_f32 (y, r2, 1);
+ y = svtmad_f32 (y, r2, 0);
+
+ /* Apply factor. */
+ y = svmul_f32_x (pg, f, y);
+
+ /* No need to pass pg to specialcase here since cmp is a strict subset,
+ guaranteed by the cmpge above. */
+ if (unlikely (svptest_any (pg, cmp)))
+ return __sv_cosf_specialcase (x, y, cmp);
+ return y;
+}
+
+strong_alias (__sv_cosf_x, _ZGVsMxv_cosf)
+
+#endif
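As a usage sketch (not part of the patch): __sv_cosf_x takes a governing predicate, so a caller strip-mines an array with whilelt-generated predicates. This assumes an SVE-enabled toolchain and the declarations from mathlib.h above; cosf_array is a hypothetical driver:

    #include <stdint.h>
    #include <arm_sve.h>
    #include "mathlib.h"

    /* Hypothetical driver: y[i] = cosf (x[i]) for 0 <= i < n.  */
    void
    cosf_array (float *y, const float *x, int64_t n)
    {
      for (int64_t i = 0; i < n; i += svcntw ())
        {
          svbool_t pg = svwhilelt_b32 (i, n);     /* Active lanes only. */
          svfloat32_t vx = svld1_f32 (pg, x + i); /* Predicated load. */
          svst1_f32 (pg, y + i, __sv_cosf_x (vx, pg));
        }
    }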
diff --git a/pl/math/sv_math.h b/pl/math/sv_math.h
new file mode 100644
index 0000000..14919be
--- /dev/null
+++ b/pl/math/sv_math.h
@@ -0,0 +1,160 @@
+/*
+ * Wrapper functions for SVE ACLE.
+ *
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef SV_MATH_H
+#define SV_MATH_H
+
+#ifndef WANT_VMATH
+/* Enable the build of vector math code. */
+#define WANT_VMATH 1
+#endif
+#if WANT_VMATH
+
+#if WANT_SVE_MATH
+#define SV_SUPPORTED 1
+
+#include <arm_sve.h>
+#include <stdbool.h>
+
+#include "math_config.h"
+
+typedef float f32_t;
+typedef uint32_t u32_t;
+typedef int32_t s32_t;
+typedef double f64_t;
+typedef uint64_t u64_t;
+typedef int64_t s64_t;
+
+typedef svfloat64_t sv_f64_t;
+typedef svuint64_t sv_u64_t;
+typedef svint64_t sv_s64_t;
+
+typedef svfloat32_t sv_f32_t;
+typedef svuint32_t sv_u32_t;
+typedef svint32_t sv_s32_t;
+
+/* Double precision. */
+static inline sv_s64_t
+sv_s64 (s64_t x)
+{
+ return svdup_n_s64 (x);
+}
+
+static inline sv_u64_t
+sv_u64 (u64_t x)
+{
+ return svdup_n_u64 (x);
+}
+
+static inline sv_f64_t
+sv_f64 (f64_t x)
+{
+ return svdup_n_f64 (x);
+}
+
+static inline sv_f64_t
+sv_fma_f64_x (svbool_t pg, sv_f64_t x, sv_f64_t y, sv_f64_t z)
+{
+ return svmla_f64_x (pg, z, x, y);
+}
+
+/* res = z + x * y with x scalar. */
+static inline sv_f64_t
+sv_fma_n_f64_x (svbool_t pg, f64_t x, sv_f64_t y, sv_f64_t z)
+{
+ return svmla_n_f64_x (pg, z, y, x);
+}
+
+static inline sv_u64_t
+sv_as_u64_f64 (sv_f64_t x)
+{
+ return svreinterpret_u64_f64 (x);
+}
+
+static inline sv_f64_t
+sv_as_f64_u64 (sv_u64_t x)
+{
+ return svreinterpret_f64_u64 (x);
+}
+
+static inline sv_f64_t
+sv_call_f64 (f64_t (*f) (f64_t), sv_f64_t x, sv_f64_t y, svbool_t cmp)
+{
+ svbool_t p = svpfirst (cmp, svpfalse ());
+ while (svptest_any (cmp, p))
+ {
+ f64_t elem = svclastb_n_f64 (p, 0, x);
+ elem = (*f) (elem);
+ sv_f64_t y2 = svdup_n_f64 (elem);
+ y = svsel_f64 (p, y2, y);
+ p = svpnext_b64 (cmp, p);
+ }
+ return y;
+}
+
+/* Single precision. */
+static inline sv_s32_t
+sv_s32 (s32_t x)
+{
+ return svdup_n_s32 (x);
+}
+
+static inline sv_u32_t
+sv_u32 (u32_t x)
+{
+ return svdup_n_u32 (x);
+}
+
+static inline sv_f32_t
+sv_f32 (f32_t x)
+{
+ return svdup_n_f32 (x);
+}
+
+static inline sv_f32_t
+sv_fma_f32_x (svbool_t pg, sv_f32_t x, sv_f32_t y, sv_f32_t z)
+{
+ return svmla_f32_x (pg, z, x, y);
+}
+
+/* res = z + x * y with x scalar. */
+static inline sv_f32_t
+sv_fma_n_f32_x (svbool_t pg, f32_t x, sv_f32_t y, sv_f32_t z)
+{
+ return svmla_n_f32_x (pg, z, y, x);
+}
+
+static inline sv_u32_t
+sv_as_u32_f32 (sv_f32_t x)
+{
+ return svreinterpret_u32_f32 (x);
+}
+
+static inline sv_f32_t
+sv_as_f32_u32 (sv_u32_t x)
+{
+ return svreinterpret_f32_u32 (x);
+}
+
+static inline sv_f32_t
+sv_call_f32 (f32_t (*f) (f32_t), sv_f32_t x, sv_f32_t y, svbool_t cmp)
+{
+ svbool_t p = svpfirst (cmp, svpfalse ());
+ while (svptest_any (cmp, p))
+ {
+ f32_t elem = svclastb_n_f32 (p, 0, x);
+ elem = (*f) (elem);
+ sv_f32_t y2 = svdup_n_f32 (elem);
+ y = svsel_f32 (p, y2, y);
+ p = svpnext_b32 (cmp, p);
+ }
+ return y;
+}
+
+#endif
+#endif
+#endif
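For intuition, sv_call_f32 is a predicated map over the lanes selected by cmp: svpfirst isolates the first active lane, svclastb_n_f32 extracts that lane's element for the scalar call, svsel_f32 merges the result back, and svpnext_b32 advances to the next active lane. A rough scalar model of the same contract (hypothetical helper, fixed at 4 lanes purely for illustration):

    #include <stdbool.h>

    /* Scalar model of sv_call_f32: for every lane selected by cmp,
       replace y[i] with f (x[i]); other lanes are left untouched.  */
    static void
    call_f32_model (float (*f) (float), const float x[4], float y[4],
                    const bool cmp[4])
    {
      for (int i = 0; i < 4; i++)
        if (cmp[i])
          y[i] = f (x[i]); /* One scalar call per active lane. */
    }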
diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h
index f681489..5b6a806 100644
--- a/pl/math/test/mathbench_funcs.h
+++ b/pl/math/test/mathbench_funcs.h
@@ -8,6 +8,7 @@
F (asinhf, -10.0, 10.0)
F (atanf, -10.0, 10.0)
{"atan2f", 'f', 0, -10.0, 10.0, {.f = atan2f_wrap}},
+F (cosf, -3.1, 3.1)
F (erfcf, -4.0, 10.0)
F (erff, -4.0, 4.0)
F (log10f, 0.01, 11.1)
@@ -85,5 +86,9 @@ VNF (__vn_log1pf, -0.9, 10.0)
VNF (_ZGVnN4v_log1pf, -0.9, 10.0)
#endif
#endif
+#if WANT_SVE_MATH
+SVF (__sv_cosf_x, -3.1, 3.1)
+SVF (_ZGVsMxv_cosf, -3.1, 3.1)
+#endif
#endif
// clang-format on
diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh
index 900d7d2..8af5d1c 100755
--- a/pl/math/test/runulp.sh
+++ b/pl/math/test/runulp.sh
@@ -15,6 +15,9 @@ rmodes='n'
flags="${ULPFLAGS:--q}"
emu="$@"
+# Enable SVE testing
+WANT_SVE_MATH=${WANT_SVE_MATH:-0}
+
FAIL=0
PASS=0
@@ -132,6 +135,10 @@ runv=
check __v_log10f 1 && runv=1
runvn=
check __vn_log10f 1 && runvn=1
+runsv=
+if [ $WANT_SVE_MATH -eq 1 ]; then
+check __sv_cosf 0 && runsv=1
+fi
range_erfc='
0 0xffff0000 10000
@@ -234,6 +241,11 @@ range_asinhf='
-0x1p11 -inf 20000
'
+range_sve_cosf='
+ 0 0xffff0000 10000
+ 0x1p-4 0x1p4 500000
+'
+
# error limits
L_erfc=3.7
L_erfcf=1.0
@@ -248,6 +260,8 @@ L_atanf=3.0
L_log1pf=2.0
L_asinhf=2.2
+L_sve_cosf=1.6
+
while read G F R
do
[ "$R" = 1 ] || continue
@@ -314,6 +328,11 @@ asinhf __s_asinhf $runs
asinhf __v_asinhf $runv
asinhf __vn_asinhf $runvn
asinhf _ZGVnN4v_asinhf $runvn
+
+if [ $WANT_SVE_MATH -eq 1 ]; then
+sve_cosf __sv_cosf $runsv
+sve_cosf _ZGVsMxv_cosf $runsv
+fi
EOF
[ 0 -eq $FAIL ] || {
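Per the Dir.mk hunk above, make exports the flag into the harness environment, so the SVE cases can also be run by hand along the same lines ($EMULATOR may be empty on native SVE hardware):

    WANT_SVE_MATH=1 ULPFLAGS=-q build/pl/bin/runulp.sh "$EMULATOR"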
diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h
index 3a4e6b9..d92b3b5 100644
--- a/pl/math/test/ulp_funcs.h
+++ b/pl/math/test/ulp_funcs.h
@@ -68,4 +68,8 @@ F (_ZGVnN2v_log10, Z_log10, log10l, mpfr_log10, 1, 0, d1, 1)
F (_ZGVnN4v_log1pf, Z_log1pf, log1p, mpfr_log1p, 1, 1, f1, 1)
#endif
#endif
+#if WANT_SVE_MATH
+SVF1 (cos)
+ZSVF1 (cos)
+#endif
#endif
diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h
index 06f94e5..df25bf1 100644
--- a/pl/math/test/ulp_wrappers.h
+++ b/pl/math/test/ulp_wrappers.h
@@ -6,10 +6,15 @@
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-
#if USE_MPFR
-static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_cos(y,x,r); return mpfr_sin(y,x,r); }
-static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y,x,r); return mpfr_cos(y,x,r); }
+static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) {
+ mpfr_cos(y, x, r);
+ return mpfr_sin(y, x, r);
+}
+static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) {
+ mpfr_sin(y, x, r);
+ return mpfr_cos(y, x, r);
+}
#endif
/* Wrappers for vector functions. */
@@ -53,5 +58,13 @@ static double Z_erf(double x) { return _ZGVnN2v_erf(argd(x))[0]; }
static double Z_erfc(double x) { return _ZGVnN2v_erfc(argd(x))[0]; }
static double Z_log10(double x) { return _ZGVnN2v_log10(argd(x))[0]; }
#endif
+#if WANT_SVE_MATH
+static float sv_cosf(float x) {
+ return svretf(__sv_cosf_x(svargf(x), svptrue_b32()));
+}
+static float Z_sv_cosf(float x) {
+ return svretf(_ZGVsMxv_cosf(svargf(x), svptrue_b32()));
+}
+#endif
#endif
// clang-format on
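A closing note on the helpers: svargf and svretf come from the ulp harness itself, not from this patch, and their definitions are not shown here. The wrappers above only need them to broadcast a scalar into every lane and read one lane back out; a plausible sketch under that assumption (not the harness's actual code):

    #include <arm_sve.h>

    /* Assumed semantics: broadcast a scalar argument to all lanes...  */
    static inline svfloat32_t svargf (float x) { return svdup_n_f32 (x); }

    /* ...and read an element back.  Every lane holds the same value for
       these wrappers, so the last active element is as good as lane 0.  */
    static inline float svretf (svfloat32_t v)
    {
      return svclastb_n_f32 (svptrue_b32 (), 0.0f, v);
    }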