author     Joe Ramsay <Joe.Ramsay@arm.com>    2022-07-21 13:55:41 +0100
committer  Joe Ramsay <joe.ramsay@arm.com>    2022-07-21 13:55:41 +0100
commit     a40716ffedba4fa94c1a0c3e88857740d86a00b0 (patch)
tree       4fcf4c12b8781d7160cdef97ef2e93c5db3c0bc4
parent     a790e502efe76dbad55a33655f6e3c9066e11325 (diff)
download   arm-optimized-routines-a40716ffedba4fa94c1a0c3e88857740d86a00b0.tar.gz
pl/math: Add Vector/SVE cosf.
An implementation based on SVE trigonometric instructions.
It relies on the same range reduction as Vector/Neon
cosf, with a slight modification of the shift.
The maximum measured error is 2.06 ULPs.
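For reference, the "slight modification of the shift" is an extra +1 folded into the Neon shift constant (0x1.800002p+23f = 0x1.8p23 + 1 in the new sv_cosf_2u1.c below): it leaves n = rint(|x|/(pi/2)) unchanged, while pre-setting bit 0 of the biased quadrant value q that the SVE trigonometric instructions read through its reinterpreted bits. The scalar C sketch below is illustration only (it is not part of the commit, and unlike the SVE code it uses unfused multiply-adds):

/* Scalar model of the biased-shift quadrant trick (illustration only).
   Assumes round-to-nearest and |x| below the 0x1p20f RangeVal cutoff.  */
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main (void)
{
  const float InvPio2 = 0x1.45f306p-1f; /* 2/pi.  */
  const float Shift = 0x1.800002p+23f;  /* Neon shift 0x1.8p23, plus 1.  */
  float x = 2.5f;

  float r = fabsf (x);
  float q = r * InvPio2 + Shift; /* Biased quadrant (an FMA in the SVE code).  */
  float n = q - Shift;           /* The +1 cancels: n == rintf (r * InvPio2).  */

  /* Bit 0 of q's bit pattern - what the trig instructions see via
     sv_as_u32_f32 (q) - now carries the quadrant parity plus one.  */
  uint32_t qbits;
  memcpy (&qbits, &q, sizeof (qbits));
  printf ("n = %g, rint = %g, q bit0 = %u\n", n, rintf (r * InvPio2),
          (unsigned) (qbits & 1));

  /* Three-step reduction into [-pi/4, pi/4] with the same constants.  */
  r = r + n * -0x1.921fb6p+0f;
  r = r + n * 0x1.777a5cp-25f;
  r = r + n * 0x1.ee59dap-50f;
  printf ("reduced r = %a\n", (double) r);
  return 0;
}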
-rw-r--r--  pl/math/Dir.mk                  |   2
-rw-r--r--  pl/math/include/mathlib.h       |   8
-rw-r--r--  pl/math/sv_cosf_2u1.c           |  75
-rw-r--r--  pl/math/sv_math.h               | 160
-rw-r--r--  pl/math/test/mathbench_funcs.h  |   5
-rwxr-xr-x  pl/math/test/runulp.sh          |  19
-rw-r--r--  pl/math/test/ulp_funcs.h        |   4
-rw-r--r--  pl/math/test/ulp_wrappers.h     |  19
8 files changed, 288 insertions, 4 deletions
diff --git a/pl/math/Dir.mk b/pl/math/Dir.mk
index 4a96dc6..7909ea0 100644
--- a/pl/math/Dir.mk
+++ b/pl/math/Dir.mk
@@ -128,7 +128,7 @@ check-pl/math-rtest: $(math-host-tools) $(math-tools)
 	cat $(math-rtests) | build/pl/bin/rtest | $(EMULATOR) build/pl/bin/mathtest $(math-testflags)
 
 check-pl/math-ulp: $(math-tools)
-	ULPFLAGS="$(math-ulpflags)" build/pl/bin/runulp.sh $(EMULATOR)
+	WANT_SVE_MATH=$(WANT_SVE_MATH) ULPFLAGS="$(math-ulpflags)" build/pl/bin/runulp.sh $(EMULATOR)
 
 check-pl/math: check-pl/math-test check-pl/math-rtest check-pl/math-ulp
 
diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h
index c6620c0..5ed266a 100644
--- a/pl/math/include/mathlib.h
+++ b/pl/math/include/mathlib.h
@@ -92,6 +92,14 @@ __vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t);
 __vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t);
 #endif
+
+#if WANT_SVE_MATH
+#include <arm_sve.h>
+svfloat32_t __sv_cosf_x (svfloat32_t, svbool_t);
+/* SVE ABI names. */
+svfloat32_t _ZGVsMxv_cosf (svfloat32_t, svbool_t);
+#endif
+
 #endif
 #endif
diff --git a/pl/math/sv_cosf_2u1.c b/pl/math/sv_cosf_2u1.c
new file mode 100644
index 0000000..70057ea
--- /dev/null
+++ b/pl/math/sv_cosf_2u1.c
@@ -0,0 +1,75 @@
+/*
+ * Single-precision SVE cos(x) function.
+ *
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#if SV_SUPPORTED
+
+#define NegPio2_1 (sv_f32 (-0x1.921fb6p+0f))
+#define NegPio2_2 (sv_f32 (0x1.777a5cp-25f))
+#define NegPio2_3 (sv_f32 (0x1.ee59dap-50f))
+#define RangeVal (sv_f32 (0x1p20f))
+#define InvPio2 (sv_f32 (0x1.45f306p-1f))
+/* Original shift used in Neon cosf,
+   plus a contribution to set the bit #0 of q
+   as expected by trigonometric instructions. */
+#define Shift (sv_f32 (0x1.800002p+23f))
+#define AbsMask (0x7fffffff)
+
+static NOINLINE sv_f32_t
+__sv_cosf_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp)
+{
+  return sv_call_f32 (cosf, x, y, cmp);
+}
+
+/* A fast SVE implementation of cosf based on trigonometric
+   instructions (FTMAD, FTSSEL, FTSMUL).
+   Maximum measured error: 2.06 ULPs.
+   __sv_cosf(0x1.dea2f2p+19) got 0x1.fffe7ap-6
+			     want 0x1.fffe76p-6. */
+sv_f32_t
+__sv_cosf_x (sv_f32_t x, const svbool_t pg)
+{
+  sv_f32_t n, r, r2, y;
+  svbool_t cmp;
+
+  r = sv_as_f32_u32 (svand_n_u32_x (pg, sv_as_u32_f32 (x), AbsMask));
+  cmp = svcmpge_u32 (pg, sv_as_u32_f32 (r), sv_as_u32_f32 (RangeVal));
+
+  /* n = rint(|x|/(pi/2)). */
+  sv_f32_t q = sv_fma_f32_x (pg, InvPio2, r, Shift);
+  n = svsub_f32_x (pg, q, Shift);
+
+  /* r = |x| - n*(pi/2)  (range reduction into -pi/4 .. pi/4). */
+  r = sv_fma_f32_x (pg, NegPio2_1, n, r);
+  r = sv_fma_f32_x (pg, NegPio2_2, n, r);
+  r = sv_fma_f32_x (pg, NegPio2_3, n, r);
+
+  /* Final multiplicative factor: 1.0 or x depending on bit #0 of q. */
+  sv_f32_t f = svtssel_f32 (r, sv_as_u32_f32 (q));
+
+  /* cos(r) poly approx. */
+  r2 = svtsmul_f32 (r, sv_as_u32_f32 (q));
+  y = sv_f32 (0.0f);
+  y = svtmad_f32 (y, r2, 4);
+  y = svtmad_f32 (y, r2, 3);
+  y = svtmad_f32 (y, r2, 2);
+  y = svtmad_f32 (y, r2, 1);
+  y = svtmad_f32 (y, r2, 0);
+
+  /* Apply factor. */
+  y = svmul_f32_x (pg, f, y);
+
+  /* No need to pass pg to specialcase here since cmp is a strict subset,
+     guaranteed by the cmpge above. */
+  if (unlikely (svptest_any (pg, cmp)))
+    return __sv_cosf_specialcase (x, y, cmp);
+  return y;
+}
+
+strong_alias (__sv_cosf_x, _ZGVsMxv_cosf)
+
+#endif
diff --git a/pl/math/sv_math.h b/pl/math/sv_math.h
new file mode 100644
index 0000000..14919be
--- /dev/null
+++ b/pl/math/sv_math.h
@@ -0,0 +1,160 @@
+/*
+ * Wrapper functions for SVE ACLE.
+ *
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef SV_MATH_H
+#define SV_MATH_H
+
+#ifndef WANT_VMATH
+/* Enable the build of vector math code. */
+#define WANT_VMATH 1
+#endif
+#if WANT_VMATH
+
+#if WANT_SVE_MATH
+#define SV_SUPPORTED 1
+
+#include <arm_sve.h>
+#include <stdbool.h>
+
+#include "math_config.h"
+
+typedef float f32_t;
+typedef uint32_t u32_t;
+typedef int32_t s32_t;
+typedef double f64_t;
+typedef uint64_t u64_t;
+typedef int64_t s64_t;
+
+typedef svfloat64_t sv_f64_t;
+typedef svuint64_t sv_u64_t;
+typedef svint64_t sv_s64_t;
+
+typedef svfloat32_t sv_f32_t;
+typedef svuint32_t sv_u32_t;
+typedef svint32_t sv_s32_t;
+
+/* Double precision. */
+static inline sv_s64_t
+sv_s64 (s64_t x)
+{
+  return svdup_n_s64 (x);
+}
+
+static inline sv_u64_t
+sv_u64 (u64_t x)
+{
+  return svdup_n_u64 (x);
+}
+
+static inline sv_f64_t
+sv_f64 (f64_t x)
+{
+  return svdup_n_f64 (x);
+}
+
+static inline sv_f64_t
+sv_fma_f64_x (svbool_t pg, sv_f64_t x, sv_f64_t y, sv_f64_t z)
+{
+  return svmla_f64_x (pg, z, x, y);
+}
+
+/* res = z + x * y with x scalar. */
+static inline sv_f64_t
+sv_fma_n_f64_x (svbool_t pg, f64_t x, sv_f64_t y, sv_f64_t z)
+{
+  return svmla_n_f64_x (pg, z, y, x);
+}
+
+static inline sv_u64_t
+sv_as_u64_f64 (sv_f64_t x)
+{
+  return svreinterpret_u64_f64 (x);
+}
+
+static inline sv_f64_t
+sv_as_f64_u64 (sv_u64_t x)
+{
+  return svreinterpret_f64_u64 (x);
+}
+
+static inline sv_f64_t
+sv_call_f64 (f64_t (*f) (f64_t), sv_f64_t x, sv_f64_t y, svbool_t cmp)
+{
+  svbool_t p = svpfirst (cmp, svpfalse ());
+  while (svptest_any (cmp, p))
+    {
+      f64_t elem = svclastb_n_f64 (p, 0, x);
+      elem = (*f) (elem);
+      sv_f64_t y2 = svdup_n_f64 (elem);
+      y = svsel_f64 (p, y2, y);
+      p = svpnext_b64 (cmp, p);
+    }
+  return y;
+}
+
+/* Single precision. */
+static inline sv_s32_t
+sv_s32 (s32_t x)
+{
+  return svdup_n_s32 (x);
+}
+
+static inline sv_u32_t
+sv_u32 (u32_t x)
+{
+  return svdup_n_u32 (x);
+}
+
+static inline sv_f32_t
+sv_f32 (f32_t x)
+{
+  return svdup_n_f32 (x);
+}
+
+static inline sv_f32_t
+sv_fma_f32_x (svbool_t pg, sv_f32_t x, sv_f32_t y, sv_f32_t z)
+{
+  return svmla_f32_x (pg, z, x, y);
+}
+
+/* res = z + x * y with x scalar. */
+static inline sv_f32_t
+sv_fma_n_f32_x (svbool_t pg, f32_t x, sv_f32_t y, sv_f32_t z)
+{
+  return svmla_n_f32_x (pg, z, y, x);
+}
+
+static inline sv_u32_t
+sv_as_u32_f32 (sv_f32_t x)
+{
+  return svreinterpret_u32_f32 (x);
+}
+
+static inline sv_f32_t
+sv_as_f32_u32 (sv_u32_t x)
+{
+  return svreinterpret_f32_u32 (x);
+}
+
+static inline sv_f32_t
+sv_call_f32 (f32_t (*f) (f32_t), sv_f32_t x, sv_f32_t y, svbool_t cmp)
+{
+  svbool_t p = svpfirst (cmp, svpfalse ());
+  while (svptest_any (cmp, p))
+    {
+      f32_t elem = svclastb_n_f32 (p, 0, x);
+      elem = (*f) (elem);
+      sv_f32_t y2 = svdup_n_f32 (elem);
+      y = svsel_f32 (p, y2, y);
+      p = svpnext_b32 (cmp, p);
+    }
+  return y;
+}
+
+#endif
+#endif
+#endif
diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h
index f681489..5b6a806 100644
--- a/pl/math/test/mathbench_funcs.h
+++ b/pl/math/test/mathbench_funcs.h
@@ -8,6 +8,7 @@
 F (asinhf, -10.0, 10.0)
 F (atanf, -10.0, 10.0)
 {"atan2f", 'f', 0, -10.0, 10.0, {.f = atan2f_wrap}},
+F (cosf, -3.1, 3.1)
 F (erfcf, -4.0, 10.0)
 F (erff, -4.0, 4.0)
 F (log10f, 0.01, 11.1)
@@ -85,5 +86,9 @@ VNF (__vn_log1pf, -0.9, 10.0)
 VNF (_ZGVnN4v_log1pf, -0.9, 10.0)
 #endif
 #endif
+#if WANT_SVE_MATH
+SVF (__sv_cosf_x, -3.1, 3.1)
+SVF (_ZGVsMxv_cosf, -3.1, 3.1)
+#endif
 #endif
 // clang-format on
diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh
index 900d7d2..8af5d1c 100755
--- a/pl/math/test/runulp.sh
+++ b/pl/math/test/runulp.sh
@@ -15,6 +15,9 @@ rmodes='n'
 flags="${ULPFLAGS:--q}"
 emu="$@"
 
+# Enable SVE testing
+WANT_SVE_MATH=${WANT_SVE_MATH:-0}
+
 FAIL=0
 PASS=0
 
@@ -132,6 +135,10 @@ runv=
 check __v_log10f 1 && runv=1
 runvn=
 check __vn_log10f 1 && runvn=1
+runsv=
+if [ $WANT_SVE_MATH -eq 1 ]; then
+check __sv_cosf 0 && runsv=1
+fi
 
 range_erfc='
  0 0xffff0000 10000
@@ -234,6 +241,11 @@ range_asinhf='
 -0x1p11 -inf 20000
 '
 
+range_sve_cosf='
+ 0 0xffff0000 10000
+ 0x1p-4 0x1p4 500000
+'
+
 # error limits
 L_erfc=3.7
 L_erfcf=1.0
@@ -248,6 +260,8 @@ L_atanf=3.0
 L_log1pf=2.0
 L_asinhf=2.2
 
+L_sve_cosf=1.6
+
 while read G F R
 do
 	[ "$R" = 1 ] || continue
@@ -314,6 +328,11 @@ asinhf __s_asinhf $runs
 asinhf __v_asinhf $runv
 asinhf __vn_asinhf $runvn
 asinhf _ZGVnN4v_asinhf $runvn
+
+if [ $WANT_SVE_MATH -eq 1 ]; then
+sve_cosf __sv_cosf $runsv
+sve_cosf _ZGVsMxv_cosf $runsv
+fi
 EOF
 
 [ 0 -eq $FAIL ] || {
diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h
index 3a4e6b9..d92b3b5 100644
--- a/pl/math/test/ulp_funcs.h
+++ b/pl/math/test/ulp_funcs.h
@@ -68,4 +68,8 @@ F (_ZGVnN2v_log10, Z_log10, log10l, mpfr_log10, 1, 0, d1, 1)
 F (_ZGVnN4v_log1pf, Z_log1pf, log1p, mpfr_log1p, 1, 1, f1, 1)
 #endif
 #endif
+#if WANT_SVE_MATH
+SVF1 (cos)
+ZSVF1 (cos)
+#endif
 #endif
diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h
index 06f94e5..df25bf1 100644
--- a/pl/math/test/ulp_wrappers.h
+++ b/pl/math/test/ulp_wrappers.h
@@ -6,10 +6,15 @@
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
-
 #if USE_MPFR
-static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_cos(y,x,r); return mpfr_sin(y,x,r); }
-static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y,x,r); return mpfr_cos(y,x,r); }
+static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) {
+  mpfr_cos(y, x, r);
+  return mpfr_sin(y, x, r);
+}
+static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) {
+  mpfr_sin(y, x, r);
+  return mpfr_cos(y, x, r);
+}
 #endif
 
 /* Wrappers for vector functions. */
@@ -53,5 +58,13 @@ static double Z_erf(double x) { return _ZGVnN2v_erf(argd(x))[0]; }
 static double Z_erfc(double x) { return _ZGVnN2v_erfc(argd(x))[0]; }
 static double Z_log10(double x) { return _ZGVnN2v_log10(argd(x))[0]; }
 #endif
+#if WANT_SVE_MATH
+static float sv_cosf(float x) {
+  return svretf(__sv_cosf_x(svargf(x), svptrue_b32()));
+}
+static float Z_sv_cosf(float x) {
+  return svretf(_ZGVsMxv_cosf(svargf(x), svptrue_b32()));
+}
+#endif
 #endif
 // clang-format on
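As a usage sketch (not part of the commit), the new routine can be called through the SVE ABI name declared in mathlib.h. The buffer loop, the cosf_buffer name, and the build setup (SVE-enabled toolchain, WANT_SVE_MATH defined) are assumptions for illustration:

/* Apply the SVE cosf to a buffer, predicating the tail iteration.
   Illustrative only; build with an SVE-capable compiler/target.  */
#include <arm_sve.h>
#include <stddef.h>
#include <stdint.h>

/* Prototype as declared in pl/math/include/mathlib.h.  */
svfloat32_t _ZGVsMxv_cosf (svfloat32_t, svbool_t);

void
cosf_buffer (const float *in, float *out, size_t n)
{
  for (size_t i = 0; i < n; i += svcntw ())
    {
      svbool_t pg = svwhilelt_b32_u64 (i, n);
      svfloat32_t x = svld1_f32 (pg, in + i);
      svst1_f32 (pg, out + i, _ZGVsMxv_cosf (x, pg));
    }
}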