diff options
-rw-r--r-- | math/include/mathlib.h | 42 | ||||
-rw-r--r-- | math/s_exp.c | 6 | ||||
-rw-r--r-- | math/s_expf.c | 6 | ||||
-rw-r--r-- | math/s_expf_1u.c | 6 | ||||
-rw-r--r-- | math/test/mathbench.c | 242 | ||||
-rwxr-xr-x | math/test/runulp.sh | 64 | ||||
-rw-r--r-- | math/test/ulp.c | 64 | ||||
-rw-r--r-- | math/test/ulp.h | 27 | ||||
-rw-r--r-- | math/tools/v_exp.sollya | 30 | ||||
-rw-r--r-- | math/v_exp.c | 94 | ||||
-rw-r--r-- | math/v_exp.h | 12 | ||||
-rw-r--r-- | math/v_exp_data.c | 401 | ||||
-rw-r--r-- | math/v_expf.c | 75 | ||||
-rw-r--r-- | math/v_expf_1u.c | 72 | ||||
-rw-r--r-- | math/v_math.h | 604 | ||||
-rw-r--r-- | math/vn_exp.c | 12 | ||||
-rw-r--r-- | math/vn_expf.c | 12 | ||||
-rw-r--r-- | math/vn_expf_1u.c | 11 |
18 files changed, 1749 insertions, 31 deletions
diff --git a/math/include/mathlib.h b/math/include/mathlib.h index eed294b..6d62610 100644 --- a/math/include/mathlib.h +++ b/math/include/mathlib.h @@ -1,10 +1,13 @@ /* * Public API. * - * Copyright (c) 2015-2018, Arm Limited. + * Copyright (c) 2015-2019, Arm Limited. * SPDX-License-Identifier: MIT */ +#ifndef _MATHLIB_H +#define _MATHLIB_H + float expf (float); float exp2f (float); float logf (float); @@ -19,3 +22,40 @@ double exp2 (double); double log (double); double log2 (double); double pow (double, double); + +/* Scalar functions using the vector algorithm with identical result. */ +float __s_expf (float); +float __s_expf_1u (float); +double __s_exp (double); + +#if __aarch64__ +#if __GNUC__ >= 5 +typedef __Float32x4_t __f32x4_t; +typedef __Float64x2_t __f64x2_t; +#elif __clang_major__*100+__clang_minor__ >= 305 +typedef __attribute__((__neon_vector_type__(4))) float __f32x4_t; +typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t; +#else +#error Unsupported compiler +#endif + +/* Vector functions following the base PCS. */ +__f32x4_t __v_expf (__f32x4_t); +__f32x4_t __v_expf_1u (__f32x4_t); +__f64x2_t __v_exp (__f64x2_t); + +#if __GNUC__ >= 9 || __clang_major__ >= 8 +#define __vpcs __attribute__((__aarch64_vector_pcs__)) + +/* Vector functions following the vector PCS. */ +__vpcs __f32x4_t __vn_expf (__f32x4_t); +__vpcs __f32x4_t __vn_expf_1u (__f32x4_t); +__vpcs __f64x2_t __vn_exp (__f64x2_t); + +/* Vector functions following the vector PCS using ABI names. */ +__vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t); +#endif +#endif + +#endif diff --git a/math/s_exp.c b/math/s_exp.c new file mode 100644 index 0000000..ac7246b --- /dev/null +++ b/math/s_exp.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#define SCALAR 1 +#include "v_exp.c" diff --git a/math/s_expf.c b/math/s_expf.c new file mode 100644 index 0000000..3492c46 --- /dev/null +++ b/math/s_expf.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#define SCALAR 1 +#include "v_expf.c" diff --git a/math/s_expf_1u.c b/math/s_expf_1u.c new file mode 100644 index 0000000..eb7bbcb --- /dev/null +++ b/math/s_expf_1u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#define SCALAR 1 +#include "v_expf_1u.c" diff --git a/math/test/mathbench.c b/math/test/mathbench.c index 1266d11..be7e439 100644 --- a/math/test/mathbench.c +++ b/math/test/mathbench.c @@ -29,6 +29,50 @@ static float Af[N]; static long measurecount = MEASURE; static long itercount = ITER; +#if __aarch64__ +typedef __f64x2_t v_double; + +#define v_double_len() 2 + +static inline v_double +v_double_load (const double *p) +{ + return (v_double){p[0], p[1]}; +} + +static inline v_double +v_double_dup (double x) +{ + return (v_double){x, x}; +} + +typedef __f32x4_t v_float; + +#define v_float_len() 4 + +static inline v_float +v_float_load (const float *p) +{ + return (v_float){p[0], p[1], p[2], p[3]}; +} + +static inline v_float +v_float_dup (float x) +{ + return (v_float){x, x, x, x}; +} +#else +/* dummy definitions to make things compile. */ +typedef double v_double; +typedef float v_float; +#define v_double_len(x) 1 +#define v_double_load(x) (x)[0] +#define v_double_dup(x) (x) +#define v_float_len(x) 1 +#define v_float_load(x) (x)[0] +#define v_float_dup(x) (x) +#endif + static double dummy (double x) { @@ -41,6 +85,32 @@ dummyf (float x) return x; } +static v_double +__v_dummy (v_double x) +{ + return x; +} + +static v_float +__v_dummyf (v_float x) +{ + return x; +} + +#ifdef __vpcs +__vpcs static v_double +__vn_dummy (v_double x) +{ + return x; +} + +__vpcs static v_float +__vn_dummyf (v_float x) +{ + return x; +} +#endif + static double xypow (double x) { @@ -89,42 +159,56 @@ static const struct fun { const char *name; int prec; + int vec; double lo; double hi; union { double (*d) (double); float (*f) (float); + v_double (*vd) (v_double); + v_float (*vf) (v_float); +#ifdef __vpcs + __vpcs v_double (*vnd) (v_double); + __vpcs v_float (*vnf) (v_float); +#endif } fun; } funtab[] = { -#define D(func, lo, hi) {#func, 'd', lo, hi, {.d = func}}, -#define F(func, lo, hi) {#func, 'f', lo, hi, {.f = func}}, +#define D(func, lo, hi) {#func, 'd', 0, lo, hi, {.d = func}}, +#define F(func, lo, hi) {#func, 'f', 0, lo, hi, {.f = func}}, +#define VD(func, lo, hi) {#func, 'd', 'v', lo, hi, {.vd = func}}, +#define VF(func, lo, hi) {#func, 'f', 'v', lo, hi, {.vf = func}}, +#define VND(func, lo, hi) {#func, 'd', 'n', lo, hi, {.vnd = func}}, +#define VNF(func, lo, hi) {#func, 'f', 'n', lo, hi, {.vnf = func}}, D (dummy, 1.0, 2.0) D (exp, -9.9, 9.9) D (exp, 0.5, 1.0) +D (__s_exp, -9.9, 9.9) D (exp2, -9.9, 9.9) D (log, 0.01, 11.1) D (log, 0.999, 1.001) D (log2, 0.01, 11.1) D (log2, 0.999, 1.001) -{"pow", 'd', 0.01, 11.1, {.d = xypow}}, +{"pow", 'd', 0, 0.01, 11.1, {.d = xypow}}, D (xpow, 0.01, 11.1) D (ypow, -9.9, 9.9) F (dummyf, 1.0, 2.0) F (expf, -9.9, 9.9) +F (__s_expf, -9.9, 9.9) +F (__s_expf_1u, -9.9, 9.9) F (exp2f, -9.9, 9.9) F (logf, 0.01, 11.1) F (log2f, 0.01, 11.1) -{"powf", 'f', 0.01, 11.1, {.f = xypowf}}, +{"powf", 'f', 0, 0.01, 11.1, {.f = xypowf}}, F (xpowf, 0.01, 11.1) F (ypowf, -9.9, 9.9) -{"sincosf", 'f', 0.1, 0.7, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0.8, 3.1, {.f = sincosf_wrap}}, -{"sincosf", 'f', -3.1, 3.1, {.f = sincosf_wrap}}, -{"sincosf", 'f', 3.3, 33.3, {.f = sincosf_wrap}}, -{"sincosf", 'f', 100, 1000, {.f = sincosf_wrap}}, -{"sincosf", 'f', 1e6, 1e32, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, 0.1, 0.7, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, 0.8, 3.1, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, -3.1, 3.1, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, 3.3, 33.3, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, 100, 1000, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, 1e6, 1e32, {.f = sincosf_wrap}}, F (sinf, 0.1, 0.7) F (sinf, 0.8, 3.1) F (sinf, -3.1, 3.1) @@ -137,9 +221,29 @@ F (cosf, -3.1, 3.1) F (cosf, 3.3, 33.3) F (cosf, 100, 1000) F (cosf, 1e6, 1e32) +#if __aarch64__ +VD (__v_dummy, 1.0, 2.0) +VD (__v_exp, -9.9, 9.9) +VF (__v_dummyf, 1.0, 2.0) +VF (__v_expf, -9.9, 9.9) +VF (__v_expf_1u, -9.9, 9.9) +#ifdef __vpcs +VND (__vn_dummy, 1.0, 2.0) +VND (__vn_exp, -9.9, 9.9) +VND (_ZGVnN2v_exp, -9.9, 9.9) +VNF (__vn_dummyf, 1.0, 2.0) +VNF (__vn_expf, -9.9, 9.9) +VNF (_ZGVnN4v_expf, -9.9, 9.9) +VNF (__vn_expf_1u, -9.9, 9.9) +#endif +#endif {0}, #undef F #undef D +#undef VF +#undef VD +#undef VNF +#undef VND }; static void @@ -238,6 +342,72 @@ runf_latency (float f (float)) prev = f (Af[i] + prev * z); } +static void +run_v_thruput (v_double f (v_double)) +{ + for (int i = 0; i < N; i += v_double_len ()) + f (v_double_load (A+i)); +} + +static void +runf_v_thruput (v_float f (v_float)) +{ + for (int i = 0; i < N; i += v_float_len ()) + f (v_float_load (Af+i)); +} + +static void +run_v_latency (v_double f (v_double)) +{ + v_double z = v_double_dup (zero); + v_double prev = z; + for (int i = 0; i < N; i += v_double_len ()) + prev = f (v_double_load (A+i) + prev * z); +} + +static void +runf_v_latency (v_float f (v_float)) +{ + v_float z = v_float_dup (zero); + v_float prev = z; + for (int i = 0; i < N; i += v_float_len ()) + prev = f (v_float_load (Af+i) + prev * z); +} + +#ifdef __vpcs +static void +run_vn_thruput (__vpcs v_double f (v_double)) +{ + for (int i = 0; i < N; i += v_double_len ()) + f (v_double_load (A+i)); +} + +static void +runf_vn_thruput (__vpcs v_float f (v_float)) +{ + for (int i = 0; i < N; i += v_float_len ()) + f (v_float_load (Af+i)); +} + +static void +run_vn_latency (__vpcs v_double f (v_double)) +{ + v_double z = v_double_dup (zero); + v_double prev = z; + for (int i = 0; i < N; i += v_double_len ()) + prev = f (v_double_load (A+i) + prev * z); +} + +static void +runf_vn_latency (__vpcs v_float f (v_float)) +{ + v_float z = v_float_dup (zero); + v_float prev = z; + for (int i = 0; i < N; i += v_float_len ()) + prev = f (v_float_load (Af+i) + prev * z); +} +#endif + static uint64_t tic (void) { @@ -267,20 +437,54 @@ bench1 (const struct fun *f, int type, double lo, double hi) uint64_t dt = 0; uint64_t ns100; const char *s = type == 't' ? "rthruput" : "latency"; + int vlen = 1; - if (f->prec == 'd' && type == 't') + if (f->vec && f->prec == 'd') + vlen = v_double_len(); + else if (f->vec && f->prec == 'f') + vlen = v_float_len(); + + if (f->prec == 'd' && type == 't' && f->vec == 0) TIMEIT (run_thruput, f->fun.d); - else if (f->prec == 'd' && type == 'l') + else if (f->prec == 'd' && type == 'l' && f->vec == 0) TIMEIT (run_latency, f->fun.d); - else if (f->prec == 'f' && type == 't') + else if (f->prec == 'f' && type == 't' && f->vec == 0) TIMEIT (runf_thruput, f->fun.f); - else if (f->prec == 'f' && type == 'l') + else if (f->prec == 'f' && type == 'l' && f->vec == 0) TIMEIT (runf_latency, f->fun.f); - - ns100 = (100 * dt + itercount * N / 2) / (itercount * N); - printf ("%7s %8s: %4u.%02u ns/elem %10llu ns in [%g %g]\n", f->name, s, - (unsigned) (ns100 / 100), (unsigned) (ns100 % 100), - (unsigned long long) dt, lo, hi); + else if (f->prec == 'd' && type == 't' && f->vec == 'v') + TIMEIT (run_v_thruput, f->fun.vd); + else if (f->prec == 'd' && type == 'l' && f->vec == 'v') + TIMEIT (run_v_latency, f->fun.vd); + else if (f->prec == 'f' && type == 't' && f->vec == 'v') + TIMEIT (runf_v_thruput, f->fun.vf); + else if (f->prec == 'f' && type == 'l' && f->vec == 'v') + TIMEIT (runf_v_latency, f->fun.vf); +#ifdef __vpcs + else if (f->prec == 'd' && type == 't' && f->vec == 'n') + TIMEIT (run_vn_thruput, f->fun.vnd); + else if (f->prec == 'd' && type == 'l' && f->vec == 'n') + TIMEIT (run_vn_latency, f->fun.vnd); + else if (f->prec == 'f' && type == 't' && f->vec == 'n') + TIMEIT (runf_vn_thruput, f->fun.vnf); + else if (f->prec == 'f' && type == 'l' && f->vec == 'n') + TIMEIT (runf_vn_latency, f->fun.vnf); +#endif + + if (type == 't') + { + ns100 = (100 * dt + itercount * N / 2) / (itercount * N); + printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g]\n", f->name, s, + (unsigned) (ns100 / 100), (unsigned) (ns100 % 100), + (unsigned long long) dt, lo, hi); + } + else if (type == 'l') + { + ns100 = (100 * dt + itercount * N / vlen / 2) / (itercount * N / vlen); + printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g]\n", f->name, s, + (unsigned) (ns100 / 100), (unsigned) (ns100 % 100), + (unsigned long long) dt, lo, hi); + } fflush (stdout); } diff --git a/math/test/runulp.sh b/math/test/runulp.sh index 64db6f3..b2c1827 100755 --- a/math/test/runulp.sh +++ b/math/test/runulp.sh @@ -24,6 +24,10 @@ t() { $emu ./ulp -r $r -e $Lt $flags "$@" && PASS=$((PASS+1)) || FAIL=$((FAIL+1)) } +check() { + $emu ./ulp -f -q "$@" >/dev/null +} + Ldir=0.5 for r in $rmodes do @@ -87,6 +91,66 @@ t powf 0x1.ep-1 0x1.1p0 x 0x1p8 0x1p14 50000 t powf 0x1.ep-1 0x1.1p0 x -0x1p8 -0x1p14 50000 done +# vector functions +Ldir=0.5 +r='n' +flags="${ULPFLAGS:--q} -f" +runv= +check __v_exp 1 && runv=1 +runvn= +check __vn_exp 1 && runvn=1 + +range_exp=' + 0 0xffff000000000000 10000 + 0x1p-6 0x1p6 400000 + -0x1p-6 -0x1p6 400000 + 633.3 733.3 10000 + -633.3 -777.3 10000 +' + +range_expf=' + 0 0xffff0000 10000 + 0x1p-14 0x1p8 500000 + -0x1p-14 -0x1p8 500000 +' + +range_expf_1u="$range_expf" + +# error limits +L_exp=1.9 +L_expf=1.49 +L_expf_1u=0.4 + +# group symbol run +echo " +exp __s_exp 1 +exp __v_exp $runv +exp __vn_exp $runvn +exp _ZGVnN2v_exp $runvn + +expf __s_expf 1 +expf __v_expf $runv +expf __vn_expf $runvn +expf _ZGVnN4v_expf $runvn + +expf_1u __s_expf_1u 1 +expf_1u __v_expf_1u $runv +expf_1u __vn_expf_1u $runvn + +" | while read G F R +do + [ "$R" = 1 ] || continue + case "$G" in \#*) continue ;; esac + eval range="\${range_$G}" + eval L="\${L_$G}" + echo "$range" |while read X + do + [ -n "$X" ] || continue + case "$X" in \#*) continue ;; esac + t $F $X + done +done + [ 0 -eq $FAIL ] || { echo "FAILED $FAIL PASSED $PASS" exit 1 diff --git a/math/test/ulp.c b/math/test/ulp.c index cd15dec..aec8aa5 100644 --- a/math/test/ulp.c +++ b/math/test/ulp.c @@ -13,6 +13,7 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> +#include "mathlib.h" /* Don't depend on mpfr by default. */ #ifndef USE_MPFR @@ -208,11 +209,38 @@ struct conf double errlim; }; +/* A bit of a hack: call vector functions twice with the same + input in lane 0 but a different value in other lanes: once + with an in-range value and then with a special case value. */ +static int secondcall; + +/* Wrappers for vector functions. */ +#if __aarch64__ +typedef __f32x4_t v_float; +typedef __f64x2_t v_double; +static const float fv[2] = {1.0f, -INFINITY}; +static const double dv[2] = {1.0, -INFINITY}; +static inline v_float argf(float x) { return (v_float){x,x,x,fv[secondcall]}; } +static inline v_double argd(double x) { return (v_double){x,dv[secondcall]}; } + +static float v_expf_1u(float x) { return __v_expf_1u(argf(x))[0]; } +static float v_expf(float x) { return __v_expf(argf(x))[0]; } +static double v_exp(double x) { return __v_exp(argd(x))[0]; } +#ifdef __vpcs +static float vn_expf_1u(float x) { return __vn_expf_1u(argf(x))[0]; } +static float vn_expf(float x) { return __vn_expf(argf(x))[0]; } +static double vn_exp(double x) { return __vn_exp(argd(x))[0]; } +static float Z_expf(float x) { return _ZGVnN4v_expf(argf(x))[0]; } +static double Z_exp(double x) { return _ZGVnN2v_exp(argd(x))[0]; } +#endif +#endif + struct fun { const char *name; int arity; int singleprec; + int twice; union { float (*f1) (float); @@ -240,16 +268,16 @@ struct fun static const struct fun fun[] = { #if USE_MPFR -# define F(x, x_long, x_mpfr, a, s, t) \ - {#x, a, s, {.t = x}, {.t = x_long}, {.t = x_mpfr}}, +# define F(x, x_wrap, x_long, x_mpfr, a, s, t, twice) \ + {#x, a, s, twice, {.t = x_wrap}, {.t = x_long}, {.t = x_mpfr}}, #else -# define F(x, x_long, x_mpfr, a, s, t) \ - {#x, a, s, {.t = x}, {.t = x_long}}, +# define F(x, x_wrap, x_long, x_mpfr, a, s, t, twice) \ + {#x, a, s, twice, {.t = x_wrap}, {.t = x_long}}, #endif -#define F1(x) F (x##f, x, mpfr_##x, 1, 1, f1) -#define F2(x) F (x##f, x, mpfr_##x, 2, 1, f2) -#define D1(x) F (x, x##l, mpfr_##x, 1, 0, d1) -#define D2(x) F (x, x##l, mpfr_##x, 2, 0, d2) +#define F1(x) F (x##f, x##f, x, mpfr_##x, 1, 1, f1, 0) +#define F2(x) F (x##f, x##f, x, mpfr_##x, 2, 1, f2, 0) +#define D1(x) F (x, x, x##l, mpfr_##x, 1, 0, d1, 0) +#define D2(x) F (x, x, x##l, mpfr_##x, 2, 0, d2, 0) F1 (sin) F1 (cos) F1 (exp) @@ -262,6 +290,26 @@ static const struct fun fun[] = { D1 (log) D1 (log2) D2 (pow) + F (__s_expf_1u, __s_expf_1u, exp, mpfr_exp, 1, 1, f1, 0) + F (__s_expf, __s_expf, exp, mpfr_exp, 1, 1, f1, 0) + F (__s_exp, __s_exp, expl, mpfr_exp, 1, 0, d1, 0) +#if __aarch64__ + F (__v_expf_1u, v_expf_1u, exp, mpfr_exp, 1, 1, f1, 1) + F (__v_expf, v_expf, exp, mpfr_exp, 1, 1, f1, 1) + F (__v_exp, v_exp, expl, mpfr_exp, 1, 0, d1, 1) +#ifdef __vpcs + F (__vn_expf_1u, vn_expf_1u, exp, mpfr_exp, 1, 1, f1, 1) + F (__vn_expf, vn_expf, exp, mpfr_exp, 1, 1, f1, 1) + F (__vn_exp, vn_exp, expl, mpfr_exp, 1, 0, d1, 1) + F (_ZGVnN4v_expf, Z_expf, exp, mpfr_exp, 1, 1, f1, 1) + F (_ZGVnN2v_exp, Z_exp, expl, mpfr_exp, 1, 0, d1, 1) +#endif +#endif +#undef F +#undef F1 +#undef F2 +#undef D1 +#undef D2 {0}}; /* Boilerplate for generic calls. */ diff --git a/math/test/ulp.h b/math/test/ulp.h index 1f2fe98..a0c3016 100644 --- a/math/test/ulp.h +++ b/math/test/ulp.h @@ -257,11 +257,29 @@ static int T(cmp) (const struct fun *f, struct gen *gen, struct RT(ret) want; struct T(args) a = T(next) (gen); int exgot; + int exgot2; RT(float) ygot; + RT(float) ygot2; + int fail = 0; if (fenv) T(call_fenv) (f, a, r, &ygot, &exgot); else T(call_nofenv) (f, a, r, &ygot, &exgot); + if (f->twice) { + secondcall = 1; + if (fenv) + T(call_fenv) (f, a, r, &ygot2, &exgot2); + else + T(call_nofenv) (f, a, r, &ygot2, &exgot2); + secondcall = 0; + if (RT(asuint) (ygot) != RT(asuint) (ygot2)) + { + fail = 1; + cntfail++; + T(printcall) (f, a); + printf (" got %a then %a for same input\n", ygot, ygot2); + } + } cnt++; int ok = use_mpfr ? T(call_mpfr_fix) (f, a, r, &want, ygot, exgot) @@ -270,7 +288,6 @@ static int T(cmp) (const struct fun *f, struct gen *gen, if (!ok) { int print = 0; - int fail = 0; double err = RT(ulperr) (ygot, &want, r); double abserr = fabs (err); // TODO: count errors below accuracy limit. @@ -280,8 +297,12 @@ static int T(cmp) (const struct fun *f, struct gen *gen, cnt2++; if (abserr > conf->errlim) { - print = fail = 1; - cntfail++; + print = 1; + if (!fail) + { + fail = 1; + cntfail++; + } } if (abserr > maxerr) { diff --git a/math/tools/v_exp.sollya b/math/tools/v_exp.sollya new file mode 100644 index 0000000..c0abb63 --- /dev/null +++ b/math/tools/v_exp.sollya @@ -0,0 +1,30 @@ +// polynomial for approximating e^x +// +// Copyright (c) 2019, Arm Limited. +// SPDX-License-Identifier: MIT + +deg = 4; // poly degree +N = 128; // table entries +b = log(2)/(2*N); // interval +a = -b; + +// find polynomial with minimal abs error + +// return p that minimizes |exp(x) - poly(x) - x^d*p(x)| +approx = proc(poly,d) { + return remez(exp(x)-poly(x), deg-d, [a;b], x^d, 1e-10); +}; + +// first 2 coeffs are fixed, iteratively find optimal double prec coeffs +poly = 1 + x; +for i from 2 to deg do { + p = roundcoefficients(approx(poly,i), [|D ...|]); + poly = poly + x^i*coeff(p,0); +}; + +display = hexadecimal; +print("rel error:", accurateinfnorm(1-poly(x)/exp(x), [a;b], 30)); +print("abs error:", accurateinfnorm(exp(x)-poly(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 0 to deg do coeff(poly,i); diff --git a/math/v_exp.c b/math/v_exp.c new file mode 100644 index 0000000..e459d53 --- /dev/null +++ b/math/v_exp.c @@ -0,0 +1,94 @@ +/* + * Double-precision vector e^x function. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "mathlib.h" +#include "v_math.h" +#if V_SUPPORTED +#include "v_exp.h" + +#if V_EXP_TABLE_BITS == 7 +/* maxerr: 1.88 +0.5 ulp + rel error: 1.4337*2^-53 + abs error: 1.4299*2^-53 in [ -ln2/256, ln2/256 ]. */ +#define C1 v_f64 (0x1.ffffffffffd43p-2) +#define C2 v_f64 (0x1.55555c75adbb2p-3) +#define C3 v_f64 (0x1.55555da646206p-5) +#define InvLn2 v_f64 (0x1.71547652b82fep7) /* N/ln2. */ +#define Ln2hi v_f64 (0x1.62e42fefa39efp-8) /* ln2/N. */ +#define Ln2lo v_f64 (0x1.abc9e3b39803f3p-63) +#elif V_EXP_TABLE_BITS == 8 +/* maxerr: 0.54 +0.5 ulp + rel error: 1.4318*2^-58 + abs error: 1.4299*2^-58 in [ -ln2/512, ln2/512 ]. */ +#define C1 v_f64 (0x1.fffffffffffd4p-2) +#define C2 v_f64 (0x1.5555571d6b68cp-3) +#define C3 v_f64 (0x1.5555576a59599p-5) +#define InvLn2 v_f64 (0x1.71547652b82fep8) +#define Ln2hi v_f64 (0x1.62e42fefa39efp-9) +#define Ln2lo v_f64 (0x1.abc9e3b39803f3p-64) +#endif + +#define N (1 << V_EXP_TABLE_BITS) +#define Tab __v_exp_data +#define IndexMask v_u64 (N - 1) +#define Shift v_f64 (0x1.8p+52) +#define Thres v_f64 (704.0) + +VPCS_ATTR +static v_f64_t +specialcase (v_f64_t s, v_f64_t y, v_f64_t n) +{ + v_f64_t absn = v_abs_f64 (n); + + /* 2^(n/N) may overflow, break it up into s1*s2. */ + v_u64_t b = v_cond_u64 (n <= v_f64 (0.0)) & v_u64 (0x6000000000000000); + v_f64_t s1 = v_as_f64_u64 (v_u64 (0x7000000000000000) - b); + v_f64_t s2 = v_as_f64_u64 (v_as_u64_f64 (s) - v_u64 (0x3010000000000000) + b); + v_u64_t cmp = v_cond_u64 (absn > v_f64 (1280.0 * N)); + v_f64_t r1 = s1 * s1; + v_f64_t r0 = v_fma_f64 (y, s2, s2) * s1; + return v_as_f64_u64 ((cmp & v_as_u64_f64 (r1)) | (~cmp & v_as_u64_f64 (r0))); +} + +VPCS_ATTR +v_f64_t +V_NAME(exp) (v_f64_t x) +{ + v_f64_t n, r, r2, s, y, z; + v_u64_t cmp, u, e, i; + + cmp = v_cond_u64 (v_abs_f64 (x) > Thres); + + /* n = round(x/(ln2/N)). */ + z = v_fma_f64 (x, InvLn2, Shift); + u = v_as_u64_f64 (z); + n = z - Shift; + + /* r = x - n*ln2/N. */ + r = x; + r = v_fma_f64 (-Ln2hi, n, r); + r = v_fma_f64 (-Ln2lo, n, r); + + e = u << (52 - V_EXP_TABLE_BITS); + i = u & IndexMask; + + /* y = exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */ + r2 = r * r; + y = v_fma_f64 (C2, r, C1); + y = v_fma_f64 (C3, r2, y); + y = v_fma_f64 (y, r2, r); + + /* s = 2^(n/N). */ + u = v_lookup_u64 (Tab, i); + s = v_as_f64_u64 (u + e); + + if (unlikely (v_any_u64 (cmp))) + return specialcase (s, y, n); + return v_fma_f64 (y, s, s); +} +VPCS_ALIAS +#endif diff --git a/math/v_exp.h b/math/v_exp.h new file mode 100644 index 0000000..38713d6 --- /dev/null +++ b/math/v_exp.h @@ -0,0 +1,12 @@ +/* + * Declarations for double-precision e^x vector function. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "v_math.h" + +#define V_EXP_TABLE_BITS 7 + +extern const u64_t __v_exp_data[1 << V_EXP_TABLE_BITS] HIDDEN; diff --git a/math/v_exp_data.c b/math/v_exp_data.c new file mode 100644 index 0000000..19b2701 --- /dev/null +++ b/math/v_exp_data.c @@ -0,0 +1,401 @@ +/* + * Lookup table for double-precision e^x vector function. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "v_exp.h" + +#define N (1 << V_EXP_TABLE_BITS) + +/* 2^(j/N), j=0..N. */ +const u64_t __v_exp_data[] = { +#if N == 128 +0x3ff0000000000000, +0x3feff63da9fb3335, +0x3fefec9a3e778061, +0x3fefe315e86e7f85, +0x3fefd9b0d3158574, +0x3fefd06b29ddf6de, +0x3fefc74518759bc8, +0x3fefbe3ecac6f383, +0x3fefb5586cf9890f, +0x3fefac922b7247f7, +0x3fefa3ec32d3d1a2, +0x3fef9b66affed31b, +0x3fef9301d0125b51, +0x3fef8abdc06c31cc, +0x3fef829aaea92de0, +0x3fef7a98c8a58e51, +0x3fef72b83c7d517b, +0x3fef6af9388c8dea, +0x3fef635beb6fcb75, +0x3fef5be084045cd4, +0x3fef54873168b9aa, +0x3fef4d5022fcd91d, +0x3fef463b88628cd6, +0x3fef3f49917ddc96, +0x3fef387a6e756238, +0x3fef31ce4fb2a63f, +0x3fef2b4565e27cdd, +0x3fef24dfe1f56381, +0x3fef1e9df51fdee1, +0x3fef187fd0dad990, +0x3fef1285a6e4030b, +0x3fef0cafa93e2f56, +0x3fef06fe0a31b715, +0x3fef0170fc4cd831, +0x3feefc08b26416ff, +0x3feef6c55f929ff1, +0x3feef1a7373aa9cb, +0x3feeecae6d05d866, +0x3feee7db34e59ff7, +0x3feee32dc313a8e5, +0x3feedea64c123422, +0x3feeda4504ac801c, +0x3feed60a21f72e2a, +0x3feed1f5d950a897, +0x3feece086061892d, +0x3feeca41ed1d0057, +0x3feec6a2b5c13cd0, +0x3feec32af0d7d3de, +0x3feebfdad5362a27, +0x3feebcb299fddd0d, +0x3feeb9b2769d2ca7, +0x3feeb6daa2cf6642, +0x3feeb42b569d4f82, +0x3feeb1a4ca5d920f, +0x3feeaf4736b527da, +0x3feead12d497c7fd, +0x3feeab07dd485429, +0x3feea9268a5946b7, +0x3feea76f15ad2148, +0x3feea5e1b976dc09, +0x3feea47eb03a5585, +0x3feea34634ccc320, +0x3feea23882552225, +0x3feea155d44ca973, +0x3feea09e667f3bcd, +0x3feea012750bdabf, +0x3fee9fb23c651a2f, +0x3fee9f7df9519484, +0x3fee9f75e8ec5f74, +0x3fee9f9a48a58174, +0x3fee9feb564267c9, +0x3feea0694fde5d3f, +0x3feea11473eb0187, +0x3feea1ed0130c132, +0x3feea2f336cf4e62, +0x3feea427543e1a12, +0x3feea589994cce13, +0x3feea71a4623c7ad, +0x3feea8d99b4492ed, +0x3feeaac7d98a6699, +0x3feeace5422aa0db, +0x3feeaf3216b5448c, +0x3feeb1ae99157736, +0x3feeb45b0b91ffc6, +0x3feeb737b0cdc5e5, +0x3feeba44cbc8520f, +0x3feebd829fde4e50, +0x3feec0f170ca07ba, +0x3feec49182a3f090, +0x3feec86319e32323, +0x3feecc667b5de565, +0x3feed09bec4a2d33, +0x3feed503b23e255d, +0x3feed99e1330b358, +0x3feede6b5579fdbf, +0x3feee36bbfd3f37a, +0x3feee89f995ad3ad, +0x3feeee07298db666, +0x3feef3a2b84f15fb, +0x3feef9728de5593a, +0x3feeff76f2fb5e47, +0x3fef05b030a1064a, +0x3fef0c1e904bc1d2, +0x3fef12c25bd71e09, +0x3fef199bdd85529c, +0x3fef20ab5fffd07a, +0x3fef27f12e57d14b, +0x3fef2f6d9406e7b5, +0x3fef3720dcef9069, +0x3fef3f0b555dc3fa, +0x3fef472d4a07897c, +0x3fef4f87080d89f2, +0x3fef5818dcfba487, +0x3fef60e316c98398, +0x3fef69e603db3285, +0x3fef7321f301b460, +0x3fef7c97337b9b5f, +0x3fef864614f5a129, +0x3fef902ee78b3ff6, +0x3fef9a51fbc74c83, +0x3fefa4afa2a490da, +0x3fefaf482d8e67f1, +0x3fefba1bee615a27, +0x3fefc52b376bba97, +0x3fefd0765b6e4540, +0x3fefdbfdad9cbe14, +0x3fefe7c1819e90d8, +0x3feff3c22b8f71f1, +#elif N == 256 +0x3ff0000000000000, +0x3feffb1afa5abcbf, +0x3feff63da9fb3335, +0x3feff168143b0281, +0x3fefec9a3e778061, +0x3fefe7d42e11bbcc, +0x3fefe315e86e7f85, +0x3fefde5f72f654b1, +0x3fefd9b0d3158574, +0x3fefd50a0e3c1f89, +0x3fefd06b29ddf6de, +0x3fefcbd42b72a836, +0x3fefc74518759bc8, +0x3fefc2bdf66607e0, +0x3fefbe3ecac6f383, +0x3fefb9c79b1f3919, +0x3fefb5586cf9890f, +0x3fefb0f145e46c85, +0x3fefac922b7247f7, +0x3fefa83b23395dec, +0x3fefa3ec32d3d1a2, +0x3fef9fa55fdfa9c5, +0x3fef9b66affed31b, +0x3fef973028d7233e, +0x3fef9301d0125b51, +0x3fef8edbab5e2ab6, +0x3fef8abdc06c31cc, +0x3fef86a814f204ab, +0x3fef829aaea92de0, +0x3fef7e95934f312e, +0x3fef7a98c8a58e51, +0x3fef76a45471c3c2, +0x3fef72b83c7d517b, +0x3fef6ed48695bbc0, +0x3fef6af9388c8dea, +0x3fef672658375d2f, +0x3fef635beb6fcb75, +0x3fef5f99f8138a1c, +0x3fef5be084045cd4, +0x3fef582f95281c6b, +0x3fef54873168b9aa, +0x3fef50e75eb44027, +0x3fef4d5022fcd91d, +0x3fef49c18438ce4d, +0x3fef463b88628cd6, +0x3fef42be3578a819, +0x3fef3f49917ddc96, +0x3fef3bdda27912d1, +0x3fef387a6e756238, +0x3fef351ffb82140a, +0x3fef31ce4fb2a63f, +0x3fef2e85711ece75, +0x3fef2b4565e27cdd, +0x3fef280e341ddf29, +0x3fef24dfe1f56381, +0x3fef21ba7591bb70, +0x3fef1e9df51fdee1, +0x3fef1b8a66d10f13, +0x3fef187fd0dad990, +0x3fef157e39771b2f, +0x3fef1285a6e4030b, +0x3fef0f961f641589, +0x3fef0cafa93e2f56, +0x3fef09d24abd886b, +0x3fef06fe0a31b715, +0x3fef0432edeeb2fd, +0x3fef0170fc4cd831, +0x3feefeb83ba8ea32, +0x3feefc08b26416ff, +0x3feef96266e3fa2d, +0x3feef6c55f929ff1, +0x3feef431a2de883b, +0x3feef1a7373aa9cb, +0x3feeef26231e754a, +0x3feeecae6d05d866, +0x3feeea401b7140ef, +0x3feee7db34e59ff7, +0x3feee57fbfec6cf4, +0x3feee32dc313a8e5, +0x3feee0e544ede173, +0x3feedea64c123422, +0x3feedc70df1c5175, +0x3feeda4504ac801c, +0x3feed822c367a024, +0x3feed60a21f72e2a, +0x3feed3fb2709468a, +0x3feed1f5d950a897, +0x3feecffa3f84b9d4, +0x3feece086061892d, +0x3feecc2042a7d232, +0x3feeca41ed1d0057, +0x3feec86d668b3237, +0x3feec6a2b5c13cd0, +0x3feec4e1e192aed2, +0x3feec32af0d7d3de, +0x3feec17dea6db7d7, +0x3feebfdad5362a27, +0x3feebe41b817c114, +0x3feebcb299fddd0d, +0x3feebb2d81d8abff, +0x3feeb9b2769d2ca7, +0x3feeb8417f4531ee, +0x3feeb6daa2cf6642, +0x3feeb57de83f4eef, +0x3feeb42b569d4f82, +0x3feeb2e2f4f6ad27, +0x3feeb1a4ca5d920f, +0x3feeb070dde910d2, +0x3feeaf4736b527da, +0x3feeae27dbe2c4cf, +0x3feead12d497c7fd, +0x3feeac0827ff07cc, +0x3feeab07dd485429, +0x3feeaa11fba87a03, +0x3feea9268a5946b7, +0x3feea84590998b93, +0x3feea76f15ad2148, +0x3feea6a320dceb71, +0x3feea5e1b976dc09, +0x3feea52ae6cdf6f4, +0x3feea47eb03a5585, +0x3feea3dd1d1929fd, +0x3feea34634ccc320, +0x3feea2b9febc8fb7, +0x3feea23882552225, +0x3feea1c1c70833f6, +0x3feea155d44ca973, +0x3feea0f4b19e9538, +0x3feea09e667f3bcd, +0x3feea052fa75173e, +0x3feea012750bdabf, +0x3fee9fdcddd47645, +0x3fee9fb23c651a2f, +0x3fee9f9298593ae5, +0x3fee9f7df9519484, +0x3fee9f7466f42e87, +0x3fee9f75e8ec5f74, +0x3fee9f8286ead08a, +0x3fee9f9a48a58174, +0x3fee9fbd35d7cbfd, +0x3fee9feb564267c9, +0x3feea024b1ab6e09, +0x3feea0694fde5d3f, +0x3feea0b938ac1cf6, +0x3feea11473eb0187, +0x3feea17b0976cfdb, +0x3feea1ed0130c132, +0x3feea26a62ff86f0, +0x3feea2f336cf4e62, +0x3feea3878491c491, +0x3feea427543e1a12, +0x3feea4d2add106d9, +0x3feea589994cce13, +0x3feea64c1eb941f7, +0x3feea71a4623c7ad, +0x3feea7f4179f5b21, +0x3feea8d99b4492ed, +0x3feea9cad931a436, +0x3feeaac7d98a6699, +0x3feeabd0a478580f, +0x3feeace5422aa0db, +0x3feeae05bad61778, +0x3feeaf3216b5448c, +0x3feeb06a5e0866d9, +0x3feeb1ae99157736, +0x3feeb2fed0282c8a, +0x3feeb45b0b91ffc6, +0x3feeb5c353aa2fe2, +0x3feeb737b0cdc5e5, +0x3feeb8b82b5f98e5, +0x3feeba44cbc8520f, +0x3feebbdd9a7670b3, +0x3feebd829fde4e50, +0x3feebf33e47a22a2, +0x3feec0f170ca07ba, +0x3feec2bb4d53fe0d, +0x3feec49182a3f090, +0x3feec674194bb8d5, +0x3feec86319e32323, +0x3feeca5e8d07f29e, +0x3feecc667b5de565, +0x3feece7aed8eb8bb, +0x3feed09bec4a2d33, +0x3feed2c980460ad8, +0x3feed503b23e255d, +0x3feed74a8af46052, +0x3feed99e1330b358, +0x3feedbfe53c12e59, +0x3feede6b5579fdbf, +0x3feee0e521356eba, +0x3feee36bbfd3f37a, +0x3feee5ff3a3c2774, +0x3feee89f995ad3ad, +0x3feeeb4ce622f2ff, +0x3feeee07298db666, +0x3feef0ce6c9a8952, +0x3feef3a2b84f15fb, +0x3feef68415b749b1, +0x3feef9728de5593a, +0x3feefc6e29f1c52a, +0x3feeff76f2fb5e47, +0x3fef028cf22749e4, +0x3fef05b030a1064a, +0x3fef08e0b79a6f1f, +0x3fef0c1e904bc1d2, +0x3fef0f69c3f3a207, +0x3fef12c25bd71e09, +0x3fef16286141b33d, +0x3fef199bdd85529c, +0x3fef1d1cd9fa652c, +0x3fef20ab5fffd07a, +0x3fef244778fafb22, +0x3fef27f12e57d14b, +0x3fef2ba88988c933, +0x3fef2f6d9406e7b5, +0x3fef33405751c4db, +0x3fef3720dcef9069, +0x3fef3b0f2e6d1675, +0x3fef3f0b555dc3fa, +0x3fef43155b5bab74, +0x3fef472d4a07897c, +0x3fef4b532b08c968, +0x3fef4f87080d89f2, +0x3fef53c8eacaa1d6, +0x3fef5818dcfba487, +0x3fef5c76e862e6d3, +0x3fef60e316c98398, +0x3fef655d71ff6075, +0x3fef69e603db3285, +0x3fef6e7cd63a8315, +0x3fef7321f301b460, +0x3fef77d5641c0658, +0x3fef7c97337b9b5f, +0x3fef81676b197d17, +0x3fef864614f5a129, +0x3fef8b333b16ee12, +0x3fef902ee78b3ff6, +0x3fef953924676d76, +0x3fef9a51fbc74c83, +0x3fef9f7977cdb740, +0x3fefa4afa2a490da, +0x3fefa9f4867cca6e, +0x3fefaf482d8e67f1, +0x3fefb4aaa2188510, +0x3fefba1bee615a27, +0x3fefbf9c1cb6412a, +0x3fefc52b376bba97, +0x3fefcac948dd7274, +0x3fefd0765b6e4540, +0x3fefd632798844f8, +0x3fefdbfdad9cbe14, +0x3fefe1d802243c89, +0x3fefe7c1819e90d8, +0x3fefedba3692d514, +0x3feff3c22b8f71f1, +0x3feff9d96b2a23d9, +#endif +}; diff --git a/math/v_expf.c b/math/v_expf.c new file mode 100644 index 0000000..f536701 --- /dev/null +++ b/math/v_expf.c @@ -0,0 +1,75 @@ +/* + * Single-precision vector e^x function. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "mathlib.h" +#include "v_math.h" +#if V_SUPPORTED + +static const float Poly[] = { + /* maxerr: 1.45358 +0.5 ulp. */ + 0x1.0e4020p-7f, + 0x1.573e2ep-5f, + 0x1.555e66p-3f, + 0x1.fffdb6p-2f, + 0x1.ffffecp-1f, +}; +#define C0 v_f32 (Poly[0]) +#define C1 v_f32 (Poly[1]) +#define C2 v_f32 (Poly[2]) +#define C3 v_f32 (Poly[3]) +#define C4 v_f32 (Poly[4]) + +#define Shift v_f32 (0x1.8p23f) +#define InvLn2 v_f32 (0x1.715476p+0f) +#define Ln2hi v_f32 (0x1.62e4p-1f) +#define Ln2lo v_f32 (0x1.7f7d1cp-20f) + +VPCS_ATTR +static v_f32_t +specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale) +{ + /* 2^n may overflow, break it up into s1*s2. */ + v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x82000000); + v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b); + v_f32_t s2 = v_as_f32_u32 (e - b); + v_u32_t cmp2 = v_cond_u32 (absn > v_f32 (192.0f)); + v_u32_t r2 = v_as_u32_f32 (s1 * s1); + v_u32_t r1 = v_as_u32_f32 (v_fma_f32 (poly, s2, s2) * s1); + /* Similar to r1 but avoids double rounding in the subnormal range. */ + v_u32_t r0 = v_as_u32_f32 (v_fma_f32 (poly, scale, scale)); + return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0)); +} + +VPCS_ATTR +v_f32_t +V_NAME(expf) (v_f32_t x) +{ + v_f32_t n, r, r2, scale, p, q, poly, absn, z; + v_u32_t cmp, e; + + /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ + z = v_fma_f32 (x, InvLn2, Shift); + n = z - Shift; + r = v_fma_f32 (n, -Ln2hi, x); + r = v_fma_f32 (n, -Ln2lo, r); + e = v_as_u32_f32 (z) << 23; + scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); + absn = v_abs_f32 (n); + cmp = v_cond_u32 (absn > v_f32 (126.0f)); + r2 = r * r; + p = v_fma_f32 (C0, r, C1); + q = v_fma_f32 (C2, r, C3); + q = v_fma_f32 (p, r2, q); + p = C4 * r; + poly = v_fma_f32 (q, r2, p); + if (unlikely (v_any_u32 (cmp))) + return specialcase (poly, n, e, absn, cmp, scale); + return v_fma_f32 (poly, scale, scale); +} +VPCS_ALIAS +#endif diff --git a/math/v_expf_1u.c b/math/v_expf_1u.c new file mode 100644 index 0000000..37d3d1e --- /dev/null +++ b/math/v_expf_1u.c @@ -0,0 +1,72 @@ +/* + * Single-precision vector e^x function. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "mathlib.h" +#include "v_math.h" +#if V_SUPPORTED + +static const float Poly[] = { + /* maxerr: 0.36565 +0.5 ulp. */ + 0x1.6a6000p-10f, + 0x1.12718ep-7f, + 0x1.555af0p-5f, + 0x1.555430p-3f, + 0x1.fffff4p-2f, +}; +#define C0 v_f32 (Poly[0]) +#define C1 v_f32 (Poly[1]) +#define C2 v_f32 (Poly[2]) +#define C3 v_f32 (Poly[3]) +#define C4 v_f32 (Poly[4]) + +#define Shift v_f32 (0x1.8p23f) +#define InvLn2 v_f32 (0x1.715476p+0f) +#define Ln2hi v_f32 (0x1.62e4p-1f) +#define Ln2lo v_f32 (0x1.7f7d1cp-20f) + +VPCS_ATTR +static v_f32_t +specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn) +{ + /* 2^n may overflow, break it up into s1*s2. */ + v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x83000000); + v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b); + v_f32_t s2 = v_as_f32_u32 (e - b); + v_u32_t cmp = v_cond_u32 (absn > v_f32 (192.0f)); + v_f32_t r1 = s1 * s1; + v_f32_t r0 = poly * s1 * s2; + return v_as_f32_u32 ((cmp & v_as_u32_f32 (r1)) | (~cmp & v_as_u32_f32 (r0))); +} + +VPCS_ATTR +v_f32_t +V_NAME(expf_1u) (v_f32_t x) +{ + v_f32_t n, r, scale, poly, absn, z; + v_u32_t cmp, e; + + /* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)] + x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ + z = v_fma_f32 (x, InvLn2, Shift); + n = z - Shift; + r = v_fma_f32 (n, -Ln2hi, x); + r = v_fma_f32 (n, -Ln2lo, r); + e = v_as_u32_f32 (z) << 23; + scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); + absn = v_abs_f32 (n); + cmp = v_cond_u32 (absn > v_f32 (126.0f)); + poly = v_fma_f32 (C0, r, C1); + poly = v_fma_f32 (poly, r, C2); + poly = v_fma_f32 (poly, r, C3); + poly = v_fma_f32 (poly, r, C4); + poly = v_fma_f32 (poly, r, v_f32 (1.0f)); + poly = v_fma_f32 (poly, r, v_f32 (1.0f)); + if (unlikely (v_any_u32 (cmp))) + return specialcase (poly, n, e, absn); + return scale * poly; +} +#endif diff --git a/math/v_math.h b/math/v_math.h new file mode 100644 index 0000000..f2bf24e --- /dev/null +++ b/math/v_math.h @@ -0,0 +1,604 @@ +/* + * Vector math abstractions. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#ifndef _V_MATH_H +#define _V_MATH_H + +/* The goal of this header is to allow vector and scalar + build of the same algorithm, the provided intrinsic + wrappers are also vector length agnostic so they can + be implemented for SVE too (or other simd architectures) + and then the code should work on those targets too. */ + +#if SCALAR +#define V_NAME(x) __s_##x +#elif VPCS && __aarch64__ +#define V_NAME(x) __vn_##x +#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs)) +#else +#define V_NAME(x) __v_##x +#endif + +#ifndef VPCS_ATTR +#define VPCS_ATTR +#endif +#ifndef VPCS_ALIAS +#define VPCS_ALIAS +#endif + +#include <stdint.h> +#include "math_config.h" + +typedef float f32_t; +typedef uint32_t u32_t; +typedef int32_t s32_t; +typedef double f64_t; +typedef uint64_t u64_t; +typedef int64_t s64_t; + +/* reinterpret as type1 from type2. */ +static inline u32_t +as_u32_f32 (f32_t x) +{ + union { f32_t f; u32_t u; } r = {x}; + return r.u; +} +static inline f32_t +as_f32_u32 (u32_t x) +{ + union { u32_t u; f32_t f; } r = {x}; + return r.f; +} +static inline s32_t +as_s32_u32 (u32_t x) +{ + union { u32_t u; s32_t i; } r = {x}; + return r.i; +} +static inline u32_t +as_u32_s32 (s32_t x) +{ + union { s32_t i; u32_t u; } r = {x}; + return r.u; +} +static inline u64_t +as_u64_f64 (f64_t x) +{ + union { f64_t f; u64_t u; } r = {x}; + return r.u; +} +static inline f64_t +as_f64_u64 (u64_t x) +{ + union { u64_t u; f64_t f; } r = {x}; + return r.f; +} +static inline s64_t +as_s64_u64 (u64_t x) +{ + union { u64_t u; s64_t i; } r = {x}; + return r.i; +} +static inline u64_t +as_u64_s64 (s64_t x) +{ + union { s64_t i; u64_t u; } r = {x}; + return r.u; +} + +#if SCALAR +#define V_SUPPORTED 1 +typedef f32_t v_f32_t; +typedef u32_t v_u32_t; +typedef s32_t v_s32_t; +typedef f64_t v_f64_t; +typedef u64_t v_u64_t; +typedef s64_t v_s64_t; + +static inline int +v_lanes32 (void) +{ + return 1; +} + +static inline v_f32_t +v_f32 (f32_t x) +{ + return x; +} +static inline v_u32_t +v_u32 (u32_t x) +{ + return x; +} +static inline v_s32_t +v_s32 (s32_t x) +{ + return x; +} + +static inline f32_t +v_get_f32 (v_f32_t x, int i) +{ + return x; +} +static inline u32_t +v_get_u32 (v_u32_t x, int i) +{ + return x; +} +static inline s32_t +v_get_s32 (v_s32_t x, int i) +{ + return x; +} + +static inline void +v_set_f32 (v_f32_t *x, int i, f32_t v) +{ + *x = v; +} +static inline void +v_set_u32 (v_u32_t *x, int i, u32_t v) +{ + *x = v; +} +static inline void +v_set_s32 (v_s32_t *x, int i, s32_t v) +{ + *x = v; +} + +/* true if any elements of a v_cond result is non-zero. */ +static inline int +v_any_u32 (v_u32_t x) +{ + return x != 0; +} +/* to wrap the result of relational operators. */ +static inline v_u32_t +v_cond_u32 (v_u32_t x) +{ + return x ? -1 : 0; +} +static inline v_f32_t +v_abs_f32 (v_f32_t x) +{ + return __builtin_fabsf (x); +} +static inline v_f32_t +v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z) +{ + return __builtin_fmaf (x, y, z); +} +static inline v_f32_t +v_round_f32 (v_f32_t x) +{ + return __builtin_roundf (x); +} +static inline v_s32_t +v_round_s32 (v_f32_t x) +{ + return __builtin_lroundf (x); /* relies on -fno-math-errno. */ +} +/* convert to type1 from type2. */ +static inline v_f32_t +v_to_f32_s32 (v_s32_t x) +{ + return x; +} +static inline v_f32_t +v_to_f32_u32 (v_u32_t x) +{ + return x; +} +/* reinterpret as type1 from type2. */ +static inline v_u32_t +v_as_u32_f32 (v_f32_t x) +{ + union { v_f32_t f; v_u32_t u; } r = {x}; + return r.u; +} +static inline v_f32_t +v_as_f32_u32 (v_u32_t x) +{ + union { v_u32_t u; v_f32_t f; } r = {x}; + return r.f; +} +static inline v_s32_t +v_as_s32_u32 (v_u32_t x) +{ + union { v_u32_t u; v_s32_t i; } r = {x}; + return r.i; +} +static inline v_u32_t +v_as_u32_s32 (v_s32_t x) +{ + union { v_s32_t i; v_u32_t u; } r = {x}; + return r.u; +} +static inline v_f32_t +v_lookup_f32 (const f32_t *tab, v_u32_t idx) +{ + return tab[idx]; +} +static inline v_u32_t +v_lookup_u32 (const u32_t *tab, v_u32_t idx) +{ + return tab[idx]; +} +static inline v_f32_t +v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p) +{ + return f (x); +} +static inline v_f32_t +v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y, + v_u32_t p) +{ + return f (x1, x2); +} + +static inline v_f64_t +v_f64 (f64_t x) +{ + return x; +} +static inline v_u64_t +v_u64 (u64_t x) +{ + return x; +} +static inline v_s64_t +v_s64 (s64_t x) +{ + return x; +} +/* true if any elements of a v_cond result is non-zero. */ +static inline int +v_any_u64 (v_u64_t x) +{ + return x != 0; +} +/* to wrap the result of relational operators. */ +static inline v_u64_t +v_cond_u64 (v_u64_t x) +{ + return x ? -1 : 0; +} +static inline v_f64_t +v_abs_f64 (v_f64_t x) +{ + return __builtin_fabs (x); +} +static inline v_f64_t +v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z) +{ + return __builtin_fma (x, y, z); +} +static inline v_f64_t +v_round_f64 (v_f64_t x) +{ + return __builtin_round (x); +} +static inline v_s64_t +v_round_s64 (v_f64_t x) +{ + return __builtin_lround (x); /* relies on -fno-math-errno. */ +} +/* convert to type1 from type2. */ +static inline v_f64_t +v_to_f64_s64 (v_s64_t x) +{ + return x; +} +static inline v_f64_t +v_to_f64_u64 (v_u64_t x) +{ + return x; +} +/* reinterpret as type1 from type2. */ +static inline v_u64_t +v_as_u64_f64 (v_f64_t x) +{ + union { v_f64_t f; v_u64_t u; } r = {x}; + return r.u; +} +static inline v_f64_t +v_as_f64_u64 (v_u64_t x) +{ + union { v_u64_t u; v_f64_t f; } r = {x}; + return r.f; +} +static inline v_s64_t +v_as_s64_u64 (v_u64_t x) +{ + union { v_u64_t u; v_s64_t i; } r = {x}; + return r.i; +} +static inline v_u64_t +v_as_u64_s64 (v_s64_t x) +{ + union { v_s64_t i; v_u64_t u; } r = {x}; + return r.u; +} +static inline v_f64_t +v_lookup_f64 (const f64_t *tab, v_u64_t idx) +{ + return tab[idx]; +} +static inline v_u64_t +v_lookup_u64 (const u64_t *tab, v_u64_t idx) +{ + return tab[idx]; +} +static inline v_f64_t +v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p) +{ + return f (x); +} + +#elif __aarch64__ +#define V_SUPPORTED 1 +#include <arm_neon.h> +typedef float32x4_t v_f32_t; +typedef uint32x4_t v_u32_t; +typedef int32x4_t v_s32_t; +typedef float64x2_t v_f64_t; +typedef uint64x2_t v_u64_t; +typedef int64x2_t v_s64_t; + +static inline int +v_lanes32 (void) +{ + return 4; +} + +static inline v_f32_t +v_f32 (f32_t x) +{ + return (v_f32_t){x, x, x, x}; +} +static inline v_u32_t +v_u32 (u32_t x) +{ + return (v_u32_t){x, x, x, x}; +} +static inline v_s32_t +v_s32 (s32_t x) +{ + return (v_s32_t){x, x, x, x}; +} + +static inline f32_t +v_get_f32 (v_f32_t x, int i) +{ + return x[i]; +} +static inline u32_t +v_get_u32 (v_u32_t x, int i) +{ + return x[i]; +} +static inline s32_t +v_get_s32 (v_s32_t x, int i) +{ + return x[i]; +} + +static inline void +v_set_f32 (v_f32_t *x, int i, f32_t v) +{ + (*x)[i] = v; +} +static inline void +v_set_u32 (v_u32_t *x, int i, u32_t v) +{ + (*x)[i] = v; +} +static inline void +v_set_s32 (v_s32_t *x, int i, s32_t v) +{ + (*x)[i] = v; +} + +/* true if any elements of a v_cond result is non-zero. */ +static inline int +v_any_u32 (v_u32_t x) +{ + /* assume elements in x are either 0 or -1u. */ + return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0; +} +/* to wrap the result of relational operators. */ +static inline v_u32_t +v_cond_u32 (v_u32_t x) +{ + return x; +} +static inline v_f32_t +v_abs_f32 (v_f32_t x) +{ + return vabsq_f32 (x); +} +static inline v_f32_t +v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z) +{ + return vfmaq_f32 (z, x, y); +} +static inline v_f32_t +v_round_f32 (v_f32_t x) +{ + return vrndaq_f32 (x); +} +static inline v_s32_t +v_round_s32 (v_f32_t x) +{ + return vcvtaq_s32_f32 (x); +} +/* convert to type1 from type2. */ +static inline v_f32_t +v_to_f32_s32 (v_s32_t x) +{ + return (v_f32_t){x[0], x[1], x[2], x[3]}; +} +static inline v_f32_t +v_to_f32_u32 (v_u32_t x) +{ + return (v_f32_t){x[0], x[1], x[2], x[3]}; +} +/* reinterpret as type1 from type2. */ +static inline v_u32_t +v_as_u32_f32 (v_f32_t x) +{ + union { v_f32_t f; v_u32_t u; } r = {x}; + return r.u; +} +static inline v_f32_t +v_as_f32_u32 (v_u32_t x) +{ + union { v_u32_t u; v_f32_t f; } r = {x}; + return r.f; +} +static inline v_s32_t +v_as_s32_u32 (v_u32_t x) +{ + union { v_u32_t u; v_s32_t i; } r = {x}; + return r.i; +} +static inline v_u32_t +v_as_u32_s32 (v_s32_t x) +{ + union { v_s32_t i; v_u32_t u; } r = {x}; + return r.u; +} +static inline v_f32_t +v_lookup_f32 (const f32_t *tab, v_u32_t idx) +{ + return (v_f32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; +} +static inline v_u32_t +v_lookup_u32 (const u32_t *tab, v_u32_t idx) +{ + return (v_u32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; +} +static inline v_f32_t +v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p) +{ + return (v_f32_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1], + p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]}; +} +static inline v_f32_t +v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y, + v_u32_t p) +{ + return ( + v_f32_t){p[0] ? f (x1[0], x2[0]) : y[0], p[1] ? f (x1[1], x2[1]) : y[1], + p[2] ? f (x1[2], x2[2]) : y[2], p[3] ? f (x1[3], x2[3]) : y[3]}; +} + +static inline v_f64_t +v_f64 (f64_t x) +{ + return (v_f64_t){x, x}; +} +static inline v_u64_t +v_u64 (u64_t x) +{ + return (v_u64_t){x, x}; +} +static inline v_s64_t +v_s64 (s64_t x) +{ + return (v_s64_t){x, x}; +} +/* true if any elements of a v_cond result is non-zero. */ +static inline int +v_any_u64 (v_u64_t x) +{ + /* assume elements in x are either 0 or -1u. */ + return vpaddd_u64 (x) != 0; +} +/* to wrap the result of relational operators. */ +static inline v_u64_t +v_cond_u64 (v_u64_t x) +{ + return x; +} +static inline v_f64_t +v_abs_f64 (v_f64_t x) +{ + return vabsq_f64 (x); +} +static inline v_f64_t +v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z) +{ + return vfmaq_f64 (z, x, y); +} +static inline v_f64_t +v_round_f64 (v_f64_t x) +{ + return vrndaq_f64 (x); +} +static inline v_s64_t +v_round_s64 (v_f64_t x) +{ + return vcvtaq_s64_f64 (x); +} +/* convert to type1 from type2. */ +static inline v_f64_t +v_to_f64_s64 (v_s64_t x) +{ + return (v_f64_t){x[0], x[1]}; +} +static inline v_f64_t +v_to_f64_u64 (v_u64_t x) +{ + return (v_f64_t){x[0], x[1]}; +} +/* reinterpret as type1 from type2. */ +static inline v_u64_t +v_as_u64_f64 (v_f64_t x) +{ + union { v_f64_t f; v_u64_t u; } r = {x}; + return r.u; +} +static inline v_f64_t +v_as_f64_u64 (v_u64_t x) +{ + union { v_u64_t u; v_f64_t f; } r = {x}; + return r.f; +} +static inline v_s64_t +v_as_s64_u64 (v_u64_t x) +{ + union { v_u64_t u; v_s64_t i; } r = {x}; + return r.i; +} +static inline v_u64_t +v_as_u64_s64 (v_s64_t x) +{ + union { v_s64_t i; v_u64_t u; } r = {x}; + return r.u; +} +static inline v_f64_t +v_lookup_f64 (const f64_t *tab, v_u64_t idx) +{ + return (v_f64_t){tab[idx[0]], tab[idx[1]]}; +} +static inline v_u64_t +v_lookup_u64 (const u64_t *tab, v_u64_t idx) +{ + return (v_u64_t){tab[idx[0]], tab[idx[1]]}; +} +static inline v_f64_t +v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p) +{ + return (v_f64_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1]}; +} +#endif + +#endif diff --git a/math/vn_exp.c b/math/vn_exp.c new file mode 100644 index 0000000..06e269d --- /dev/null +++ b/math/vn_exp.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_exp. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#include "mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_exp, _ZGVnN2v_exp) +#include "v_exp.c" +#endif diff --git a/math/vn_expf.c b/math/vn_expf.c new file mode 100644 index 0000000..0652907 --- /dev/null +++ b/math/vn_expf.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_expf. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#include "mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_expf, _ZGVnN4v_expf) +#include "v_expf.c" +#endif diff --git a/math/vn_expf_1u.c b/math/vn_expf_1u.c new file mode 100644 index 0000000..3be7768 --- /dev/null +++ b/math/vn_expf_1u.c @@ -0,0 +1,11 @@ +/* + * AdvSIMD vector PCS variant of __v_expf_1u. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#include "mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#include "v_expf_1u.c" +#endif |