diff options
author | Pierre Blanchard <pierre.blanchard@arm.com> | 2020-11-05 16:26:36 +0000 |
---|---|---|
committer | Szabolcs Nagy <szabolcs.nagy@arm.com> | 2020-11-13 14:43:49 +0000 |
commit | 00e5afdbaf600cb8d56072685e987800d8678248 (patch) | |
tree | 312326613eb7e581fdf0e17766d48cf01d40da65 /math | |
parent | fc3fcc91ba7d79b30876d8e7ee68b8442a50da5c (diff) | |
download | arm-optimized-routines-00e5afdbaf600cb8d56072685e987800d8678248.tar.gz |
math: add scalar erf
Only tested in round-to-nearest mode. The expected worst case error
is 1.01 ULP near x=1.25. Benchmarked over random x in [-6,6] and
can increase performance by > 2x (> 3.5x for throughput) on big ooo
cores compared to the implementation in glibc 2.28.
Includes data for erfc too, but this patch only adds erf.
Diffstat (limited to 'math')
-rw-r--r-- | math/erf.c | 244 | ||||
-rw-r--r-- | math/erf_data.c | 85 | ||||
-rw-r--r-- | math/math_config.h | 19 | ||||
-rw-r--r-- | math/test/mathbench.c | 1 | ||||
-rwxr-xr-x | math/test/runulp.sh | 8 | ||||
-rw-r--r-- | math/test/testcases/directed/erf.tst | 17 | ||||
-rw-r--r-- | math/test/ulp.c | 1 |
7 files changed, 375 insertions, 0 deletions
diff --git a/math/erf.c b/math/erf.c new file mode 100644 index 0000000..97b543b --- /dev/null +++ b/math/erf.c @@ -0,0 +1,244 @@ +/* + * Double-precision erf(x) function. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "math_config.h" +#include <math.h> +#include <stdint.h> + +#define TwoOverSqrtPiMinusOne 0x1.06eba8214db69p-3 +#define C 0x1.b0ac16p-1 +#define PA __erf_data.erf_poly_A +#define NA __erf_data.erf_ratio_N_A +#define DA __erf_data.erf_ratio_D_A +#define NB __erf_data.erf_ratio_N_B +#define DB __erf_data.erf_ratio_D_B +#define PC __erf_data.erfc_poly_C +#define PD __erf_data.erfc_poly_D +#define PE __erf_data.erfc_poly_E +#define PF __erf_data.erfc_poly_F + +/* Top 32 bits of a double. */ +static inline uint32_t +top32 (double x) +{ + return asuint64 (x) >> 32; +} + +/* Fast erf implementation using a mix of + rational and polynomial approximations. + Highest measured error is 1.01 ULPs at 0x1.39956ac43382fp+0. */ +double +erf (double x) +{ + /* Get top word and sign. */ + uint32_t ix = top32 (x); + uint32_t ia = ix & 0x7fffffff; + uint32_t sign = ix >> 31; + + /* Normalized and subnormal cases */ + if (ia < 0x3feb0000) + { /* a = |x| < 0.84375. */ + + if (ia < 0x3e300000) + { /* a < 2^(-28). */ + if (ia < 0x00800000) + { /* a < 2^(-1015). */ + double y = x + TwoOverSqrtPiMinusOne * x; + return check_uflow (y); + } + return x + TwoOverSqrtPiMinusOne * x; + } + + double x2 = x * x; + + if (ia < 0x3fe00000) + { /* a < 0.5 - Use polynomial approximation. */ + double r1 = fma (x2, PA[1], PA[0]); + double r2 = fma (x2, PA[3], PA[2]); + double r3 = fma (x2, PA[5], PA[4]); + double r4 = fma (x2, PA[7], PA[6]); + double r5 = fma (x2, PA[9], PA[8]); + double x4 = x2 * x2; + double r = r5; + r = fma (x4, r, r4); + r = fma (x4, r, r3); + r = fma (x4, r, r2); + r = fma (x4, r, r1); + return fma (r, x, x); /* This fma is crucial for accuracy. */ + } + else + { /* 0.5 <= a < 0.84375 - Use rational approximation. */ + double x4, x8, r1n, r2n, r1d, r2d, r3d; + + r1n = fma (x2, NA[1], NA[0]); + x4 = x2 * x2; + r2n = fma (x2, NA[3], NA[2]); + x8 = x4 * x4; + r1d = fma (x2, DA[0], 1.0); + r2d = fma (x2, DA[2], DA[1]); + r3d = fma (x2, DA[4], DA[3]); + double P = r1n + x4 * r2n + x8 * NA[4]; + double Q = r1d + x4 * r2d + x8 * r3d; + return fma (P / Q, x, x); + } + } + else if (ia < 0x3ff40000) + { /* 0.84375 <= |x| < 1.25. */ + double a2, a4, a6, r1n, r2n, r3n, r4n, r1d, r2d, r3d, r4d; + double a = fabs (x) - 1.0; + r1n = fma (a, NB[1], NB[0]); + a2 = a * a; + r1d = fma (a, DB[0], 1.0); + a4 = a2 * a2; + r2n = fma (a, NB[3], NB[2]); + a6 = a4 * a2; + r2d = fma (a, DB[2], DB[1]); + r3n = fma (a, NB[5], NB[4]); + r3d = fma (a, DB[4], DB[3]); + r4n = NB[6]; + r4d = DB[5]; + double P = r1n + a2 * r2n + a4 * r3n + a6 * r4n; + double Q = r1d + a2 * r2d + a4 * r3d + a6 * r4d; + if (sign) + return -C - P / Q; + else + return C + P / Q; + } + else if (ia < 0x40000000) + { /* 1.25 <= |x| < 2.0. */ + double a = fabs (x); + a = a - 1.25; + + double r1 = fma (a, PC[1], PC[0]); + double r2 = fma (a, PC[3], PC[2]); + double r3 = fma (a, PC[5], PC[4]); + double r4 = fma (a, PC[7], PC[6]); + double r5 = fma (a, PC[9], PC[8]); + double r6 = fma (a, PC[11], PC[10]); + double r7 = fma (a, PC[13], PC[12]); + double r8 = fma (a, PC[15], PC[14]); + + double a2 = a * a; + + double r = r8; + r = fma (a2, r, r7); + r = fma (a2, r, r6); + r = fma (a2, r, r5); + r = fma (a2, r, r4); + r = fma (a2, r, r3); + r = fma (a2, r, r2); + r = fma (a2, r, r1); + + if (sign) + return -1.0 + r; + else + return 1.0 - r; + } + else if (ia < 0x400a0000) + { /* 2 <= |x| < 3.25. */ + double a = fabs (x); + a = fma (0.5, a, -1.0); + + double r1 = fma (a, PD[1], PD[0]); + double r2 = fma (a, PD[3], PD[2]); + double r3 = fma (a, PD[5], PD[4]); + double r4 = fma (a, PD[7], PD[6]); + double r5 = fma (a, PD[9], PD[8]); + double r6 = fma (a, PD[11], PD[10]); + double r7 = fma (a, PD[13], PD[12]); + double r8 = fma (a, PD[15], PD[14]); + double r9 = fma (a, PD[17], PD[16]); + + double a2 = a * a; + + double r = r9; + r = fma (a2, r, r8); + r = fma (a2, r, r7); + r = fma (a2, r, r6); + r = fma (a2, r, r5); + r = fma (a2, r, r4); + r = fma (a2, r, r3); + r = fma (a2, r, r2); + r = fma (a2, r, r1); + + if (sign) + return -1.0 + r; + else + return 1.0 - r; + } + else if (ia < 0x40100000) + { /* 3.25 <= |x| < 4.0. */ + double a = fabs (x); + a = a - 3.25; + + double r1 = fma (a, PE[1], PE[0]); + double r2 = fma (a, PE[3], PE[2]); + double r3 = fma (a, PE[5], PE[4]); + double r4 = fma (a, PE[7], PE[6]); + double r5 = fma (a, PE[9], PE[8]); + double r6 = fma (a, PE[11], PE[10]); + double r7 = fma (a, PE[13], PE[12]); + + double a2 = a * a; + + double r = r7; + r = fma (a2, r, r6); + r = fma (a2, r, r5); + r = fma (a2, r, r4); + r = fma (a2, r, r3); + r = fma (a2, r, r2); + r = fma (a2, r, r1); + + if (sign) + return -1.0 + r; + else + return 1.0 - r; + } + else if (ia < 0x4017a000) + { /* 4 <= |x| < 5.90625. */ + double a = fabs (x); + a = fma (0.5, a, -2.0); + + double r1 = fma (a, PF[1], PF[0]); + double r2 = fma (a, PF[3], PF[2]); + double r3 = fma (a, PF[5], PF[4]); + double r4 = fma (a, PF[7], PF[6]); + double r5 = fma (a, PF[9], PF[8]); + double r6 = fma (a, PF[11], PF[10]); + double r7 = fma (a, PF[13], PF[12]); + double r8 = fma (a, PF[15], PF[14]); + double r9 = PF[16]; + + double a2 = a * a; + + double r = r9; + r = fma (a2, r, r8); + r = fma (a2, r, r7); + r = fma (a2, r, r6); + r = fma (a2, r, r5); + r = fma (a2, r, r4); + r = fma (a2, r, r3); + r = fma (a2, r, r2); + r = fma (a2, r, r1); + + if (sign) + return -1.0 + r; + else + return 1.0 - r; + } + else + { + /* Special cases : erf(nan)=nan, erf(+inf)=+1 and erf(-inf)=-1. */ + if (unlikely (ia >= 0x7ff00000)) + return (double) (1.0 - (sign << 1)) + 1.0 / x; + + if (sign) + return -1.0; + else + return 1.0; + } +} diff --git a/math/erf_data.c b/math/erf_data.c new file mode 100644 index 0000000..807875b --- /dev/null +++ b/math/erf_data.c @@ -0,0 +1,85 @@ +/* + * Shared data between erf and erfc. + * + * Copyright (c) 2019-2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "math_config.h" + +/* +Minimax approximation of erf +*/ +const struct erf_data __erf_data = { +.erf_poly_A = { +#if ERF_POLY_A_NCOEFFS == 10 +0x1.06eba8214db68p-3, -0x1.812746b037948p-2, 0x1.ce2f21a03872p-4, +-0x1.b82ce30e6548p-6, 0x1.565bcc360a2f2p-8, -0x1.c02d812bc979ap-11, +0x1.f99bddfc1ebe9p-14, -0x1.f42c457cee912p-17, 0x1.b0e414ec20ee9p-20, +-0x1.18c47fd143c5ep-23 +#endif +}, +/* Rational approximation on [0x1p-28, 0.84375] */ +.erf_ratio_N_A = { +0x1.06eba8214db68p-3, -0x1.4cd7d691cb913p-2, -0x1.d2a51dbd7194fp-6, +-0x1.7a291236668e4p-8, -0x1.8ead6120016acp-16 +}, +.erf_ratio_D_A = { +0x1.97779cddadc09p-2, 0x1.0a54c5536cebap-4, 0x1.4d022c4d36b0fp-8, +0x1.15dc9221c1a1p-13, -0x1.09c4342a2612p-18 +}, +/* Rational approximation on [0.84375, 1.25] */ +.erf_ratio_N_B = { +-0x1.359b8bef77538p-9, 0x1.a8d00ad92b34dp-2, -0x1.7d240fbb8c3f1p-2, +0x1.45fca805120e4p-2, -0x1.c63983d3e28ecp-4, 0x1.22a36599795ebp-5, +-0x1.1bf380a96073fp-9 +}, +.erf_ratio_D_B = { +0x1.b3e6618eee323p-4, 0x1.14af092eb6f33p-1, 0x1.2635cd99fe9a7p-4, +0x1.02660e763351fp-3, 0x1.bedc26b51dd1cp-7, 0x1.88b545735151dp-7 +}, +.erfc_poly_C = { +#if ERFC_POLY_C_NCOEFFS == 16 +/* Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=15 a=1.25 b=2 c=1 d=1.25 */ +0x1.3bcd133aa0ffcp-4, -0x1.e4652fadcb702p-3, 0x1.2ebf3dcca0446p-2, +-0x1.571d01c62d66p-3, 0x1.93a9a8f5b3413p-8, 0x1.8281cbcc2cd52p-5, +-0x1.5cffd86b4de16p-6, -0x1.db4ccf595053ep-9, 0x1.757cbf8684edap-8, +-0x1.ce7dfd2a9e56ap-11, -0x1.99ee3bc5a3263p-11, 0x1.3c57cf9213f5fp-12, +0x1.60692996bf254p-14, -0x1.6e44cb7c1fa2ap-14, 0x1.9d4484ac482b2p-16, +-0x1.578c9e375d37p-19 +#endif +}, +.erfc_poly_D = { +#if ERFC_POLY_D_NCOEFFS == 18 +/* Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=17 a=2 b=3.25 c=2 d=2 */ +0x1.328f5ec350e5p-8, -0x1.529b9e8cf8e99p-5, 0x1.529b9e8cd9e71p-3, +-0x1.8b0ae3a023bf2p-2, 0x1.1a2c592599d82p-1, -0x1.ace732477e494p-2, +-0x1.e1a06a27920ffp-6, 0x1.bae92a6d27af6p-2, -0x1.a15470fcf5ce7p-2, +0x1.bafe45d18e213p-6, 0x1.0d950680d199ap-2, -0x1.8c9481e8f22e3p-3, +-0x1.158450ed5c899p-4, 0x1.c01f2973b44p-3, -0x1.73ed2827546a7p-3, +0x1.47733687d1ff7p-4, -0x1.2dec70d00b8e1p-6, 0x1.a947ab83cd4fp-10 +#endif +}, +.erfc_poly_E = { +#if ERFC_POLY_E_NCOEFFS == 14 +/* Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=13 a=3.25 b=4 c=1 d=3.25 */ +0x1.20c13035539e4p-18, -0x1.e9b5e8d16df7ep-16, 0x1.8de3cd4733bf9p-14, +-0x1.9aa48beb8382fp-13, 0x1.2c7d713370a9fp-12, -0x1.490b12110b9e2p-12, +0x1.1459c5d989d23p-12, -0x1.64b28e9f1269p-13, 0x1.57c76d9d05cf8p-14, +-0x1.bf271d9951cf8p-16, 0x1.db7ea4d4535c9p-19, 0x1.91c2e102d5e49p-20, +-0x1.e9f0826c2149ep-21, 0x1.60eebaea236e1p-23 +#endif +}, +.erfc_poly_F = { +#if ERFC_POLY_F_NCOEFFS == 17 +/* Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=16 a=4 b=5.90625 c=2 d=4 */ +0x1.08ddd130d1fa6p-26, -0x1.10b146f59ff06p-22, 0x1.10b135328b7b2p-19, +-0x1.6039988e7575fp-17, 0x1.497d365e19367p-15, -0x1.da48d9afac83ep-14, +0x1.1024c9b1fbb48p-12, -0x1.fc962e7066272p-12, 0x1.87297282d4651p-11, +-0x1.f057b255f8c59p-11, 0x1.0228d0eee063p-10, -0x1.b1b21b84ec41cp-11, +0x1.1ead8ae9e1253p-11, -0x1.1e708fba37fccp-12, 0x1.9559363991edap-14, +-0x1.68c827b783d9cp-16, 0x1.2ec4adeccf4a2p-19 +#endif +} +}; + diff --git a/math/math_config.h b/math/math_config.h index 8a51504..09755ad 100644 --- a/math/math_config.h +++ b/math/math_config.h @@ -440,4 +440,23 @@ extern const struct erff_data float erff_poly_B[7]; } __erff_data HIDDEN; +#define ERF_POLY_A_ORDER 19 +#define ERF_POLY_A_NCOEFFS 10 +#define ERFC_POLY_C_NCOEFFS 16 +#define ERFC_POLY_D_NCOEFFS 18 +#define ERFC_POLY_E_NCOEFFS 14 +#define ERFC_POLY_F_NCOEFFS 17 +extern const struct erf_data +{ + double erf_poly_A[ERF_POLY_A_NCOEFFS]; + double erf_ratio_N_A[5]; + double erf_ratio_D_A[5]; + double erf_ratio_N_B[7]; + double erf_ratio_D_B[6]; + double erfc_poly_C[ERFC_POLY_C_NCOEFFS]; + double erfc_poly_D[ERFC_POLY_D_NCOEFFS]; + double erfc_poly_E[ERFC_POLY_E_NCOEFFS]; + double erfc_poly_F[ERFC_POLY_F_NCOEFFS]; +} __erf_data HIDDEN; + #endif diff --git a/math/test/mathbench.c b/math/test/mathbench.c index 2bca52b..2e8aaf8 100644 --- a/math/test/mathbench.c +++ b/math/test/mathbench.c @@ -248,6 +248,7 @@ D (log2, 0.999, 1.001) {"pow", 'd', 0, 0.01, 11.1, {.d = xypow}}, D (xpow, 0.01, 11.1) D (ypow, -9.9, 9.9) +D (erf, -6.0, 6.0) F (dummyf, 1.0, 2.0) F (expf, -9.9, 9.9) diff --git a/math/test/runulp.sh b/math/test/runulp.sh index 283145e..492a6f2 100755 --- a/math/test/runulp.sh +++ b/math/test/runulp.sh @@ -72,6 +72,14 @@ t pow 0x1.ffffffffffff0p-1 0x1.0000000000008p0 x 0x1p60 0x1p68 50000 t pow 0x1.ffffffffff000p-1 0x1p0 x 0x1p50 0x1p52 50000 t pow -0x1.ffffffffff000p-1 -0x1p0 x 0x1p50 0x1p52 50000 +L=1.0 +t erf 0 0xffff000000000000 10000 +t erf 0x1p-1022 0x1p-26 40000 +t erf -0x1p-1022 -0x1p-26 40000 +t erf 0x1p-26 0x1p3 40000 +t erf -0x1p-26 -0x1p3 40000 +t erf 0 inf 40000 + L=0.01 t expf 0 0xffff0000 10000 t expf 0x1p-14 0x1p8 50000 diff --git a/math/test/testcases/directed/erf.tst b/math/test/testcases/directed/erf.tst new file mode 100644 index 0000000..7fa4d18 --- /dev/null +++ b/math/test/testcases/directed/erf.tst @@ -0,0 +1,17 @@ +; erf.tst - Directed test cases for erf +; +; Copyright (c) 2007-2020, Arm Limited. +; SPDX-License-Identifier: MIT + +func=erf op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=erf op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=erf op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=erf op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=erf op1=7ff00000.00000000 result=3ff00000.00000000 errno=0 +func=erf op1=fff00000.00000000 result=bff00000.00000000 errno=0 +func=erf op1=00000000.00000000 result=00000000.00000000 errno=ERANGE +func=erf op1=80000000.00000000 result=80000000.00000000 errno=ERANGE +func=erf op1=00000000.00000001 result=00000000.00000001 errno=0 status=ux +func=erf op1=80000000.00000001 result=80000000.00000001 errno=0 status=ux +func=erf op1=3ff00000.00000000 result=3feaf767.a741088a.c6d errno=0 +func=erf op1=bff00000.00000000 result=bfeaf767.a741088a.c6d errno=0 diff --git a/math/test/ulp.c b/math/test/ulp.c index 3701c93..05a5321 100644 --- a/math/test/ulp.c +++ b/math/test/ulp.c @@ -337,6 +337,7 @@ static const struct fun fun[] = { D1 (log) D1 (log2) D2 (pow) + D1 (erf) #if WANT_VMATH F (__s_sinf, __s_sinf, sin, mpfr_sin, 1, 1, f1, 0) F (__s_cosf, __s_cosf, cos, mpfr_cos, 1, 1, f1, 0) |