diff options
author | Pierre Blanchard <pierre.blanchard@arm.com> | 2020-10-29 15:50:19 +0000 |
---|---|---|
committer | Szabolcs Nagy <szabolcs.nagy@arm.com> | 2020-10-29 16:36:28 +0000 |
commit | 1a79237df2fc1020ccce8528ed54977681302752 (patch) | |
tree | ecc2bbb51711e7cd12e36658e9f3c47d401b8139 /math | |
parent | 0f4ae0c5b561de25acb10130fd5e473ec038f89d (diff) | |
download | arm-optimized-routines-1a79237df2fc1020ccce8528ed54977681302752.tar.gz |
math: add scalar erff.
In round-to-nearest mode the maximum error is 1.09 ULP.
Compared to glibc-2.28 erff: throughput is about 2.2x better,
latency is about 1.5x better on some AArch64 cores (on random
input in [-4,4]).
There are further optimization and quality improvement opportunities.
Diffstat (limited to 'math')
-rw-r--r-- | math/erff.c | 104 | ||||
-rw-r--r-- | math/erff_data.c | 22 | ||||
-rw-r--r-- | math/math_config.h | 24 | ||||
-rw-r--r-- | math/math_errf.c | 14 | ||||
-rw-r--r-- | math/test/mathbench.c | 1 | ||||
-rwxr-xr-x | math/test/runulp.sh | 11 | ||||
-rw-r--r-- | math/test/testcases/directed/erff.tst | 19 | ||||
-rw-r--r-- | math/test/ulp.c | 1 |
8 files changed, 196 insertions, 0 deletions
diff --git a/math/erff.c b/math/erff.c new file mode 100644 index 0000000..9b9ccc2 --- /dev/null +++ b/math/erff.c @@ -0,0 +1,104 @@ +/* + * Single-precision erf(x) function. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include <stdint.h> +#include <math.h> +#include "math_config.h" + +#define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f +#define A __erff_data.erff_poly_A +#define B __erff_data.erff_poly_B + +/* Top 12 bits of a float. */ +static inline uint32_t +top12 (float x) +{ + return asuint (x) >> 20; +} + +/* Efficient implementation of erff + using either a pure polynomial approximation or + the exponential of a polynomial. + Worst-case error is 1.09ulps at 0x1.c111acp-1. */ +float +erff (float x) +{ + float r, x2, u; + + /* Get top word. */ + uint32_t ix = asuint (x); + uint32_t sign = ix >> 31; + uint32_t ia12 = top12 (x) & 0x7ff; + + /* Limit of both intervals is 0.875 for performance reasons but coefficients + computed on [0.0, 0.921875] and [0.921875, 4.0], which brought accuracy + from 0.94 to 1.1ulps. */ + if (ia12 < 0x3f6) + { /* a = |x| < 0.875. */ + + /* Tiny and subnormal cases. */ + if (unlikely (ia12 < 0x318)) + { /* |x| < 2^(-28). */ + if (unlikely (ia12 < 0x040)) + { /* |x| < 2^(-119). */ + float y = x + TwoOverSqrtPiMinusOne * x; + return check_uflowf (y); + } + return x + TwoOverSqrtPiMinusOne * x; + } + + x2 = x * x; + + /* Normalized cases (|x| < 0.921875). Use Horner scheme for x+x*P(x^2). */ + r = A[5]; + r = fmaf (r, x2, A[4]); + r = fmaf (r, x2, A[3]); + r = fmaf (r, x2, A[2]); + r = fmaf (r, x2, A[1]); + r = fmaf (r, x2, A[0]); + r = fmaf (r, x, x); + } + else if (ia12 < 0x408) + { /* |x| < 4.0 - Use a custom Estrin scheme. */ + + float a = fabsf (x); + /* Start with Estrin scheme on high order (small magnitude) coefficients. */ + r = fmaf (B[6], a, B[5]); + u = fmaf (B[4], a, B[3]); + x2 = x * x; + r = fmaf (r, x2, u); + /* Then switch to pure Horner scheme. */ + r = fmaf (r, a, B[2]); + r = fmaf (r, a, B[1]); + r = fmaf (r, a, B[0]); + r = fmaf (r, a, a); + /* Single precision exponential with ~0.5ulps, + ensures erff has max. rel. error + < 1ulp on [0.921875, 4.0], + < 1.1ulps on [0.875, 4.0]. */ + r = expf (-r); + /* Explicit copysign (calling copysignf increases latency). */ + if (sign) + r = -1.0f + r; + else + r = 1.0f - r; + } + else + { /* |x| >= 4.0. */ + + /* Special cases : erff(nan)=nan, erff(+inf)=+1 and erff(-inf)=-1. */ + if (unlikely (ia12 >= 0x7f8)) + return (1.f - (float) ((ix >> 31) << 1)) + 1.f / x; + + /* Explicit copysign (calling copysignf increases latency). */ + if (sign) + r = -1.0f; + else + r = 1.0f; + } + return r; +} diff --git a/math/erff_data.c b/math/erff_data.c new file mode 100644 index 0000000..fa6b1ef --- /dev/null +++ b/math/erff_data.c @@ -0,0 +1,22 @@ +/* + * Data for approximation of erff. + * + * Copyright (c) 2019-2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "math_config.h" + +/* Minimax approximation of erff. */ +const struct erff_data __erff_data = { +.erff_poly_A = { +0x1.06eba6p-03f, -0x1.8126e0p-02f, 0x1.ce1a46p-04f, +-0x1.b68bd2p-06f, 0x1.473f48p-08f, -0x1.3a1a82p-11f +}, +.erff_poly_B = { +0x1.079d0cp-3f, 0x1.450aa0p-1f, 0x1.b55cb0p-4f, +-0x1.8d6300p-6f, 0x1.fd1336p-9f, -0x1.91d2ccp-12f, +0x1.222900p-16f +} +}; + diff --git a/math/math_config.h b/math/math_config.h index 85fc584..8a51504 100644 --- a/math/math_config.h +++ b/math/math_config.h @@ -298,6 +298,24 @@ check_uflow (double x) return WANT_ERRNO ? __math_check_uflow (x) : x; } +/* Check if the result overflowed to infinity. */ +HIDDEN float __math_check_oflowf (float); +/* Check if the result underflowed to 0. */ +HIDDEN float __math_check_uflowf (float); + +/* Check if the result overflowed to infinity. */ +static inline float +check_oflowf (float x) +{ + return WANT_ERRNO ? __math_check_oflowf (x) : x; +} + +/* Check if the result underflowed to 0. */ +static inline float +check_uflowf (float x) +{ + return WANT_ERRNO ? __math_check_uflowf (x) : x; +} /* Shared between expf, exp2f and powf. */ #define EXP2F_TABLE_BITS 5 @@ -416,4 +434,10 @@ extern const struct pow_log_data struct {double invc, pad, logc, logctail;} tab[1 << POW_LOG_TABLE_BITS]; } __pow_log_data HIDDEN; +extern const struct erff_data +{ + float erff_poly_A[6]; + float erff_poly_B[7]; +} __erff_data HIDDEN; + #endif diff --git a/math/math_errf.c b/math/math_errf.c index 07154c5..771d982 100644 --- a/math/math_errf.c +++ b/math/math_errf.c @@ -64,3 +64,17 @@ __math_invalidf (float x) float y = (x - x) / (x - x); return isnan (x) ? y : with_errnof (y, EDOM); } + +/* Check result and set errno if necessary. */ + +HIDDEN float +__math_check_uflowf (float y) +{ + return y == 0.0f ? with_errnof (y, ERANGE) : y; +} + +HIDDEN float +__math_check_oflowf (float y) +{ + return isinf (y) ? with_errnof (y, ERANGE) : y; +} diff --git a/math/test/mathbench.c b/math/test/mathbench.c index 33ceda3..2bca52b 100644 --- a/math/test/mathbench.c +++ b/math/test/mathbench.c @@ -275,6 +275,7 @@ F (cosf, -3.1, 3.1) F (cosf, 3.3, 33.3) F (cosf, 100, 1000) F (cosf, 1e6, 1e32) +F (erff, -4.0, 4.0) #if WANT_VMATH D (__s_sin, -3.1, 3.1) D (__s_cos, -3.1, 3.1) diff --git a/math/test/runulp.sh b/math/test/runulp.sh index a8c391b..283145e 100755 --- a/math/test/runulp.sh +++ b/math/test/runulp.sh @@ -119,6 +119,17 @@ t powf 0x1p-70 0x1p70 x 0x1p-1 0x1p1 50000 t powf 0x1p-70 0x1p70 x -0x1p-1 -0x1p1 50000 t powf 0x1.ep-1 0x1.1p0 x 0x1p8 0x1p14 50000 t powf 0x1.ep-1 0x1.1p0 x -0x1p8 -0x1p14 50000 + +L=0.6 +Ldir=0.9 +t erff 0 0xffff0000 10000 +t erff 0x1p-127 0x1p-26 40000 +t erff -0x1p-127 -0x1p-26 40000 +t erff 0x1p-26 0x1p3 40000 +t erff -0x1p-26 -0x1p3 40000 +t erff 0 inf 40000 +Ldir=0.5 + done # vector functions diff --git a/math/test/testcases/directed/erff.tst b/math/test/testcases/directed/erff.tst new file mode 100644 index 0000000..8d4ec40 --- /dev/null +++ b/math/test/testcases/directed/erff.tst @@ -0,0 +1,19 @@ +; erff.tst +; +; Copyright 2009 ARM Limited. All rights reserved. +; +; $Id$ +; $URL$ + +func=erff op1=7fc00001 result=7fc00001 errno=0 +func=erff op1=ffc00001 result=7fc00001 errno=0 +func=erff op1=7f800001 result=7fc00001 errno=0 status=i +func=erff op1=ff800001 result=7fc00001 errno=0 status=i +func=erff op1=7f800000 result=3f800000 errno=0 +func=erff op1=ff800000 result=bf800000 errno=0 +func=erff op1=00000000 result=00000000 errno=ERANGE +func=erff op1=80000000 result=80000000 errno=ERANGE +func=erff op1=00000001 result=00000001 errno=0 status=ux +func=erff op1=80000001 result=80000001 errno=0 status=ux +func=erff op1=3f800000 result=3f57bb3d.3a0 errno=0 +func=erff op1=bf800000 result=bf57bb3d.3a0 errno=0 diff --git a/math/test/ulp.c b/math/test/ulp.c index 371567a..3701c93 100644 --- a/math/test/ulp.c +++ b/math/test/ulp.c @@ -331,6 +331,7 @@ static const struct fun fun[] = { F1 (log) F1 (log2) F2 (pow) + F1 (erf) D1 (exp) D1 (exp2) D1 (log) |