diff options
Diffstat (limited to 'pl/math/erff_1u5.c')
-rw-r--r-- | pl/math/erff_1u5.c | 108 |
1 files changed, 108 insertions, 0 deletions
diff --git a/pl/math/erff_1u5.c b/pl/math/erff_1u5.c new file mode 100644 index 0000000..1a69872 --- /dev/null +++ b/pl/math/erff_1u5.c @@ -0,0 +1,108 @@ +/* + * Single-precision erf(x) function. + * + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "estrinf.h" +#include "hornerf.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f +#define A __erff_data.erff_poly_A +#define B __erff_data.erff_poly_B + +/* Top 12 bits of a float. */ +static inline uint32_t +top12 (float x) +{ + return asuint (x) >> 20; +} + +/* Efficient implementation of erff using either a pure polynomial approximation + or the exponential of a polynomial. Worst-case error is 1.09ulps at + 0x1.c111acp-1. */ +float +erff (float x) +{ + float r, x2; + + /* Get top word. */ + uint32_t ix = asuint (x); + uint32_t sign = ix >> 31; + uint32_t ia12 = top12 (x) & 0x7ff; + + /* Limit of both intervals is 0.875 for performance reasons but coefficients + computed on [0.0, 0.921875] and [0.921875, 4.0], which brought accuracy + from 0.94 to 1.1ulps. */ + if (ia12 < 0x3f6) + { /* a = |x| < 0.875. */ + + /* Tiny and subnormal cases. */ + if (unlikely (ia12 < 0x318)) + { /* |x| < 2^(-28). */ + if (unlikely (ia12 < 0x040)) + { /* |x| < 2^(-119). */ + float y = fmaf (TwoOverSqrtPiMinusOne, x, x); + return check_uflowf (y); + } + return x + TwoOverSqrtPiMinusOne * x; + } + + x2 = x * x; + + /* Normalized cases (|x| < 0.921875) - Use Horner scheme for x+x*P(x^2). + */ +#define C(i) A[i] + r = fmaf (HORNER_5 (x2, C), x, x); +#undef C + } + else if (ia12 < 0x408) + { /* |x| < 4.0 - Use a custom Estrin scheme. */ + + float a = fabsf (x); + /* Use Estrin scheme on high order (small magnitude) coefficients. */ +#define C(i) B[i] + r = ESTRIN_3_ (a, x * x, C, 3); +#undef C + /* Then switch to pure Horner scheme. */ + r = fmaf (r, a, B[2]); + r = fmaf (r, a, B[1]); + r = fmaf (r, a, B[0]); + r = fmaf (r, a, a); + /* Single precision exponential with ~0.5ulps ensures erff has maximum + relative error below 1ulp on [0.921875, 4.0] and below 1.1ulps on + [0.875, 4.0]. */ + r = expf (-r); + /* Explicit copysign (calling copysignf increases latency). */ + if (sign) + r = -1.0f + r; + else + r = 1.0f - r; + } + else + { /* |x| >= 4.0. */ + + /* Special cases : erff(nan)=nan, erff(+inf)=+1 and erff(-inf)=-1. */ + if (unlikely (ia12 >= 0x7f8)) + return (1.f - (float) ((ix >> 31) << 1)) + 1.f / x; + + /* Explicit copysign (calling copysignf increases latency). */ + if (sign) + r = -1.0f; + else + r = 1.0f; + } + return r; +} + +PL_SIG (S, F, 1, erf, -4.0, 4.0) +PL_TEST_ULP (erff, 0.6) +PL_TEST_INTERVAL (erff, 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (erff, 0x1p-127, 0x1p-26, 40000) +PL_TEST_INTERVAL (erff, -0x1p-127, -0x1p-26, 40000) +PL_TEST_INTERVAL (erff, 0x1p-26, 0x1p3, 40000) +PL_TEST_INTERVAL (erff, -0x1p-26, -0x1p3, 40000) +PL_TEST_INTERVAL (erff, 0, inf, 40000) |