aboutsummaryrefslogtreecommitdiff
path: root/math
diff options
context:
space:
mode:
authorPierre Blanchard <pierre.blanchard@arm.com>2020-10-29 15:50:19 +0000
committerSzabolcs Nagy <szabolcs.nagy@arm.com>2020-10-29 16:36:28 +0000
commit1a79237df2fc1020ccce8528ed54977681302752 (patch)
treeecc2bbb51711e7cd12e36658e9f3c47d401b8139 /math
parent0f4ae0c5b561de25acb10130fd5e473ec038f89d (diff)
downloadarm-optimized-routines-1a79237df2fc1020ccce8528ed54977681302752.tar.gz
math: add scalar erff.
In round-to-nearest mode the maximum error is 1.09 ULP. Compared to glibc-2.28 erff: throughput is about 2.2x better, latency is about 1.5x better on some AArch64 cores (on random input in [-4,4]). There are further optimization and quality improvement opportunities.
Diffstat (limited to 'math')
-rw-r--r--math/erff.c104
-rw-r--r--math/erff_data.c22
-rw-r--r--math/math_config.h24
-rw-r--r--math/math_errf.c14
-rw-r--r--math/test/mathbench.c1
-rwxr-xr-xmath/test/runulp.sh11
-rw-r--r--math/test/testcases/directed/erff.tst19
-rw-r--r--math/test/ulp.c1
8 files changed, 196 insertions, 0 deletions
diff --git a/math/erff.c b/math/erff.c
new file mode 100644
index 0000000..9b9ccc2
--- /dev/null
+++ b/math/erff.c
@@ -0,0 +1,104 @@
+/*
+ * Single-precision erf(x) function.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <math.h>
+#include "math_config.h"
+
+#define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f
+#define A __erff_data.erff_poly_A
+#define B __erff_data.erff_poly_B
+
+/* Top 12 bits of a float. */
+static inline uint32_t
+top12 (float x)
+{
+ return asuint (x) >> 20;
+}
+
+/* Efficient implementation of erff
+ using either a pure polynomial approximation or
+ the exponential of a polynomial.
+ Worst-case error is 1.09ulps at 0x1.c111acp-1. */
+float
+erff (float x)
+{
+ float r, x2, u;
+
+ /* Get top word. */
+ uint32_t ix = asuint (x);
+ uint32_t sign = ix >> 31;
+ uint32_t ia12 = top12 (x) & 0x7ff;
+
+ /* Limit of both intervals is 0.875 for performance reasons but coefficients
+ computed on [0.0, 0.921875] and [0.921875, 4.0], which brought accuracy
+ from 0.94 to 1.1ulps. */
+ if (ia12 < 0x3f6)
+ { /* a = |x| < 0.875. */
+
+ /* Tiny and subnormal cases. */
+ if (unlikely (ia12 < 0x318))
+ { /* |x| < 2^(-28). */
+ if (unlikely (ia12 < 0x040))
+ { /* |x| < 2^(-119). */
+ float y = x + TwoOverSqrtPiMinusOne * x;
+ return check_uflowf (y);
+ }
+ return x + TwoOverSqrtPiMinusOne * x;
+ }
+
+ x2 = x * x;
+
+ /* Normalized cases (|x| < 0.921875). Use Horner scheme for x+x*P(x^2). */
+ r = A[5];
+ r = fmaf (r, x2, A[4]);
+ r = fmaf (r, x2, A[3]);
+ r = fmaf (r, x2, A[2]);
+ r = fmaf (r, x2, A[1]);
+ r = fmaf (r, x2, A[0]);
+ r = fmaf (r, x, x);
+ }
+ else if (ia12 < 0x408)
+ { /* |x| < 4.0 - Use a custom Estrin scheme. */
+
+ float a = fabsf (x);
+ /* Start with Estrin scheme on high order (small magnitude) coefficients. */
+ r = fmaf (B[6], a, B[5]);
+ u = fmaf (B[4], a, B[3]);
+ x2 = x * x;
+ r = fmaf (r, x2, u);
+ /* Then switch to pure Horner scheme. */
+ r = fmaf (r, a, B[2]);
+ r = fmaf (r, a, B[1]);
+ r = fmaf (r, a, B[0]);
+ r = fmaf (r, a, a);
+ /* Single precision exponential with ~0.5ulps,
+ ensures erff has max. rel. error
+ < 1ulp on [0.921875, 4.0],
+ < 1.1ulps on [0.875, 4.0]. */
+ r = expf (-r);
+ /* Explicit copysign (calling copysignf increases latency). */
+ if (sign)
+ r = -1.0f + r;
+ else
+ r = 1.0f - r;
+ }
+ else
+ { /* |x| >= 4.0. */
+
+ /* Special cases : erff(nan)=nan, erff(+inf)=+1 and erff(-inf)=-1. */
+ if (unlikely (ia12 >= 0x7f8))
+ return (1.f - (float) ((ix >> 31) << 1)) + 1.f / x;
+
+ /* Explicit copysign (calling copysignf increases latency). */
+ if (sign)
+ r = -1.0f;
+ else
+ r = 1.0f;
+ }
+ return r;
+}
diff --git a/math/erff_data.c b/math/erff_data.c
new file mode 100644
index 0000000..fa6b1ef
--- /dev/null
+++ b/math/erff_data.c
@@ -0,0 +1,22 @@
+/*
+ * Data for approximation of erff.
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "math_config.h"
+
+/* Minimax approximation of erff. */
+const struct erff_data __erff_data = {
+.erff_poly_A = {
+0x1.06eba6p-03f, -0x1.8126e0p-02f, 0x1.ce1a46p-04f,
+-0x1.b68bd2p-06f, 0x1.473f48p-08f, -0x1.3a1a82p-11f
+},
+.erff_poly_B = {
+0x1.079d0cp-3f, 0x1.450aa0p-1f, 0x1.b55cb0p-4f,
+-0x1.8d6300p-6f, 0x1.fd1336p-9f, -0x1.91d2ccp-12f,
+0x1.222900p-16f
+}
+};
+
diff --git a/math/math_config.h b/math/math_config.h
index 85fc584..8a51504 100644
--- a/math/math_config.h
+++ b/math/math_config.h
@@ -298,6 +298,24 @@ check_uflow (double x)
return WANT_ERRNO ? __math_check_uflow (x) : x;
}
+/* Check if the result overflowed to infinity. */
+HIDDEN float __math_check_oflowf (float);
+/* Check if the result underflowed to 0. */
+HIDDEN float __math_check_uflowf (float);
+
+/* Check if the result overflowed to infinity. */
+static inline float
+check_oflowf (float x)
+{
+ return WANT_ERRNO ? __math_check_oflowf (x) : x;
+}
+
+/* Check if the result underflowed to 0. */
+static inline float
+check_uflowf (float x)
+{
+ return WANT_ERRNO ? __math_check_uflowf (x) : x;
+}
/* Shared between expf, exp2f and powf. */
#define EXP2F_TABLE_BITS 5
@@ -416,4 +434,10 @@ extern const struct pow_log_data
struct {double invc, pad, logc, logctail;} tab[1 << POW_LOG_TABLE_BITS];
} __pow_log_data HIDDEN;
+extern const struct erff_data
+{
+ float erff_poly_A[6];
+ float erff_poly_B[7];
+} __erff_data HIDDEN;
+
#endif
diff --git a/math/math_errf.c b/math/math_errf.c
index 07154c5..771d982 100644
--- a/math/math_errf.c
+++ b/math/math_errf.c
@@ -64,3 +64,17 @@ __math_invalidf (float x)
float y = (x - x) / (x - x);
return isnan (x) ? y : with_errnof (y, EDOM);
}
+
+/* Check result and set errno if necessary. */
+
+HIDDEN float
+__math_check_uflowf (float y)
+{
+ return y == 0.0f ? with_errnof (y, ERANGE) : y;
+}
+
+HIDDEN float
+__math_check_oflowf (float y)
+{
+ return isinf (y) ? with_errnof (y, ERANGE) : y;
+}
diff --git a/math/test/mathbench.c b/math/test/mathbench.c
index 33ceda3..2bca52b 100644
--- a/math/test/mathbench.c
+++ b/math/test/mathbench.c
@@ -275,6 +275,7 @@ F (cosf, -3.1, 3.1)
F (cosf, 3.3, 33.3)
F (cosf, 100, 1000)
F (cosf, 1e6, 1e32)
+F (erff, -4.0, 4.0)
#if WANT_VMATH
D (__s_sin, -3.1, 3.1)
D (__s_cos, -3.1, 3.1)
diff --git a/math/test/runulp.sh b/math/test/runulp.sh
index a8c391b..283145e 100755
--- a/math/test/runulp.sh
+++ b/math/test/runulp.sh
@@ -119,6 +119,17 @@ t powf 0x1p-70 0x1p70 x 0x1p-1 0x1p1 50000
t powf 0x1p-70 0x1p70 x -0x1p-1 -0x1p1 50000
t powf 0x1.ep-1 0x1.1p0 x 0x1p8 0x1p14 50000
t powf 0x1.ep-1 0x1.1p0 x -0x1p8 -0x1p14 50000
+
+L=0.6
+Ldir=0.9
+t erff 0 0xffff0000 10000
+t erff 0x1p-127 0x1p-26 40000
+t erff -0x1p-127 -0x1p-26 40000
+t erff 0x1p-26 0x1p3 40000
+t erff -0x1p-26 -0x1p3 40000
+t erff 0 inf 40000
+Ldir=0.5
+
done
# vector functions
diff --git a/math/test/testcases/directed/erff.tst b/math/test/testcases/directed/erff.tst
new file mode 100644
index 0000000..8d4ec40
--- /dev/null
+++ b/math/test/testcases/directed/erff.tst
@@ -0,0 +1,19 @@
+; erff.tst
+;
+; Copyright 2009 ARM Limited. All rights reserved.
+;
+; $Id$
+; $URL$
+
+func=erff op1=7fc00001 result=7fc00001 errno=0
+func=erff op1=ffc00001 result=7fc00001 errno=0
+func=erff op1=7f800001 result=7fc00001 errno=0 status=i
+func=erff op1=ff800001 result=7fc00001 errno=0 status=i
+func=erff op1=7f800000 result=3f800000 errno=0
+func=erff op1=ff800000 result=bf800000 errno=0
+func=erff op1=00000000 result=00000000 errno=ERANGE
+func=erff op1=80000000 result=80000000 errno=ERANGE
+func=erff op1=00000001 result=00000001 errno=0 status=ux
+func=erff op1=80000001 result=80000001 errno=0 status=ux
+func=erff op1=3f800000 result=3f57bb3d.3a0 errno=0
+func=erff op1=bf800000 result=bf57bb3d.3a0 errno=0
diff --git a/math/test/ulp.c b/math/test/ulp.c
index 371567a..3701c93 100644
--- a/math/test/ulp.c
+++ b/math/test/ulp.c
@@ -331,6 +331,7 @@ static const struct fun fun[] = {
F1 (log)
F1 (log2)
F2 (pow)
+ F1 (erf)
D1 (exp)
D1 (exp2)
D1 (log)