author     Szabolcs Nagy <szabolcs.nagy@arm.com>  2018-06-21 17:53:15 +0100
committer  Szabolcs Nagy <szabolcs.nagy@arm.com>  2018-06-22 14:38:31 +0100
commit     a6230320c149a8e1ae790433fe0828e6060c53fa (patch)
tree       a39a631bc71a5786cda7746fceb7d04d5007ca24 /math/pow.c
parent     db6e4e96bea641fff18006803c4b1eea19d664f7 (diff)
download   arm-optimized-routines-a6230320c149a8e1ae790433fe0828e6060c53fa.tar.gz
Improve pow implementation
The log part of pow was rewritten to use a slightly different algorithm. This improves precision and throughput while keeping the same table size. Inputs near 1 are no longer special-cased, so there is a slight performance regression in that case, and when the fma instruction is not available the algorithm is expected to perform slightly worse. Worst-case error improved from 0.67 ULP to 0.57 ULP.

On Cortex-A72 I see:

throughput near 1: 7% worse
latency near 1: 2% worse
throughput general: 8% better
latency general: 2% better
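Informally, the error bound quoted in the updated source comment can be unpacked as follows (a sanity check under my reading of the formula, not part of the commit): pow is evaluated as exp (y * log (x)), the argument of exp is at most about 1024*Ln2 ~= 709.8 in magnitude before the double result overflows or underflows, so a relative error relerr_log in the computed log shifts the argument of exp by at most 1024*Ln2*relerr_log, and an absolute shift d there perturbs the result by a relative factor of about d, i.e. roughly d*2^53 ULP. Plugging in the old numbers:

  0.509 + 1024*Ln2 * 1.8*2^-66 * 2^53
    ~= 0.509 + 709.8 * 1.8 * 2^-13
    ~= 0.509 + 0.156
    ~= 0.67 ULP

With the new relerr_log of 1.3*2^-68 the log contribution drops to about 0.03 ULP.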
Diffstat (limited to 'math/pow.c')
-rw-r--r--  math/pow.c  98
1 file changed, 42 insertions(+), 56 deletions(-)
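The core of the new scheme is the table-driven range reduction x = 2^k * z. Below is a minimal standalone sketch of that step; TABLE_BITS = 7 is an assumed illustrative value for POW_LOG_TABLE_BITS, asdouble/asuint64 stand in for the math_config.h helpers, and only OFF and the bit manipulation are taken from the diff that follows:

  #include <stdint.h>
  #include <string.h>
  #include <stdio.h>

  #define TABLE_BITS 7 /* assumed; the real value is POW_LOG_TABLE_BITS */
  #define N (1 << TABLE_BITS)
  #define OFF 0x3fe6955500000000ULL

  static double asdouble (uint64_t i) { double d; memcpy (&d, &i, 8); return d; }
  static uint64_t asuint64 (double d) { uint64_t i; memcpy (&i, &d, 8); return i; }

  int main (void)
  {
    double x = 10.0;
    uint64_t ix = asuint64 (x);
    /* Subtracting OFF biases the exponent so that z lands in
       [asdouble (OFF), 2*asdouble (OFF)) ~= [0.706, 1.411).  */
    uint64_t tmp = ix - OFF;
    int i = (tmp >> (52 - TABLE_BITS)) % N;    /* subinterval index */
    int k = (int64_t) tmp >> 52;               /* arithmetic shift extracts k */
    uint64_t iz = ix - (tmp & 0xfffULL << 52); /* remove 2^k: z = x / 2^k exactly */
    double z = asdouble (iz);
    printf ("x = %g: k = %d, i = %d, z = %a\n", x, k, i, z);
    /* log(x) is then k*Ln2 + log(c) + log1p(z/c - 1), with c near the
       center of subinterval i and invc, logc, logctail read from the table.  */
    return 0;
  }

For x = 10.0 this prints k = 3 and z = 0x1.4p+0, i.e. 10 = 2^3 * 1.25.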
diff --git a/math/pow.c b/math/pow.c
index c9c0798..3bc26b5 100644
--- a/math/pow.c
+++ b/math/pow.c
@@ -22,18 +22,17 @@
#include "math_config.h"
/*
-Worst-case error: 0.67 ULP (~= ulperr_exp + 1024*Ln2*relerr_log*2^53)
-relerr_log: 1.8 * 2^-66 (Relative error of log)
+Worst-case error: 0.57 ULP (~= ulperr_exp + 1024*Ln2*relerr_log*2^53)
+relerr_log: 1.3 * 2^-68 (Relative error of log, 1.4 * 2^-68 without fma)
ulperr_exp: 0.509 ULP (ULP error of exp)
*/
#define T __pow_log_data.tab
-#define B __pow_log_data.poly1
#define A __pow_log_data.poly
#define Ln2hi __pow_log_data.ln2hi
#define Ln2lo __pow_log_data.ln2lo
#define N (1 << POW_LOG_TABLE_BITS)
-#define OFF 0x3fe6000000000000
+#define OFF 0x3fe6955500000000
static inline uint32_t
top12 (double x)
@@ -45,37 +44,10 @@ static inline double_t
log_inline (uint64_t ix, double_t *tail)
{
/* double_t for better performance on targets with FLT_EVAL_METHOD==2. */
- double_t w, z, zhi, zlo, r, r2, r3, y, invc, logc, kd, hi, lo, khi, rhi, rlo, p, q;
+ double_t z, r, y, invc, logc, logctail, kd, hi, t1, t2, lo, lo1, lo2, p;
uint64_t iz, tmp;
int k, i;
-#if POW_LOG_POLY1_ORDER == 9
-# define LO asuint64 (1 - 0x1.1p-7)
-# define HI asuint64 (1 + 0x1.98p-7)
-#endif
- if (unlikely (ix - LO < HI - LO))
- {
- r = asdouble (ix) - 1.0;
- /* Split r into top and bottom half. */
- w = r * 0x1p27;
- rhi = r + w - w;
- rlo = r - rhi;
- /* Compute r - r*r/2 precisely into hi+lo. */
- w = rhi*rhi*B[0]; /* B[0] == -0.5. */
- hi = r + w;
- lo = r - hi + w;
- lo += B[0]*rlo*(rhi + r);
- r2 = r*r;
- r3 = r*r2;
-#if POW_LOG_POLY1_ORDER == 9
- p = B[1] + r*(B[2] + r*B[3] + r2*B[4] + r3*(B[5] + r*B[6] + r2*B[7]));
-#endif
- q = lo + r3*p;
- y = hi + q;
- *tail = (hi - y) + q;
- return y;
- }
-
/* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
The range is split into N subintervals.
The ith subinterval contains z and c is near its center. */
@@ -83,41 +55,55 @@ log_inline (uint64_t ix, double_t *tail)
i = (tmp >> (52 - POW_LOG_TABLE_BITS)) % N;
k = (int64_t) tmp >> 52; /* arithmetic shift */
iz = ix - (tmp & 0xfffULL << 52);
- invc = T[i].invc;
- logc = T[i].logc;
z = asdouble (iz);
- zhi = asdouble ((iz + (1ULL<<31)) & (-1ULL << 32));
- zlo = z - zhi;
-
- /* log(x) = log1p(z/c-1) + log(c) + k*Ln2 */
- rhi = zhi * invc - 1.0;
- rlo = zlo * invc;
kd = (double_t) k;
- /* hi + lo = r + log(c) + k*Ln2. */
- khi = kd * Ln2hi;
- w = khi + logc;
- lo = khi - w + logc;
- hi = w + rhi;
- lo = w - hi + rhi + (lo + kd*Ln2lo) + rlo;
+ /* log(x) = k*Ln2 + log(c) + log1p(z/c-1). */
+ invc = T[i].invc;
+ logc = T[i].logc;
+ logctail = T[i].logctail;
- /* log(x) = lo + (log1p(r) - r) + hi. */
- /* Evaluation is optimized assuming superscalar pipelined execution. */
+ /* r = z/c - 1, arranged to be exact. */
#if HAVE_FAST_FMA
r = fma (z, invc, -1.0);
#else
+ double_t zhi = asdouble (iz & (-1ULL << 32));
+ double_t zlo = z - zhi;
+ double_t rhi = zhi * invc - 1.0;
+ double_t rlo = zlo * invc;
r = rhi + rlo;
#endif
- r2 = r * r;
-#if POW_LOG_POLY_ORDER == 7
- p = lo + r*r2*(A[1] + r*A[2] + r2*(A[3] + r*A[4] + r2*A[5]));
+ /* k*Ln2 + log(c) + r. */
+ t1 = kd * Ln2hi + logc;
+ t2 = t1 + r;
+ lo1 = kd * Ln2lo + logctail;
+ lo2 = t1 - t2 + r;
+
+ /* Evaluation is optimized assuming superscalar pipelined execution. */
+ double_t ar, ar2, ar3, lo3, lo4;
+ ar = A[0] * r; /* A[0] = -0.5. */
+ ar2 = r * ar;
+ ar3 = r * ar2;
+ /* k*Ln2 + log(c) + r + A[0]*r*r. */
+#if HAVE_FAST_FMA
+ hi = t2 + ar2;
+ lo3 = fma (ar, r, -ar2);
+ lo4 = t2 - hi + ar2;
+#else
+ double_t arhi = A[0] * rhi;
+ double_t arhi2 = rhi * arhi;
+ hi = t2 + arhi2;
+ lo3 = rlo * (ar + arhi);
+ lo4 = t2 - hi + arhi2;
+#endif
+ /* p = log1p(r) - r - A[0]*r*r. */
+#if POW_LOG_POLY_ORDER == 8
+ p = ar3*(A[1] + r*A[2] + ar2*(A[3] + r*A[4] + ar2*(A[5] + r*A[6])));
#endif
- q = A[0]*r2; /* A[0] == -0.5. */
- w = q + hi;
- p += hi - w + q;
- y = p + w;
- *tail = w - y + p;
+ lo = lo1 + lo2 + lo3 + lo4 + p;
+ y = hi + lo;
+ *tail = hi - y + lo;
return y;
}
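The HAVE_FAST_FMA split above is about computing the residual r = z/c - 1 without losing low bits: fma evaluates z*invc - 1 with a single rounding, while the fallback splits z at bit 32 so that the large, cancelling part zhi*invc - 1.0 is computed from a short zhi and the low bits of z are carried in the separate rlo term. A small self-contained comparison; the values of z and invc are made up, since in the real code invc comes from __pow_log_data.tab:

  #include <math.h>
  #include <stdint.h>
  #include <string.h>
  #include <stdio.h>

  static double asdouble (uint64_t i) { double d; memcpy (&d, &i, 8); return d; }
  static uint64_t asuint64 (double d) { uint64_t i; memcpy (&i, &d, 8); return i; }

  int main (void)
  {
    double z = 0x1.4000000123456p+0;    /* made-up input */
    double invc = 0x1.999999999999ap-1; /* made-up 1/c near 0.8 */

    /* fma path: one rounding for the whole expression.  */
    double r_fma = fma (z, invc, -1.0);

    /* non-fma path, as in the diff: split z at bit 32.  */
    double zhi = asdouble (asuint64 (z) & (-1ULL << 32));
    double zlo = z - zhi;
    double rhi = zhi * invc - 1.0;
    double rlo = zlo * invc;
    double r_split = rhi + rlo;

    printf ("r_fma   = %a\nr_split = %a\n", r_fma, r_split);
    return 0;
  }

Either way, the cancellation-prone z*invc - 1 is never evaluated naively in double, which is what would throw away the low bits of z.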
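The t2/lo2, hi/lo4 and final y/*tail pairs in the new log_inline are instances of the classic fast two-sum: when |a| >= |b|, hi = a + b followed by lo = a - hi + b recovers the exact rounding error of the addition, so hi + lo equals a + b exactly. A minimal demonstration:

  #include <stdio.h>

  int main (void)
  {
    /* Fast two-sum: valid when |a| >= |b|; the code above arranges
       operand magnitudes so the pattern stays accurate.  */
    double a = 1.0, b = 0x1p-60;
    double hi = a + b;      /* rounds: hi == 1.0 */
    double lo = a - hi + b; /* recovers the lost b: lo == 0x1p-60 */
    printf ("hi = %a, lo = %a\n", hi, lo);
    return 0;
  }

Accumulating these error terms into lo and returning hi plus *tail is what lets the caller see log(x) to well beyond double precision.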