Improve pow implementation

The log part of pow got rewritten to use a slightly different algorithm. This improves precision and throughput while keeps the same table size. Near 1 cases are no longer special cased, there is a slight performance regression in that case. And when the fma instruction is not available this algorithm is expected to have slightly worse performance. Worst-case error improved from 0.67 ULP to 0.57 ULP. On Cortex-A72 i see thruput near 1: 7% worse latency near 1: 2% worse thruput general: 8% better latency general: 2% better
author: Szabolcs Nagy <szabolcs.nagy@arm.com> 2018-06-21 17:53:15 +0100
committer: Szabolcs Nagy <szabolcs.nagy@arm.com> 2018-06-22 14:38:31 +0100
commit: a6230320c149a8e1ae790433fe0828e6060c53fa (patch)
tree: a39a631bc71a5786cda7746fceb7d04d5007ca24 /math/pow.c
parent: db6e4e96bea641fff18006803c4b1eea19d664f7 (diff)
download: arm-optimized-routines-a6230320c149a8e1ae790433fe0828e6060c53fa.tar.gz
1 files changed, 42 insertions, 56 deletions
diff --git a/math/pow.c b/math/pow.c
index c9c0798..3bc26b5 100644
--- a/math/pow.c
+++ b/math/pow.c
@@ -22,18 +22,17 @@
 #include "math_config.h"
 
 /*
-Worst-case error: 0.67 ULP (~= ulperr_exp + 1024*Ln2*relerr_log*2^53)
-relerr_log: 1.8 * 2^-66 (Relative error of log)
+Worst-case error: 0.57 ULP (~= ulperr_exp + 1024*Ln2*relerr_log*2^53)
+relerr_log: 1.3 * 2^-68 (Relative error of log, 1.4 * 2^-68 without fma)
 ulperr_exp: 0.509 ULP (ULP error of exp)
 */
 
 #define T __pow_log_data.tab
-#define B __pow_log_data.poly1
 #define A __pow_log_data.poly
 #define Ln2hi __pow_log_data.ln2hi
 #define Ln2lo __pow_log_data.ln2lo
 #define N (1 << POW_LOG_TABLE_BITS)
-#define OFF 0x3fe6000000000000
+#define OFF 0x3fe6955500000000
 
 static inline uint32_t
 top12 (double x)
@@ -45,37 +44,10 @@ static inline double_t
 log_inline (uint64_t ix, double_t *tail)
 {
   /* double_t for better performance on targets with FLT_EVAL_METHOD==2.  */
-  double_t w, z, zhi, zlo, r, r2, r3, y, invc, logc, kd, hi, lo, khi, rhi, rlo, p, q;
+  double_t z, r, y, invc, logc, logctail, kd, hi, t1, t2, lo, lo1, lo2, p;
   uint64_t iz, tmp;
   int k, i;
 
-#if POW_LOG_POLY1_ORDER == 9
-# define LO asuint64 (1 - 0x1.1p-7)
-# define HI asuint64 (1 + 0x1.98p-7)
-#endif
-  if (unlikely (ix - LO < HI - LO))
-    {
-      r = asdouble (ix) - 1.0;
-      /* Split r into top and bottom half.  */
-      w = r * 0x1p27;
-      rhi = r + w - w;
-      rlo = r - rhi;
-      /* Compute r - r*r/2 precisely into hi+lo.  */
-      w = rhi*rhi*B[0]; /* B[0] == -0.5.  */
-      hi = r + w;
-      lo = r - hi + w;
-      lo += B[0]*rlo*(rhi + r);
-      r2 = r*r;
-      r3 = r*r2;
-#if POW_LOG_POLY1_ORDER == 9
-      p = B[1] + r*(B[2] + r*B[3] + r2*B[4] + r3*(B[5] + r*B[6] + r2*B[7]));
-#endif
-      q = lo + r3*p;
-      y = hi + q;
-      *tail = (hi - y) + q;
-      return y;
-    }
-
   /* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
      The range is split into N subintervals.
      The ith subinterval contains z and c is near its center.  */
@@ -83,41 +55,55 @@ log_inline (uint64_t ix, double_t *tail)
   i = (tmp >> (52 - POW_LOG_TABLE_BITS)) % N;
   k = (int64_t) tmp >> 52; /* arithmetic shift */
   iz = ix - (tmp & 0xfffULL << 52);
-  invc = T[i].invc;
-  logc = T[i].logc;
   z = asdouble (iz);
-  zhi = asdouble ((iz + (1ULL<<31)) & (-1ULL << 32));
-  zlo = z - zhi;
-
-  /* log(x) = log1p(z/c-1) + log(c) + k*Ln2 */
-  rhi = zhi * invc - 1.0;
-  rlo = zlo * invc;
   kd = (double_t) k;
 
-  /* hi + lo = r + log(c) + k*Ln2.  */
-  khi = kd * Ln2hi;
-  w = khi + logc;
-  lo = khi - w + logc;
-  hi = w + rhi;
-  lo = w - hi + rhi + (lo + kd*Ln2lo) + rlo;
+  /* log(x) = k*Ln2 + log(c) + log1p(z/c-1).  */
+  invc = T[i].invc;
+  logc = T[i].logc;
+  logctail = T[i].logctail;
 
-  /* log(x) = lo + (log1p(r) - r) + hi.  */
-  /* Evaluation is optimized assuming superscalar pipelined execution.  */
+  /* r = z/c - 1, arranged to be exact.  */
 #if HAVE_FAST_FMA
   r = fma (z, invc, -1.0);
 #else
+  double_t zhi = asdouble (iz & (-1ULL << 32));
+  double_t zlo = z - zhi;
+  double_t rhi = zhi * invc - 1.0;
+  double_t rlo = zlo * invc;
   r = rhi + rlo;
 #endif
-  r2 = r * r;
 
-#if POW_LOG_POLY_ORDER == 7
-  p = lo + r*r2*(A[1] + r*A[2] + r2*(A[3] + r*A[4] + r2*A[5]));
+  /* k*Ln2 + log(c) + r.  */
+  t1 = kd * Ln2hi + logc;
+  t2 = t1 + r;
+  lo1 = kd * Ln2lo + logctail;
+  lo2 = t1 - t2 + r;
+
+  /* Evaluation is optimized assuming superscalar pipelined execution.  */
+  double_t ar, ar2, ar3, lo3, lo4;
+  ar = A[0] * r; /* A[0] = -0.5.  */
+  ar2 = r * ar;
+  ar3 = r * ar2;
+  /* k*Ln2 + log(c) + r + A[0]*r*r.  */
+#if HAVE_FAST_FMA
+  hi = t2 + ar2;
+  lo3 = fma (ar, r, -ar2);
+  lo4 = t2 - hi + ar2;
+#else
+  double_t arhi = A[0] * rhi;
+  double_t arhi2 = rhi * arhi;
+  hi = t2 + arhi2;
+  lo3 = rlo * (ar + arhi);
+  lo4 = t2 - hi + arhi2;
+#endif
+  /* p = log1p(r) - r - A[0]*r*r.  */
+#if POW_LOG_POLY_ORDER == 8
+  p = ar3*(A[1] + r*A[2] + ar2*(A[3] + r*A[4] + ar2*(A[5] + r*A[6])));
 #endif
-  q = A[0]*r2; /* A[0] == -0.5.  */
-  w = q + hi;
-  p += hi - w + q;
-  y = p + w;
-  *tail = w - y + p;
+  lo = lo1 + lo2 + lo3 + lo4 + p;
+  y = hi + lo;
+  *tail = hi - y + lo;
   return y;
 }
author	Szabolcs Nagy <szabolcs.nagy@arm.com>	2018-06-21 17:53:15 +0100
committer	Szabolcs Nagy <szabolcs.nagy@arm.com>	2018-06-22 14:38:31 +0100
commit	a6230320c149a8e1ae790433fe0828e6060c53fa (patch)
tree	a39a631bc71a5786cda7746fceb7d04d5007ca24 /math/pow.c
parent	db6e4e96bea641fff18006803c4b1eea19d664f7 (diff)
download	arm-optimized-routines-a6230320c149a8e1ae790433fe0828e6060c53fa.tar.gz