aboutsummaryrefslogtreecommitdiff
path: root/math/test
diff options
context:
space:
mode:
authorSzabolcs Nagy <szabolcs.nagy@arm.com>2019-07-18 12:51:41 +0100
committerSzabolcs Nagy <szabolcs.nagy@arm.com>2019-10-14 11:58:46 +0100
commit7a1f4cfdecb2060158bff1c9428ed2bb5442cc6f (patch)
treea2ce7dee1d1dd0af37d23f6263f6a1538baffbf5 /math/test
parenta88f3f60c4b9891f12522586a9d445175611e2c8 (diff)
downloadarm-optimized-routines-7a1f4cfdecb2060158bff1c9428ed2bb5442cc6f.tar.gz
Add vector exp, expf and related vector math support code
Vector math routines are added to the same libmathlib library as scalar ones. The difficulty is that they are not always available: the external ABI depends on the compiler version used for the build. Currently only aarch64 AdvSIMD is supported; there are 4 new sets of symbols: __s_foo is a scalar function with identical result to the vector one, __v_foo is a vector function using the base PCS, __vn_foo uses the vector PCS, and _ZGV*_foo is the vector ABI symbol alias of __vn_foo for a scalar math function foo. The test and benchmark code got extended to handle vector functions. Vector functions aim for < 5 ulp worst-case error, only support nearest rounding mode and don't support floating-point exceptions. Vector functions may call scalar functions to handle special cases, but for a single value they should return the same result independently of values in other vector lanes or the position of the value in the vector. The __v_expf and __v_expf_1u polynomials were produced by searching the coefficient space with some heuristics and ideas from https://arxiv.org/abs/1508.03211. Their worst-case errors are 1.95 and 0.866 ulp respectively. The exp polynomial was produced by sollya; it uses a 128 element (1KB) lookup table and has 2.38 ulp worst-case error.
Diffstat (limited to 'math/test')
-rw-r--r--math/test/mathbench.c242
-rwxr-xr-xmath/test/runulp.sh64
-rw-r--r--math/test/ulp.c64
-rw-r--r--math/test/ulp.h27
4 files changed, 367 insertions, 30 deletions
diff --git a/math/test/mathbench.c b/math/test/mathbench.c
index 1266d11..be7e439 100644
--- a/math/test/mathbench.c
+++ b/math/test/mathbench.c
@@ -29,6 +29,50 @@ static float Af[N];
static long measurecount = MEASURE;
static long itercount = ITER;
+#if __aarch64__
+typedef __f64x2_t v_double;
+
+#define v_double_len() 2
+
+static inline v_double
+v_double_load (const double *p)
+{
+ return (v_double){p[0], p[1]};
+}
+
+static inline v_double
+v_double_dup (double x)
+{
+ return (v_double){x, x};
+}
+
+typedef __f32x4_t v_float;
+
+#define v_float_len() 4
+
+static inline v_float
+v_float_load (const float *p)
+{
+ return (v_float){p[0], p[1], p[2], p[3]};
+}
+
+static inline v_float
+v_float_dup (float x)
+{
+ return (v_float){x, x, x, x};
+}
+#else
+/* dummy definitions to make things compile. */
+typedef double v_double;
+typedef float v_float;
+#define v_double_len(x) 1
+#define v_double_load(x) (x)[0]
+#define v_double_dup(x) (x)
+#define v_float_len(x) 1
+#define v_float_load(x) (x)[0]
+#define v_float_dup(x) (x)
+#endif
+
static double
dummy (double x)
{
@@ -41,6 +85,32 @@ dummyf (float x)
return x;
}
+static v_double
+__v_dummy (v_double x)
+{
+ return x;
+}
+
+static v_float
+__v_dummyf (v_float x)
+{
+ return x;
+}
+
+#ifdef __vpcs
+__vpcs static v_double
+__vn_dummy (v_double x)
+{
+ return x;
+}
+
+__vpcs static v_float
+__vn_dummyf (v_float x)
+{
+ return x;
+}
+#endif
+
static double
xypow (double x)
{
@@ -89,42 +159,56 @@ static const struct fun
{
const char *name;
int prec;
+ int vec;
double lo;
double hi;
union
{
double (*d) (double);
float (*f) (float);
+ v_double (*vd) (v_double);
+ v_float (*vf) (v_float);
+#ifdef __vpcs
+ __vpcs v_double (*vnd) (v_double);
+ __vpcs v_float (*vnf) (v_float);
+#endif
} fun;
} funtab[] = {
-#define D(func, lo, hi) {#func, 'd', lo, hi, {.d = func}},
-#define F(func, lo, hi) {#func, 'f', lo, hi, {.f = func}},
+#define D(func, lo, hi) {#func, 'd', 0, lo, hi, {.d = func}},
+#define F(func, lo, hi) {#func, 'f', 0, lo, hi, {.f = func}},
+#define VD(func, lo, hi) {#func, 'd', 'v', lo, hi, {.vd = func}},
+#define VF(func, lo, hi) {#func, 'f', 'v', lo, hi, {.vf = func}},
+#define VND(func, lo, hi) {#func, 'd', 'n', lo, hi, {.vnd = func}},
+#define VNF(func, lo, hi) {#func, 'f', 'n', lo, hi, {.vnf = func}},
D (dummy, 1.0, 2.0)
D (exp, -9.9, 9.9)
D (exp, 0.5, 1.0)
+D (__s_exp, -9.9, 9.9)
D (exp2, -9.9, 9.9)
D (log, 0.01, 11.1)
D (log, 0.999, 1.001)
D (log2, 0.01, 11.1)
D (log2, 0.999, 1.001)
-{"pow", 'd', 0.01, 11.1, {.d = xypow}},
+{"pow", 'd', 0, 0.01, 11.1, {.d = xypow}},
D (xpow, 0.01, 11.1)
D (ypow, -9.9, 9.9)
F (dummyf, 1.0, 2.0)
F (expf, -9.9, 9.9)
+F (__s_expf, -9.9, 9.9)
+F (__s_expf_1u, -9.9, 9.9)
F (exp2f, -9.9, 9.9)
F (logf, 0.01, 11.1)
F (log2f, 0.01, 11.1)
-{"powf", 'f', 0.01, 11.1, {.f = xypowf}},
+{"powf", 'f', 0, 0.01, 11.1, {.f = xypowf}},
F (xpowf, 0.01, 11.1)
F (ypowf, -9.9, 9.9)
-{"sincosf", 'f', 0.1, 0.7, {.f = sincosf_wrap}},
-{"sincosf", 'f', 0.8, 3.1, {.f = sincosf_wrap}},
-{"sincosf", 'f', -3.1, 3.1, {.f = sincosf_wrap}},
-{"sincosf", 'f', 3.3, 33.3, {.f = sincosf_wrap}},
-{"sincosf", 'f', 100, 1000, {.f = sincosf_wrap}},
-{"sincosf", 'f', 1e6, 1e32, {.f = sincosf_wrap}},
+{"sincosf", 'f', 0, 0.1, 0.7, {.f = sincosf_wrap}},
+{"sincosf", 'f', 0, 0.8, 3.1, {.f = sincosf_wrap}},
+{"sincosf", 'f', 0, -3.1, 3.1, {.f = sincosf_wrap}},
+{"sincosf", 'f', 0, 3.3, 33.3, {.f = sincosf_wrap}},
+{"sincosf", 'f', 0, 100, 1000, {.f = sincosf_wrap}},
+{"sincosf", 'f', 0, 1e6, 1e32, {.f = sincosf_wrap}},
F (sinf, 0.1, 0.7)
F (sinf, 0.8, 3.1)
F (sinf, -3.1, 3.1)
@@ -137,9 +221,29 @@ F (cosf, -3.1, 3.1)
F (cosf, 3.3, 33.3)
F (cosf, 100, 1000)
F (cosf, 1e6, 1e32)
+#if __aarch64__
+VD (__v_dummy, 1.0, 2.0)
+VD (__v_exp, -9.9, 9.9)
+VF (__v_dummyf, 1.0, 2.0)
+VF (__v_expf, -9.9, 9.9)
+VF (__v_expf_1u, -9.9, 9.9)
+#ifdef __vpcs
+VND (__vn_dummy, 1.0, 2.0)
+VND (__vn_exp, -9.9, 9.9)
+VND (_ZGVnN2v_exp, -9.9, 9.9)
+VNF (__vn_dummyf, 1.0, 2.0)
+VNF (__vn_expf, -9.9, 9.9)
+VNF (_ZGVnN4v_expf, -9.9, 9.9)
+VNF (__vn_expf_1u, -9.9, 9.9)
+#endif
+#endif
{0},
#undef F
#undef D
+#undef VF
+#undef VD
+#undef VNF
+#undef VND
};
static void
@@ -238,6 +342,72 @@ runf_latency (float f (float))
prev = f (Af[i] + prev * z);
}
+static void
+run_v_thruput (v_double f (v_double))
+{
+ for (int i = 0; i < N; i += v_double_len ())
+ f (v_double_load (A+i));
+}
+
+static void
+runf_v_thruput (v_float f (v_float))
+{
+ for (int i = 0; i < N; i += v_float_len ())
+ f (v_float_load (Af+i));
+}
+
+static void
+run_v_latency (v_double f (v_double))
+{
+ v_double z = v_double_dup (zero);
+ v_double prev = z;
+ for (int i = 0; i < N; i += v_double_len ())
+ prev = f (v_double_load (A+i) + prev * z);
+}
+
+static void
+runf_v_latency (v_float f (v_float))
+{
+ v_float z = v_float_dup (zero);
+ v_float prev = z;
+ for (int i = 0; i < N; i += v_float_len ())
+ prev = f (v_float_load (Af+i) + prev * z);
+}
+
+#ifdef __vpcs
+static void
+run_vn_thruput (__vpcs v_double f (v_double))
+{
+ for (int i = 0; i < N; i += v_double_len ())
+ f (v_double_load (A+i));
+}
+
+static void
+runf_vn_thruput (__vpcs v_float f (v_float))
+{
+ for (int i = 0; i < N; i += v_float_len ())
+ f (v_float_load (Af+i));
+}
+
+static void
+run_vn_latency (__vpcs v_double f (v_double))
+{
+ v_double z = v_double_dup (zero);
+ v_double prev = z;
+ for (int i = 0; i < N; i += v_double_len ())
+ prev = f (v_double_load (A+i) + prev * z);
+}
+
+static void
+runf_vn_latency (__vpcs v_float f (v_float))
+{
+ v_float z = v_float_dup (zero);
+ v_float prev = z;
+ for (int i = 0; i < N; i += v_float_len ())
+ prev = f (v_float_load (Af+i) + prev * z);
+}
+#endif
+
static uint64_t
tic (void)
{
@@ -267,20 +437,54 @@ bench1 (const struct fun *f, int type, double lo, double hi)
uint64_t dt = 0;
uint64_t ns100;
const char *s = type == 't' ? "rthruput" : "latency";
+ int vlen = 1;
- if (f->prec == 'd' && type == 't')
+ if (f->vec && f->prec == 'd')
+ vlen = v_double_len();
+ else if (f->vec && f->prec == 'f')
+ vlen = v_float_len();
+
+ if (f->prec == 'd' && type == 't' && f->vec == 0)
TIMEIT (run_thruput, f->fun.d);
- else if (f->prec == 'd' && type == 'l')
+ else if (f->prec == 'd' && type == 'l' && f->vec == 0)
TIMEIT (run_latency, f->fun.d);
- else if (f->prec == 'f' && type == 't')
+ else if (f->prec == 'f' && type == 't' && f->vec == 0)
TIMEIT (runf_thruput, f->fun.f);
- else if (f->prec == 'f' && type == 'l')
+ else if (f->prec == 'f' && type == 'l' && f->vec == 0)
TIMEIT (runf_latency, f->fun.f);
-
- ns100 = (100 * dt + itercount * N / 2) / (itercount * N);
- printf ("%7s %8s: %4u.%02u ns/elem %10llu ns in [%g %g]\n", f->name, s,
- (unsigned) (ns100 / 100), (unsigned) (ns100 % 100),
- (unsigned long long) dt, lo, hi);
+ else if (f->prec == 'd' && type == 't' && f->vec == 'v')
+ TIMEIT (run_v_thruput, f->fun.vd);
+ else if (f->prec == 'd' && type == 'l' && f->vec == 'v')
+ TIMEIT (run_v_latency, f->fun.vd);
+ else if (f->prec == 'f' && type == 't' && f->vec == 'v')
+ TIMEIT (runf_v_thruput, f->fun.vf);
+ else if (f->prec == 'f' && type == 'l' && f->vec == 'v')
+ TIMEIT (runf_v_latency, f->fun.vf);
+#ifdef __vpcs
+ else if (f->prec == 'd' && type == 't' && f->vec == 'n')
+ TIMEIT (run_vn_thruput, f->fun.vnd);
+ else if (f->prec == 'd' && type == 'l' && f->vec == 'n')
+ TIMEIT (run_vn_latency, f->fun.vnd);
+ else if (f->prec == 'f' && type == 't' && f->vec == 'n')
+ TIMEIT (runf_vn_thruput, f->fun.vnf);
+ else if (f->prec == 'f' && type == 'l' && f->vec == 'n')
+ TIMEIT (runf_vn_latency, f->fun.vnf);
+#endif
+
+ if (type == 't')
+ {
+ ns100 = (100 * dt + itercount * N / 2) / (itercount * N);
+ printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g]\n", f->name, s,
+ (unsigned) (ns100 / 100), (unsigned) (ns100 % 100),
+ (unsigned long long) dt, lo, hi);
+ }
+ else if (type == 'l')
+ {
+ ns100 = (100 * dt + itercount * N / vlen / 2) / (itercount * N / vlen);
+ printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g]\n", f->name, s,
+ (unsigned) (ns100 / 100), (unsigned) (ns100 % 100),
+ (unsigned long long) dt, lo, hi);
+ }
fflush (stdout);
}
diff --git a/math/test/runulp.sh b/math/test/runulp.sh
index 64db6f3..b2c1827 100755
--- a/math/test/runulp.sh
+++ b/math/test/runulp.sh
@@ -24,6 +24,10 @@ t() {
$emu ./ulp -r $r -e $Lt $flags "$@" && PASS=$((PASS+1)) || FAIL=$((FAIL+1))
}
+check() {
+ $emu ./ulp -f -q "$@" >/dev/null
+}
+
Ldir=0.5
for r in $rmodes
do
@@ -87,6 +91,66 @@ t powf 0x1.ep-1 0x1.1p0 x 0x1p8 0x1p14 50000
t powf 0x1.ep-1 0x1.1p0 x -0x1p8 -0x1p14 50000
done
+# vector functions
+Ldir=0.5
+r='n'
+flags="${ULPFLAGS:--q} -f"
+runv=
+check __v_exp 1 && runv=1
+runvn=
+check __vn_exp 1 && runvn=1
+
+range_exp='
+ 0 0xffff000000000000 10000
+ 0x1p-6 0x1p6 400000
+ -0x1p-6 -0x1p6 400000
+ 633.3 733.3 10000
+ -633.3 -777.3 10000
+'
+
+range_expf='
+ 0 0xffff0000 10000
+ 0x1p-14 0x1p8 500000
+ -0x1p-14 -0x1p8 500000
+'
+
+range_expf_1u="$range_expf"
+
+# error limits
+L_exp=1.9
+L_expf=1.49
+L_expf_1u=0.4
+
+# group symbol run
+echo "
+exp __s_exp 1
+exp __v_exp $runv
+exp __vn_exp $runvn
+exp _ZGVnN2v_exp $runvn
+
+expf __s_expf 1
+expf __v_expf $runv
+expf __vn_expf $runvn
+expf _ZGVnN4v_expf $runvn
+
+expf_1u __s_expf_1u 1
+expf_1u __v_expf_1u $runv
+expf_1u __vn_expf_1u $runvn
+
+" | while read G F R
+do
+ [ "$R" = 1 ] || continue
+ case "$G" in \#*) continue ;; esac
+ eval range="\${range_$G}"
+ eval L="\${L_$G}"
+ echo "$range" |while read X
+ do
+ [ -n "$X" ] || continue
+ case "$X" in \#*) continue ;; esac
+ t $F $X
+ done
+done
+
[ 0 -eq $FAIL ] || {
echo "FAILED $FAIL PASSED $PASS"
exit 1
diff --git a/math/test/ulp.c b/math/test/ulp.c
index cd15dec..aec8aa5 100644
--- a/math/test/ulp.c
+++ b/math/test/ulp.c
@@ -13,6 +13,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include "mathlib.h"
/* Don't depend on mpfr by default. */
#ifndef USE_MPFR
@@ -208,11 +209,38 @@ struct conf
double errlim;
};
+/* A bit of a hack: call vector functions twice with the same
+ input in lane 0 but a different value in other lanes: once
+ with an in-range value and then with a special case value. */
+static int secondcall;
+
+/* Wrappers for vector functions. */
+#if __aarch64__
+typedef __f32x4_t v_float;
+typedef __f64x2_t v_double;
+static const float fv[2] = {1.0f, -INFINITY};
+static const double dv[2] = {1.0, -INFINITY};
+static inline v_float argf(float x) { return (v_float){x,x,x,fv[secondcall]}; }
+static inline v_double argd(double x) { return (v_double){x,dv[secondcall]}; }
+
+static float v_expf_1u(float x) { return __v_expf_1u(argf(x))[0]; }
+static float v_expf(float x) { return __v_expf(argf(x))[0]; }
+static double v_exp(double x) { return __v_exp(argd(x))[0]; }
+#ifdef __vpcs
+static float vn_expf_1u(float x) { return __vn_expf_1u(argf(x))[0]; }
+static float vn_expf(float x) { return __vn_expf(argf(x))[0]; }
+static double vn_exp(double x) { return __vn_exp(argd(x))[0]; }
+static float Z_expf(float x) { return _ZGVnN4v_expf(argf(x))[0]; }
+static double Z_exp(double x) { return _ZGVnN2v_exp(argd(x))[0]; }
+#endif
+#endif
+
struct fun
{
const char *name;
int arity;
int singleprec;
+ int twice;
union
{
float (*f1) (float);
@@ -240,16 +268,16 @@ struct fun
static const struct fun fun[] = {
#if USE_MPFR
-# define F(x, x_long, x_mpfr, a, s, t) \
- {#x, a, s, {.t = x}, {.t = x_long}, {.t = x_mpfr}},
+# define F(x, x_wrap, x_long, x_mpfr, a, s, t, twice) \
+ {#x, a, s, twice, {.t = x_wrap}, {.t = x_long}, {.t = x_mpfr}},
#else
-# define F(x, x_long, x_mpfr, a, s, t) \
- {#x, a, s, {.t = x}, {.t = x_long}},
+# define F(x, x_wrap, x_long, x_mpfr, a, s, t, twice) \
+ {#x, a, s, twice, {.t = x_wrap}, {.t = x_long}},
#endif
-#define F1(x) F (x##f, x, mpfr_##x, 1, 1, f1)
-#define F2(x) F (x##f, x, mpfr_##x, 2, 1, f2)
-#define D1(x) F (x, x##l, mpfr_##x, 1, 0, d1)
-#define D2(x) F (x, x##l, mpfr_##x, 2, 0, d2)
+#define F1(x) F (x##f, x##f, x, mpfr_##x, 1, 1, f1, 0)
+#define F2(x) F (x##f, x##f, x, mpfr_##x, 2, 1, f2, 0)
+#define D1(x) F (x, x, x##l, mpfr_##x, 1, 0, d1, 0)
+#define D2(x) F (x, x, x##l, mpfr_##x, 2, 0, d2, 0)
F1 (sin)
F1 (cos)
F1 (exp)
@@ -262,6 +290,26 @@ static const struct fun fun[] = {
D1 (log)
D1 (log2)
D2 (pow)
+ F (__s_expf_1u, __s_expf_1u, exp, mpfr_exp, 1, 1, f1, 0)
+ F (__s_expf, __s_expf, exp, mpfr_exp, 1, 1, f1, 0)
+ F (__s_exp, __s_exp, expl, mpfr_exp, 1, 0, d1, 0)
+#if __aarch64__
+ F (__v_expf_1u, v_expf_1u, exp, mpfr_exp, 1, 1, f1, 1)
+ F (__v_expf, v_expf, exp, mpfr_exp, 1, 1, f1, 1)
+ F (__v_exp, v_exp, expl, mpfr_exp, 1, 0, d1, 1)
+#ifdef __vpcs
+ F (__vn_expf_1u, vn_expf_1u, exp, mpfr_exp, 1, 1, f1, 1)
+ F (__vn_expf, vn_expf, exp, mpfr_exp, 1, 1, f1, 1)
+ F (__vn_exp, vn_exp, expl, mpfr_exp, 1, 0, d1, 1)
+ F (_ZGVnN4v_expf, Z_expf, exp, mpfr_exp, 1, 1, f1, 1)
+ F (_ZGVnN2v_exp, Z_exp, expl, mpfr_exp, 1, 0, d1, 1)
+#endif
+#endif
+#undef F
+#undef F1
+#undef F2
+#undef D1
+#undef D2
{0}};
/* Boilerplate for generic calls. */
diff --git a/math/test/ulp.h b/math/test/ulp.h
index 1f2fe98..a0c3016 100644
--- a/math/test/ulp.h
+++ b/math/test/ulp.h
@@ -257,11 +257,29 @@ static int T(cmp) (const struct fun *f, struct gen *gen,
struct RT(ret) want;
struct T(args) a = T(next) (gen);
int exgot;
+ int exgot2;
RT(float) ygot;
+ RT(float) ygot2;
+ int fail = 0;
if (fenv)
T(call_fenv) (f, a, r, &ygot, &exgot);
else
T(call_nofenv) (f, a, r, &ygot, &exgot);
+ if (f->twice) {
+ secondcall = 1;
+ if (fenv)
+ T(call_fenv) (f, a, r, &ygot2, &exgot2);
+ else
+ T(call_nofenv) (f, a, r, &ygot2, &exgot2);
+ secondcall = 0;
+ if (RT(asuint) (ygot) != RT(asuint) (ygot2))
+ {
+ fail = 1;
+ cntfail++;
+ T(printcall) (f, a);
+ printf (" got %a then %a for same input\n", ygot, ygot2);
+ }
+ }
cnt++;
int ok = use_mpfr
? T(call_mpfr_fix) (f, a, r, &want, ygot, exgot)
@@ -270,7 +288,6 @@ static int T(cmp) (const struct fun *f, struct gen *gen,
if (!ok)
{
int print = 0;
- int fail = 0;
double err = RT(ulperr) (ygot, &want, r);
double abserr = fabs (err);
// TODO: count errors below accuracy limit.
@@ -280,8 +297,12 @@ static int T(cmp) (const struct fun *f, struct gen *gen,
cnt2++;
if (abserr > conf->errlim)
{
- print = fail = 1;
- cntfail++;
+ print = 1;
+ if (!fail)
+ {
+ fail = 1;
+ cntfail++;
+ }
}
if (abserr > maxerr)
{