18 files changed, 1749 insertions, 31 deletions
diff --git a/math/include/mathlib.h b/math/include/mathlib.h
index eed294b..6d62610 100644
--- a/math/include/mathlib.h
+++ b/math/include/mathlib.h
@@ -1,10 +1,13 @@
 /*
  * Public API.
  *
- * Copyright (c) 2015-2018, Arm Limited.
+ * Copyright (c) 2015-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
+#ifndef _MATHLIB_H
+#define _MATHLIB_H
+
 float expf (float);
 float exp2f (float);
 float logf (float);
@@ -19,3 +22,40 @@ double exp2 (double);
 double log (double);
 double log2 (double);
 double pow (double, double);
+
+/* Scalar functions using the vector algorithm with identical result.  */
+float __s_expf (float);
+float __s_expf_1u (float);
+double __s_exp (double);
+
+#if __aarch64__
+#if __GNUC__ >= 5
+typedef __Float32x4_t __f32x4_t;
+typedef __Float64x2_t __f64x2_t;
+#elif __clang_major__*100+__clang_minor__ >= 305
+typedef __attribute__((__neon_vector_type__(4))) float __f32x4_t;
+typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t;
+#else
+#error Unsupported compiler
+#endif
+
+/* Vector functions following the base PCS.  */
+__f32x4_t __v_expf (__f32x4_t);
+__f32x4_t __v_expf_1u (__f32x4_t);
+__f64x2_t __v_exp (__f64x2_t);
+
+#if __GNUC__ >= 9 || __clang_major__ >= 8
+#define __vpcs __attribute__((__aarch64_vector_pcs__))
+
+/* Vector functions following the vector PCS.  */
+__vpcs __f32x4_t __vn_expf (__f32x4_t);
+__vpcs __f32x4_t __vn_expf_1u (__f32x4_t);
+__vpcs __f64x2_t __vn_exp (__f64x2_t);
+
+/* Vector functions following the vector PCS using ABI names.  */
+__vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t);
+__vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t);
+#endif
+#endif
+
+#endif
diff --git a/math/s_exp.c b/math/s_exp.c
new file mode 100644
index 0000000..ac7246b
--- /dev/null
+++ b/math/s_exp.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#define SCALAR 1
+#include "v_exp.c"
diff --git a/math/s_expf.c b/math/s_expf.c
new file mode 100644
index 0000000..3492c46
--- /dev/null
+++ b/math/s_expf.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#define SCALAR 1
+#include "v_expf.c"
diff --git a/math/s_expf_1u.c b/math/s_expf_1u.c
new file mode 100644
index 0000000..eb7bbcb
--- /dev/null
+++ b/math/s_expf_1u.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#define SCALAR 1
+#include "v_expf_1u.c"
diff --git a/math/test/mathbench.c b/math/test/mathbench.c
index 1266d11..be7e439 100644
--- a/math/test/mathbench.c
+++ b/math/test/mathbench.c
@@ -29,6 +29,50 @@ static float Af[N];
 static long measurecount = MEASURE;
 static long itercount = ITER;
 
+#if __aarch64__
+typedef __f64x2_t v_double;
+
+#define v_double_len() 2
+
+static inline v_double
+v_double_load (const double *p)
+{
+  return (v_double){p[0], p[1]};
+}
+
+static inline v_double
+v_double_dup (double x)
+{
+  return (v_double){x, x};
+}
+
+typedef __f32x4_t v_float;
+
+#define v_float_len() 4
+
+static inline v_float
+v_float_load (const float *p)
+{
+  return (v_float){p[0], p[1], p[2], p[3]};
+}
+
+static inline v_float
+v_float_dup (float x)
+{
+  return (v_float){x, x, x, x};
+}
+#else
+/* dummy definitions to make things compile.  */
+typedef double v_double;
+typedef float v_float;
+#define v_double_len(x) 1
+#define v_double_load(x) (x)[0]
+#define v_double_dup(x) (x)
+#define v_float_len(x) 1
+#define v_float_load(x) (x)[0]
+#define v_float_dup(x) (x)
+#endif
+
 static double
 dummy (double x)
 {
@@ -41,6 +85,32 @@ dummyf (float x)
   return x;
 }
 
+static v_double
+__v_dummy (v_double x)
+{
+  return x;
+}
+
+static v_float
+__v_dummyf (v_float x)
+{
+  return x;
+}
+
+#ifdef __vpcs
+__vpcs static v_double
+__vn_dummy (v_double x)
+{
+  return x;
+}
+
+__vpcs static v_float
+__vn_dummyf (v_float x)
+{
+  return x;
+}
+#endif
+
 static double
 xypow (double x)
 {
@@ -89,42 +159,56 @@ static const struct fun
 {
   const char *name;
   int prec;
+  int vec;
   double lo;
   double hi;
   union
   {
     double (*d) (double);
     float (*f) (float);
+    v_double (*vd) (v_double);
+    v_float (*vf) (v_float);
+#ifdef __vpcs
+    __vpcs v_double (*vnd) (v_double);
+    __vpcs v_float (*vnf) (v_float);
+#endif
   } fun;
 } funtab[] = {
-#define D(func, lo, hi) {#func, 'd', lo, hi, {.d = func}},
-#define F(func, lo, hi) {#func, 'f', lo, hi, {.f = func}},
+#define D(func, lo, hi) {#func, 'd', 0, lo, hi, {.d = func}},
+#define F(func, lo, hi) {#func, 'f', 0, lo, hi, {.f = func}},
+#define VD(func, lo, hi) {#func, 'd', 'v', lo, hi, {.vd = func}},
+#define VF(func, lo, hi) {#func, 'f', 'v', lo, hi, {.vf = func}},
+#define VND(func, lo, hi) {#func, 'd', 'n', lo, hi, {.vnd = func}},
+#define VNF(func, lo, hi) {#func, 'f', 'n', lo, hi, {.vnf = func}},
 D (dummy, 1.0, 2.0)
 D (exp, -9.9, 9.9)
 D (exp, 0.5, 1.0)
+D (__s_exp, -9.9, 9.9)
 D (exp2, -9.9, 9.9)
 D (log, 0.01, 11.1)
 D (log, 0.999, 1.001)
 D (log2, 0.01, 11.1)
 D (log2, 0.999, 1.001)
-{"pow", 'd', 0.01, 11.1, {.d = xypow}},
+{"pow", 'd', 0, 0.01, 11.1, {.d = xypow}},
 D (xpow, 0.01, 11.1)
 D (ypow, -9.9, 9.9)
 
 F (dummyf, 1.0, 2.0)
 F (expf, -9.9, 9.9)
+F (__s_expf, -9.9, 9.9)
+F (__s_expf_1u, -9.9, 9.9)
 F (exp2f, -9.9, 9.9)
 F (logf, 0.01, 11.1)
 F (log2f, 0.01, 11.1)
-{"powf", 'f', 0.01, 11.1, {.f = xypowf}},
+{"powf", 'f', 0, 0.01, 11.1, {.f = xypowf}},
 F (xpowf, 0.01, 11.1)
 F (ypowf, -9.9, 9.9)
-{"sincosf", 'f', 0.1, 0.7, {.f = sincosf_wrap}},
-{"sincosf", 'f', 0.8, 3.1, {.f = sincosf_wrap}},
-{"sincosf", 'f', -3.1, 3.1, {.f = sincosf_wrap}},
-{"sincosf", 'f', 3.3, 33.3, {.f = sincosf_wrap}},
-{"sincosf", 'f', 100, 1000, {.f = sincosf_wrap}},
-{"sincosf", 'f', 1e6, 1e32, {.f = sincosf_wrap}},
+{"sincosf", 'f', 0, 0.1, 0.7, {.f = sincosf_wrap}},
+{"sincosf", 'f', 0, 0.8, 3.1, {.f = sincosf_wrap}},
+{"sincosf", 'f', 0, -3.1, 3.1, {.f = sincosf_wrap}},
+{"sincosf", 'f', 0, 3.3, 33.3, {.f = sincosf_wrap}},
+{"sincosf", 'f', 0, 100, 1000, {.f = sincosf_wrap}},
+{"sincosf", 'f', 0, 1e6, 1e32, {.f = sincosf_wrap}},
 F (sinf, 0.1, 0.7)
 F (sinf, 0.8, 3.1)
 F (sinf, -3.1, 3.1)
@@ -137,9 +221,29 @@ F (cosf, -3.1, 3.1)
 F (cosf, 3.3, 33.3)
 F (cosf, 100, 1000)
 F (cosf, 1e6, 1e32)
+#if __aarch64__
+VD (__v_dummy, 1.0, 2.0)
+VD (__v_exp, -9.9, 9.9)
+VF (__v_dummyf, 1.0, 2.0)
+VF (__v_expf, -9.9, 9.9)
+VF (__v_expf_1u, -9.9, 9.9)
+#ifdef __vpcs
+VND (__vn_dummy, 1.0, 2.0)
+VND (__vn_exp, -9.9, 9.9)
+VND (_ZGVnN2v_exp, -9.9, 9.9)
+VNF (__vn_dummyf, 1.0, 2.0)
+VNF (__vn_expf, -9.9, 9.9)
+VNF (_ZGVnN4v_expf, -9.9, 9.9)
+VNF (__vn_expf_1u, -9.9, 9.9)
+#endif
+#endif
 {0},
 #undef F
 #undef D
+#undef VF
+#undef VD
+#undef VNF
+#undef VND
 };
 
 static void
@@ -238,6 +342,72 @@ runf_latency (float f (float))
     prev = f (Af[i] + prev * z);
 }
 
+static void
+run_v_thruput (v_double f (v_double))
+{
+  for (int i = 0; i < N; i += v_double_len ())
+    f (v_double_load (A+i));
+}
+
+static void
+runf_v_thruput (v_float f (v_float))
+{
+  for (int i = 0; i < N; i += v_float_len ())
+    f (v_float_load (Af+i));
+}
+
+static void
+run_v_latency (v_double f (v_double))
+{
+  v_double z = v_double_dup (zero);
+  v_double prev = z;
+  for (int i = 0; i < N; i += v_double_len ())
+    prev = f (v_double_load (A+i) + prev * z);
+}
+
+static void
+runf_v_latency (v_float f (v_float))
+{
+  v_float z = v_float_dup (zero);
+  v_float prev = z;
+  for (int i = 0; i < N; i += v_float_len ())
+    prev = f (v_float_load (Af+i) + prev * z);
+}
+
+#ifdef __vpcs
+static void
+run_vn_thruput (__vpcs v_double f (v_double))
+{
+  for (int i = 0; i < N; i += v_double_len ())
+    f (v_double_load (A+i));
+}
+
+static void
+runf_vn_thruput (__vpcs v_float f (v_float))
+{
+  for (int i = 0; i < N; i += v_float_len ())
+    f (v_float_load (Af+i));
+}
+
+static void
+run_vn_latency (__vpcs v_double f (v_double))
+{
+  v_double z = v_double_dup (zero);
+  v_double prev = z;
+  for (int i = 0; i < N; i += v_double_len ())
+    prev = f (v_double_load (A+i) + prev * z);
+}
+
+static void
+runf_vn_latency (__vpcs v_float f (v_float))
+{
+  v_float z = v_float_dup (zero);
+  v_float prev = z;
+  for (int i = 0; i < N; i += v_float_len ())
+    prev = f (v_float_load (Af+i) + prev * z);
+}
+#endif
+
 static uint64_t
 tic (void)
 {
@@ -267,20 +437,54 @@ bench1 (const struct fun *f, int type, double lo, double hi)
   uint64_t dt = 0;
   uint64_t ns100;
   const char *s = type == 't' ? "rthruput" : "latency";
+  int vlen = 1;
 
-  if (f->prec == 'd' && type == 't')
+  if (f->vec && f->prec == 'd')
+    vlen = v_double_len();
+  else if (f->vec && f->prec == 'f')
+    vlen = v_float_len();
+
+  if (f->prec == 'd' && type == 't' && f->vec == 0)
     TIMEIT (run_thruput, f->fun.d);
-  else if (f->prec == 'd' && type == 'l')
+  else if (f->prec == 'd' && type == 'l' && f->vec == 0)
     TIMEIT (run_latency, f->fun.d);
-  else if (f->prec == 'f' && type == 't')
+  else if (f->prec == 'f' && type == 't' && f->vec == 0)
     TIMEIT (runf_thruput, f->fun.f);
-  else if (f->prec == 'f' && type == 'l')
+  else if (f->prec == 'f' && type == 'l' && f->vec == 0)
     TIMEIT (runf_latency, f->fun.f);
-
-  ns100 = (100 * dt + itercount * N / 2) / (itercount * N);
-  printf ("%7s %8s: %4u.%02u ns/elem %10llu ns in [%g %g]\n", f->name, s,
-	  (unsigned) (ns100 / 100), (unsigned) (ns100 % 100),
-	  (unsigned long long) dt, lo, hi);
+  else if (f->prec == 'd' && type == 't' && f->vec == 'v')
+    TIMEIT (run_v_thruput, f->fun.vd);
+  else if (f->prec == 'd' && type == 'l' && f->vec == 'v')
+    TIMEIT (run_v_latency, f->fun.vd);
+  else if (f->prec == 'f' && type == 't' && f->vec == 'v')
+    TIMEIT (runf_v_thruput, f->fun.vf);
+  else if (f->prec == 'f' && type == 'l' && f->vec == 'v')
+    TIMEIT (runf_v_latency, f->fun.vf);
+#ifdef __vpcs
+  else if (f->prec == 'd' && type == 't' && f->vec == 'n')
+    TIMEIT (run_vn_thruput, f->fun.vnd);
+  else if (f->prec == 'd' && type == 'l' && f->vec == 'n')
+    TIMEIT (run_vn_latency, f->fun.vnd);
+  else if (f->prec == 'f' && type == 't' && f->vec == 'n')
+    TIMEIT (runf_vn_thruput, f->fun.vnf);
+  else if (f->prec == 'f' && type == 'l' && f->vec == 'n')
+    TIMEIT (runf_vn_latency, f->fun.vnf);
+#endif
+
+  if (type == 't')
+    {
+      ns100 = (100 * dt + itercount * N / 2) / (itercount * N);
+      printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g]\n", f->name, s,
+	      (unsigned) (ns100 / 100), (unsigned) (ns100 % 100),
+	      (unsigned long long) dt, lo, hi);
+    }
+  else if (type == 'l')
+    {
+      ns100 = (100 * dt + itercount * N / vlen / 2) / (itercount * N / vlen);
+      printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g]\n", f->name, s,
+	      (unsigned) (ns100 / 100), (unsigned) (ns100 % 100),
+	      (unsigned long long) dt, lo, hi);
+    }
   fflush (stdout);
 }
 
diff --git a/math/test/runulp.sh b/math/test/runulp.sh
index 64db6f3..b2c1827 100755
--- a/math/test/runulp.sh
+++ b/math/test/runulp.sh
@@ -24,6 +24,10 @@ t() {
 	$emu ./ulp -r $r -e $Lt $flags "$@" && PASS=$((PASS+1)) || FAIL=$((FAIL+1))
 }
 
+check() {
+	$emu ./ulp -f -q "$@" >/dev/null
+}
+
 Ldir=0.5
 for r in $rmodes
 do
@@ -87,6 +91,66 @@ t powf  0x1.ep-1 0x1.1p0 x  0x1p8 0x1p14  50000
 t powf  0x1.ep-1 0x1.1p0 x -0x1p8 -0x1p14 50000
 done
 
+# vector functions
+Ldir=0.5
+r='n'
+flags="${ULPFLAGS:--q} -f"
+runv=
+check __v_exp 1 && runv=1
+runvn=
+check __vn_exp 1 && runvn=1
+
+range_exp='
+  0 0xffff000000000000 10000
+  0x1p-6     0x1p6     400000
+ -0x1p-6    -0x1p6     400000
+  633.3      733.3     10000
+ -633.3     -777.3     10000
+'
+
+range_expf='
+  0    0xffff0000    10000
+  0x1p-14   0x1p8    500000
+ -0x1p-14  -0x1p8    500000
+'
+
+range_expf_1u="$range_expf"
+
+# error limits
+L_exp=1.9
+L_expf=1.49
+L_expf_1u=0.4
+
+# group symbol run
+echo "
+exp  __s_exp       1
+exp  __v_exp       $runv
+exp  __vn_exp      $runvn
+exp  _ZGVnN2v_exp  $runvn
+
+expf __s_expf      1
+expf __v_expf      $runv
+expf __vn_expf     $runvn
+expf _ZGVnN4v_expf $runvn
+
+expf_1u __s_expf_1u   1
+expf_1u __v_expf_1u   $runv
+expf_1u __vn_expf_1u  $runvn
+
+" | while read G F R
+do
+	[ "$R" = 1 ] || continue
+	case "$G" in \#*) continue ;; esac
+	eval range="\${range_$G}"
+	eval L="\${L_$G}"
+	echo "$range" |while read X
+	do
+		[ -n "$X" ] || continue
+		case "$X" in \#*) continue ;; esac
+		t $F $X
+	done
+done
+
 [ 0 -eq $FAIL ] || {
 	echo "FAILED $FAIL PASSED $PASS"
 	exit 1
diff --git a/math/test/ulp.c b/math/test/ulp.c
index cd15dec..aec8aa5 100644
--- a/math/test/ulp.c
+++ b/math/test/ulp.c
@@ -13,6 +13,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include "mathlib.h"
 
 /* Don't depend on mpfr by default.  */
 #ifndef USE_MPFR
@@ -208,11 +209,38 @@ struct conf
   double errlim;
 };
 
+/* A bit of a hack: call vector functions twice with the same
+   input in lane 0 but a different value in other lanes: once
+   with an in-range value and then with a special case value.  */
+static int secondcall;
+
+/* Wrappers for vector functions.  */
+#if __aarch64__
+typedef __f32x4_t v_float;
+typedef __f64x2_t v_double;
+static const float fv[2] = {1.0f, -INFINITY};
+static const double dv[2] = {1.0, -INFINITY};
+static inline v_float argf(float x) { return (v_float){x,x,x,fv[secondcall]}; }
+static inline v_double argd(double x) { return (v_double){x,dv[secondcall]}; }
+
+static float v_expf_1u(float x) { return __v_expf_1u(argf(x))[0]; }
+static float v_expf(float x) { return __v_expf(argf(x))[0]; }
+static double v_exp(double x) { return __v_exp(argd(x))[0]; }
+#ifdef __vpcs
+static float vn_expf_1u(float x) { return __vn_expf_1u(argf(x))[0]; }
+static float vn_expf(float x) { return __vn_expf(argf(x))[0]; }
+static double vn_exp(double x) { return __vn_exp(argd(x))[0]; }
+static float Z_expf(float x) { return _ZGVnN4v_expf(argf(x))[0]; }
+static double Z_exp(double x) { return _ZGVnN2v_exp(argd(x))[0]; }
+#endif
+#endif
+
 struct fun
 {
   const char *name;
   int arity;
   int singleprec;
+  int twice;
   union
   {
     float (*f1) (float);
@@ -240,16 +268,16 @@ struct fun
 
 static const struct fun fun[] = {
 #if USE_MPFR
-# define F(x, x_long, x_mpfr, a, s, t) \
-  {#x, a, s, {.t = x}, {.t = x_long}, {.t = x_mpfr}},
+# define F(x, x_wrap, x_long, x_mpfr, a, s, t, twice) \
+  {#x, a, s, twice, {.t = x_wrap}, {.t = x_long}, {.t = x_mpfr}},
 #else
-# define F(x, x_long, x_mpfr, a, s, t) \
-  {#x, a, s, {.t = x}, {.t = x_long}},
+# define F(x, x_wrap, x_long, x_mpfr, a, s, t, twice) \
+  {#x, a, s, twice, {.t = x_wrap}, {.t = x_long}},
 #endif
-#define F1(x) F (x##f, x, mpfr_##x, 1, 1, f1)
-#define F2(x) F (x##f, x, mpfr_##x, 2, 1, f2)
-#define D1(x) F (x, x##l, mpfr_##x, 1, 0, d1)
-#define D2(x) F (x, x##l, mpfr_##x, 2, 0, d2)
+#define F1(x) F (x##f, x##f, x, mpfr_##x, 1, 1, f1, 0)
+#define F2(x) F (x##f, x##f, x, mpfr_##x, 2, 1, f2, 0)
+#define D1(x) F (x, x, x##l, mpfr_##x, 1, 0, d1, 0)
+#define D2(x) F (x, x, x##l, mpfr_##x, 2, 0, d2, 0)
  F1 (sin)
  F1 (cos)
  F1 (exp)
@@ -262,6 +290,26 @@ static const struct fun fun[] = {
  D1 (log)
  D1 (log2)
  D2 (pow)
+ F (__s_expf_1u, __s_expf_1u, exp, mpfr_exp, 1, 1, f1, 0)
+ F (__s_expf, __s_expf, exp, mpfr_exp, 1, 1, f1, 0)
+ F (__s_exp, __s_exp, expl, mpfr_exp, 1, 0, d1, 0)
+#if __aarch64__
+ F (__v_expf_1u, v_expf_1u, exp, mpfr_exp, 1, 1, f1, 1)
+ F (__v_expf, v_expf, exp, mpfr_exp, 1, 1, f1, 1)
+ F (__v_exp, v_exp, expl, mpfr_exp, 1, 0, d1, 1)
+#ifdef __vpcs
+ F (__vn_expf_1u, vn_expf_1u, exp, mpfr_exp, 1, 1, f1, 1)
+ F (__vn_expf, vn_expf, exp, mpfr_exp, 1, 1, f1, 1)
+ F (__vn_exp, vn_exp, expl, mpfr_exp, 1, 0, d1, 1)
+ F (_ZGVnN4v_expf, Z_expf, exp, mpfr_exp, 1, 1, f1, 1)
+ F (_ZGVnN2v_exp, Z_exp, expl, mpfr_exp, 1, 0, d1, 1)
+#endif
+#endif
+#undef F
+#undef F1
+#undef F2
+#undef D1
+#undef D2
  {0}};
 
 /* Boilerplate for generic calls.  */
diff --git a/math/test/ulp.h b/math/test/ulp.h
index 1f2fe98..a0c3016 100644
--- a/math/test/ulp.h
+++ b/math/test/ulp.h
@@ -257,11 +257,29 @@ static int T(cmp) (const struct fun *f, struct gen *gen,
       struct RT(ret) want;
       struct T(args) a = T(next) (gen);
       int exgot;
+      int exgot2;
       RT(float) ygot;
+      RT(float) ygot2;
+      int fail = 0;
       if (fenv)
 	T(call_fenv) (f, a, r, &ygot, &exgot);
       else
 	T(call_nofenv) (f, a, r, &ygot, &exgot);
+      if (f->twice) {
+	secondcall = 1;
+	if (fenv)
+	  T(call_fenv) (f, a, r, &ygot2, &exgot2);
+	else
+	  T(call_nofenv) (f, a, r, &ygot2, &exgot2);
+	secondcall = 0;
+	if (RT(asuint) (ygot) != RT(asuint) (ygot2))
+	  {
+	    fail = 1;
+	    cntfail++;
+	    T(printcall) (f, a);
+	    printf (" got %a then %a for same input\n", ygot, ygot2);
+	  }
+      }
       cnt++;
       int ok = use_mpfr
 		 ? T(call_mpfr_fix) (f, a, r, &want, ygot, exgot)
@@ -270,7 +288,6 @@ static int T(cmp) (const struct fun *f, struct gen *gen,
       if (!ok)
 	{
 	  int print = 0;
-	  int fail = 0;
 	  double err = RT(ulperr) (ygot, &want, r);
 	  double abserr = fabs (err);
 	  // TODO: count errors below accuracy limit.
@@ -280,8 +297,12 @@ static int T(cmp) (const struct fun *f, struct gen *gen,
 	    cnt2++;
 	  if (abserr > conf->errlim)
 	    {
-	      print = fail = 1;
-	      cntfail++;
+	      print = 1;
+	      if (!fail)
+		{
+		  fail = 1;
+		  cntfail++;
+		}
 	    }
 	  if (abserr > maxerr)
 	    {
diff --git a/math/tools/v_exp.sollya b/math/tools/v_exp.sollya
new file mode 100644
index 0000000..c0abb63
--- /dev/null
+++ b/math/tools/v_exp.sollya
@@ -0,0 +1,30 @@
+// polynomial for approximating e^x
+//
+// Copyright (c) 2019, Arm Limited.
+// SPDX-License-Identifier: MIT
+
+deg = 4; // poly degree
+N = 128; // table entries
+b = log(2)/(2*N);  // interval
+a = -b;
+
+// find polynomial with minimal abs error
+
+// return p that minimizes |exp(x) - poly(x) - x^d*p(x)|
+approx = proc(poly,d) {
+  return remez(exp(x)-poly(x), deg-d, [a;b], x^d, 1e-10);
+};
+
+// first 2 coeffs are fixed, iteratively find optimal double prec coeffs
+poly = 1 + x;
+for i from 2 to deg do {
+  p = roundcoefficients(approx(poly,i), [|D ...|]);
+  poly = poly + x^i*coeff(p,0);
+};
+
+display = hexadecimal;
+print("rel error:", accurateinfnorm(1-poly(x)/exp(x), [a;b], 30));
+print("abs error:", accurateinfnorm(exp(x)-poly(x), [a;b], 30));
+print("in [",a,b,"]");
+print("coeffs:");
+for i from 0 to deg do coeff(poly,i);
diff --git a/math/v_exp.c b/math/v_exp.c
new file mode 100644
index 0000000..e459d53
--- /dev/null
+++ b/math/v_exp.c
@@ -0,0 +1,94 @@
+/*
+ * Double-precision vector e^x function.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#if V_SUPPORTED
+#include "v_exp.h"
+
+#if V_EXP_TABLE_BITS == 7
+/* maxerr: 1.88 +0.5 ulp
+   rel error: 1.4337*2^-53
+   abs error: 1.4299*2^-53 in [ -ln2/256, ln2/256 ].  */
+#define C1 v_f64 (0x1.ffffffffffd43p-2)
+#define C2 v_f64 (0x1.55555c75adbb2p-3)
+#define C3 v_f64 (0x1.55555da646206p-5)
+#define InvLn2 v_f64 (0x1.71547652b82fep7) /* N/ln2.  */
+#define Ln2hi v_f64 (0x1.62e42fefa39efp-8) /* ln2/N.  */
+#define Ln2lo v_f64 (0x1.abc9e3b39803f3p-63)
+#elif V_EXP_TABLE_BITS == 8
+/* maxerr: 0.54 +0.5 ulp
+   rel error: 1.4318*2^-58
+   abs error: 1.4299*2^-58 in [ -ln2/512, ln2/512 ].  */
+#define C1 v_f64 (0x1.fffffffffffd4p-2)
+#define C2 v_f64 (0x1.5555571d6b68cp-3)
+#define C3 v_f64 (0x1.5555576a59599p-5)
+#define InvLn2 v_f64 (0x1.71547652b82fep8)
+#define Ln2hi v_f64 (0x1.62e42fefa39efp-9)
+#define Ln2lo v_f64 (0x1.abc9e3b39803f3p-64)
+#endif
+
+#define N (1 << V_EXP_TABLE_BITS)
+#define Tab __v_exp_data
+#define IndexMask v_u64 (N - 1)
+#define Shift v_f64 (0x1.8p+52)
+#define Thres v_f64 (704.0)
+
+VPCS_ATTR
+static v_f64_t
+specialcase (v_f64_t s, v_f64_t y, v_f64_t n)
+{
+  v_f64_t absn = v_abs_f64 (n);
+
+  /* 2^(n/N) may overflow, break it up into s1*s2.  */
+  v_u64_t b = v_cond_u64 (n <= v_f64 (0.0)) & v_u64 (0x6000000000000000);
+  v_f64_t s1 = v_as_f64_u64 (v_u64 (0x7000000000000000) - b);
+  v_f64_t s2 = v_as_f64_u64 (v_as_u64_f64 (s) - v_u64 (0x3010000000000000) + b);
+  v_u64_t cmp = v_cond_u64 (absn > v_f64 (1280.0 * N));
+  v_f64_t r1 = s1 * s1;
+  v_f64_t r0 = v_fma_f64 (y, s2, s2) * s1;
+  return v_as_f64_u64 ((cmp & v_as_u64_f64 (r1)) | (~cmp & v_as_u64_f64 (r0)));
+}
+
+VPCS_ATTR
+v_f64_t
+V_NAME(exp) (v_f64_t x)
+{
+  v_f64_t n, r, r2, s, y, z;
+  v_u64_t cmp, u, e, i;
+
+  cmp = v_cond_u64 (v_abs_f64 (x) > Thres);
+
+  /* n = round(x/(ln2/N)).  */
+  z = v_fma_f64 (x, InvLn2, Shift);
+  u = v_as_u64_f64 (z);
+  n = z - Shift;
+
+  /* r = x - n*ln2/N.  */
+  r = x;
+  r = v_fma_f64 (-Ln2hi, n, r);
+  r = v_fma_f64 (-Ln2lo, n, r);
+
+  e = u << (52 - V_EXP_TABLE_BITS);
+  i = u & IndexMask;
+
+  /* y = exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4.  */
+  r2 = r * r;
+  y = v_fma_f64 (C2, r, C1);
+  y = v_fma_f64 (C3, r2, y);
+  y = v_fma_f64 (y, r2, r);
+
+  /* s = 2^(n/N).  */
+  u = v_lookup_u64 (Tab, i);
+  s = v_as_f64_u64 (u + e);
+
+  if (unlikely (v_any_u64 (cmp)))
+    return specialcase (s, y, n);
+  return v_fma_f64 (y, s, s);
+}
+VPCS_ALIAS
+#endif
diff --git a/math/v_exp.h b/math/v_exp.h
new file mode 100644
index 0000000..38713d6
--- /dev/null
+++ b/math/v_exp.h
@@ -0,0 +1,12 @@
+/*
+ * Declarations for double-precision e^x vector function.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "v_math.h"
+
+#define V_EXP_TABLE_BITS 7
+
+extern const u64_t __v_exp_data[1 << V_EXP_TABLE_BITS] HIDDEN;
diff --git a/math/v_exp_data.c b/math/v_exp_data.c
new file mode 100644
index 0000000..19b2701
--- /dev/null
+++ b/math/v_exp_data.c
@@ -0,0 +1,401 @@
+/*
+ * Lookup table for double-precision e^x vector function.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "v_exp.h"
+
+#define N (1 << V_EXP_TABLE_BITS)
+
+/* 2^(j/N), j=0..N.  */
+const u64_t __v_exp_data[] = {
+#if N == 128
+0x3ff0000000000000,
+0x3feff63da9fb3335,
+0x3fefec9a3e778061,
+0x3fefe315e86e7f85,
+0x3fefd9b0d3158574,
+0x3fefd06b29ddf6de,
+0x3fefc74518759bc8,
+0x3fefbe3ecac6f383,
+0x3fefb5586cf9890f,
+0x3fefac922b7247f7,
+0x3fefa3ec32d3d1a2,
+0x3fef9b66affed31b,
+0x3fef9301d0125b51,
+0x3fef8abdc06c31cc,
+0x3fef829aaea92de0,
+0x3fef7a98c8a58e51,
+0x3fef72b83c7d517b,
+0x3fef6af9388c8dea,
+0x3fef635beb6fcb75,
+0x3fef5be084045cd4,
+0x3fef54873168b9aa,
+0x3fef4d5022fcd91d,
+0x3fef463b88628cd6,
+0x3fef3f49917ddc96,
+0x3fef387a6e756238,
+0x3fef31ce4fb2a63f,
+0x3fef2b4565e27cdd,
+0x3fef24dfe1f56381,
+0x3fef1e9df51fdee1,
+0x3fef187fd0dad990,
+0x3fef1285a6e4030b,
+0x3fef0cafa93e2f56,
+0x3fef06fe0a31b715,
+0x3fef0170fc4cd831,
+0x3feefc08b26416ff,
+0x3feef6c55f929ff1,
+0x3feef1a7373aa9cb,
+0x3feeecae6d05d866,
+0x3feee7db34e59ff7,
+0x3feee32dc313a8e5,
+0x3feedea64c123422,
+0x3feeda4504ac801c,
+0x3feed60a21f72e2a,
+0x3feed1f5d950a897,
+0x3feece086061892d,
+0x3feeca41ed1d0057,
+0x3feec6a2b5c13cd0,
+0x3feec32af0d7d3de,
+0x3feebfdad5362a27,
+0x3feebcb299fddd0d,
+0x3feeb9b2769d2ca7,
+0x3feeb6daa2cf6642,
+0x3feeb42b569d4f82,
+0x3feeb1a4ca5d920f,
+0x3feeaf4736b527da,
+0x3feead12d497c7fd,
+0x3feeab07dd485429,
+0x3feea9268a5946b7,
+0x3feea76f15ad2148,
+0x3feea5e1b976dc09,
+0x3feea47eb03a5585,
+0x3feea34634ccc320,
+0x3feea23882552225,
+0x3feea155d44ca973,
+0x3feea09e667f3bcd,
+0x3feea012750bdabf,
+0x3fee9fb23c651a2f,
+0x3fee9f7df9519484,
+0x3fee9f75e8ec5f74,
+0x3fee9f9a48a58174,
+0x3fee9feb564267c9,
+0x3feea0694fde5d3f,
+0x3feea11473eb0187,
+0x3feea1ed0130c132,
+0x3feea2f336cf4e62,
+0x3feea427543e1a12,
+0x3feea589994cce13,
+0x3feea71a4623c7ad,
+0x3feea8d99b4492ed,
+0x3feeaac7d98a6699,
+0x3feeace5422aa0db,
+0x3feeaf3216b5448c,
+0x3feeb1ae99157736,
+0x3feeb45b0b91ffc6,
+0x3feeb737b0cdc5e5,
+0x3feeba44cbc8520f,
+0x3feebd829fde4e50,
+0x3feec0f170ca07ba,
+0x3feec49182a3f090,
+0x3feec86319e32323,
+0x3feecc667b5de565,
+0x3feed09bec4a2d33,
+0x3feed503b23e255d,
+0x3feed99e1330b358,
+0x3feede6b5579fdbf,
+0x3feee36bbfd3f37a,
+0x3feee89f995ad3ad,
+0x3feeee07298db666,
+0x3feef3a2b84f15fb,
+0x3feef9728de5593a,
+0x3feeff76f2fb5e47,
+0x3fef05b030a1064a,
+0x3fef0c1e904bc1d2,
+0x3fef12c25bd71e09,
+0x3fef199bdd85529c,
+0x3fef20ab5fffd07a,
+0x3fef27f12e57d14b,
+0x3fef2f6d9406e7b5,
+0x3fef3720dcef9069,
+0x3fef3f0b555dc3fa,
+0x3fef472d4a07897c,
+0x3fef4f87080d89f2,
+0x3fef5818dcfba487,
+0x3fef60e316c98398,
+0x3fef69e603db3285,
+0x3fef7321f301b460,
+0x3fef7c97337b9b5f,
+0x3fef864614f5a129,
+0x3fef902ee78b3ff6,
+0x3fef9a51fbc74c83,
+0x3fefa4afa2a490da,
+0x3fefaf482d8e67f1,
+0x3fefba1bee615a27,
+0x3fefc52b376bba97,
+0x3fefd0765b6e4540,
+0x3fefdbfdad9cbe14,
+0x3fefe7c1819e90d8,
+0x3feff3c22b8f71f1,
+#elif N == 256
+0x3ff0000000000000,
+0x3feffb1afa5abcbf,
+0x3feff63da9fb3335,
+0x3feff168143b0281,
+0x3fefec9a3e778061,
+0x3fefe7d42e11bbcc,
+0x3fefe315e86e7f85,
+0x3fefde5f72f654b1,
+0x3fefd9b0d3158574,
+0x3fefd50a0e3c1f89,
+0x3fefd06b29ddf6de,
+0x3fefcbd42b72a836,
+0x3fefc74518759bc8,
+0x3fefc2bdf66607e0,
+0x3fefbe3ecac6f383,
+0x3fefb9c79b1f3919,
+0x3fefb5586cf9890f,
+0x3fefb0f145e46c85,
+0x3fefac922b7247f7,
+0x3fefa83b23395dec,
+0x3fefa3ec32d3d1a2,
+0x3fef9fa55fdfa9c5,
+0x3fef9b66affed31b,
+0x3fef973028d7233e,
+0x3fef9301d0125b51,
+0x3fef8edbab5e2ab6,
+0x3fef8abdc06c31cc,
+0x3fef86a814f204ab,
+0x3fef829aaea92de0,
+0x3fef7e95934f312e,
+0x3fef7a98c8a58e51,
+0x3fef76a45471c3c2,
+0x3fef72b83c7d517b,
+0x3fef6ed48695bbc0,
+0x3fef6af9388c8dea,
+0x3fef672658375d2f,
+0x3fef635beb6fcb75,
+0x3fef5f99f8138a1c,
+0x3fef5be084045cd4,
+0x3fef582f95281c6b,
+0x3fef54873168b9aa,
+0x3fef50e75eb44027,
+0x3fef4d5022fcd91d,
+0x3fef49c18438ce4d,
+0x3fef463b88628cd6,
+0x3fef42be3578a819,
+0x3fef3f49917ddc96,
+0x3fef3bdda27912d1,
+0x3fef387a6e756238,
+0x3fef351ffb82140a,
+0x3fef31ce4fb2a63f,
+0x3fef2e85711ece75,
+0x3fef2b4565e27cdd,
+0x3fef280e341ddf29,
+0x3fef24dfe1f56381,
+0x3fef21ba7591bb70,
+0x3fef1e9df51fdee1,
+0x3fef1b8a66d10f13,
+0x3fef187fd0dad990,
+0x3fef157e39771b2f,
+0x3fef1285a6e4030b,
+0x3fef0f961f641589,
+0x3fef0cafa93e2f56,
+0x3fef09d24abd886b,
+0x3fef06fe0a31b715,
+0x3fef0432edeeb2fd,
+0x3fef0170fc4cd831,
+0x3feefeb83ba8ea32,
+0x3feefc08b26416ff,
+0x3feef96266e3fa2d,
+0x3feef6c55f929ff1,
+0x3feef431a2de883b,
+0x3feef1a7373aa9cb,
+0x3feeef26231e754a,
+0x3feeecae6d05d866,
+0x3feeea401b7140ef,
+0x3feee7db34e59ff7,
+0x3feee57fbfec6cf4,
+0x3feee32dc313a8e5,
+0x3feee0e544ede173,
+0x3feedea64c123422,
+0x3feedc70df1c5175,
+0x3feeda4504ac801c,
+0x3feed822c367a024,
+0x3feed60a21f72e2a,
+0x3feed3fb2709468a,
+0x3feed1f5d950a897,
+0x3feecffa3f84b9d4,
+0x3feece086061892d,
+0x3feecc2042a7d232,
+0x3feeca41ed1d0057,
+0x3feec86d668b3237,
+0x3feec6a2b5c13cd0,
+0x3feec4e1e192aed2,
+0x3feec32af0d7d3de,
+0x3feec17dea6db7d7,
+0x3feebfdad5362a27,
+0x3feebe41b817c114,
+0x3feebcb299fddd0d,
+0x3feebb2d81d8abff,
+0x3feeb9b2769d2ca7,
+0x3feeb8417f4531ee,
+0x3feeb6daa2cf6642,
+0x3feeb57de83f4eef,
+0x3feeb42b569d4f82,
+0x3feeb2e2f4f6ad27,
+0x3feeb1a4ca5d920f,
+0x3feeb070dde910d2,
+0x3feeaf4736b527da,
+0x3feeae27dbe2c4cf,
+0x3feead12d497c7fd,
+0x3feeac0827ff07cc,
+0x3feeab07dd485429,
+0x3feeaa11fba87a03,
+0x3feea9268a5946b7,
+0x3feea84590998b93,
+0x3feea76f15ad2148,
+0x3feea6a320dceb71,
+0x3feea5e1b976dc09,
+0x3feea52ae6cdf6f4,
+0x3feea47eb03a5585,
+0x3feea3dd1d1929fd,
+0x3feea34634ccc320,
+0x3feea2b9febc8fb7,
+0x3feea23882552225,
+0x3feea1c1c70833f6,
+0x3feea155d44ca973,
+0x3feea0f4b19e9538,
+0x3feea09e667f3bcd,
+0x3feea052fa75173e,
+0x3feea012750bdabf,
+0x3fee9fdcddd47645,
+0x3fee9fb23c651a2f,
+0x3fee9f9298593ae5,
+0x3fee9f7df9519484,
+0x3fee9f7466f42e87,
+0x3fee9f75e8ec5f74,
+0x3fee9f8286ead08a,
+0x3fee9f9a48a58174,
+0x3fee9fbd35d7cbfd,
+0x3fee9feb564267c9,
+0x3feea024b1ab6e09,
+0x3feea0694fde5d3f,
+0x3feea0b938ac1cf6,
+0x3feea11473eb0187,
+0x3feea17b0976cfdb,
+0x3feea1ed0130c132,
+0x3feea26a62ff86f0,
+0x3feea2f336cf4e62,
+0x3feea3878491c491,
+0x3feea427543e1a12,
+0x3feea4d2add106d9,
+0x3feea589994cce13,
+0x3feea64c1eb941f7,
+0x3feea71a4623c7ad,
+0x3feea7f4179f5b21,
+0x3feea8d99b4492ed,
+0x3feea9cad931a436,
+0x3feeaac7d98a6699,
+0x3feeabd0a478580f,
+0x3feeace5422aa0db,
+0x3feeae05bad61778,
+0x3feeaf3216b5448c,
+0x3feeb06a5e0866d9,
+0x3feeb1ae99157736,
+0x3feeb2fed0282c8a,
+0x3feeb45b0b91ffc6,
+0x3feeb5c353aa2fe2,
+0x3feeb737b0cdc5e5,
+0x3feeb8b82b5f98e5,
+0x3feeba44cbc8520f,
+0x3feebbdd9a7670b3,
+0x3feebd829fde4e50,
+0x3feebf33e47a22a2,
+0x3feec0f170ca07ba,
+0x3feec2bb4d53fe0d,
+0x3feec49182a3f090,
+0x3feec674194bb8d5,
+0x3feec86319e32323,
+0x3feeca5e8d07f29e,
+0x3feecc667b5de565,
+0x3feece7aed8eb8bb,
+0x3feed09bec4a2d33,
+0x3feed2c980460ad8,
+0x3feed503b23e255d,
+0x3feed74a8af46052,
+0x3feed99e1330b358,
+0x3feedbfe53c12e59,
+0x3feede6b5579fdbf,
+0x3feee0e521356eba,
+0x3feee36bbfd3f37a,
+0x3feee5ff3a3c2774,
+0x3feee89f995ad3ad,
+0x3feeeb4ce622f2ff,
+0x3feeee07298db666,
+0x3feef0ce6c9a8952,
+0x3feef3a2b84f15fb,
+0x3feef68415b749b1,
+0x3feef9728de5593a,
+0x3feefc6e29f1c52a,
+0x3feeff76f2fb5e47,
+0x3fef028cf22749e4,
+0x3fef05b030a1064a,
+0x3fef08e0b79a6f1f,
+0x3fef0c1e904bc1d2,
+0x3fef0f69c3f3a207,
+0x3fef12c25bd71e09,
+0x3fef16286141b33d,
+0x3fef199bdd85529c,
+0x3fef1d1cd9fa652c,
+0x3fef20ab5fffd07a,
+0x3fef244778fafb22,
+0x3fef27f12e57d14b,
+0x3fef2ba88988c933,
+0x3fef2f6d9406e7b5,
+0x3fef33405751c4db,
+0x3fef3720dcef9069,
+0x3fef3b0f2e6d1675,
+0x3fef3f0b555dc3fa,
+0x3fef43155b5bab74,
+0x3fef472d4a07897c,
+0x3fef4b532b08c968,
+0x3fef4f87080d89f2,
+0x3fef53c8eacaa1d6,
+0x3fef5818dcfba487,
+0x3fef5c76e862e6d3,
+0x3fef60e316c98398,
+0x3fef655d71ff6075,
+0x3fef69e603db3285,
+0x3fef6e7cd63a8315,
+0x3fef7321f301b460,
+0x3fef77d5641c0658,
+0x3fef7c97337b9b5f,
+0x3fef81676b197d17,
+0x3fef864614f5a129,
+0x3fef8b333b16ee12,
+0x3fef902ee78b3ff6,
+0x3fef953924676d76,
+0x3fef9a51fbc74c83,
+0x3fef9f7977cdb740,
+0x3fefa4afa2a490da,
+0x3fefa9f4867cca6e,
+0x3fefaf482d8e67f1,
+0x3fefb4aaa2188510,
+0x3fefba1bee615a27,
+0x3fefbf9c1cb6412a,
+0x3fefc52b376bba97,
+0x3fefcac948dd7274,
+0x3fefd0765b6e4540,
+0x3fefd632798844f8,
+0x3fefdbfdad9cbe14,
+0x3fefe1d802243c89,
+0x3fefe7c1819e90d8,
+0x3fefedba3692d514,
+0x3feff3c22b8f71f1,
+0x3feff9d96b2a23d9,
+#endif
+};
diff --git a/math/v_expf.c b/math/v_expf.c
new file mode 100644
index 0000000..f536701
--- /dev/null
+++ b/math/v_expf.c
@@ -0,0 +1,75 @@
+/*
+ * Single-precision vector e^x function.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#if V_SUPPORTED
+
+static const float Poly[] = {
+  /* maxerr: 1.45358 +0.5 ulp.  */
+  0x1.0e4020p-7f,
+  0x1.573e2ep-5f,
+  0x1.555e66p-3f,
+  0x1.fffdb6p-2f,
+  0x1.ffffecp-1f,
+};
+#define C0 v_f32 (Poly[0])
+#define C1 v_f32 (Poly[1])
+#define C2 v_f32 (Poly[2])
+#define C3 v_f32 (Poly[3])
+#define C4 v_f32 (Poly[4])
+
+#define Shift v_f32 (0x1.8p23f)
+#define InvLn2 v_f32 (0x1.715476p+0f)
+#define Ln2hi v_f32 (0x1.62e4p-1f)
+#define Ln2lo v_f32 (0x1.7f7d1cp-20f)
+
+VPCS_ATTR
+static v_f32_t
+specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale)
+{
+  /* 2^n may overflow, break it up into s1*s2.  */
+  v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x82000000);
+  v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b);
+  v_f32_t s2 = v_as_f32_u32 (e - b);
+  v_u32_t cmp2 = v_cond_u32 (absn > v_f32 (192.0f));
+  v_u32_t r2 = v_as_u32_f32 (s1 * s1);
+  v_u32_t r1 = v_as_u32_f32 (v_fma_f32 (poly, s2, s2) * s1);
+  /* Similar to r1 but avoids double rounding in the subnormal range.  */
+  v_u32_t r0 = v_as_u32_f32 (v_fma_f32 (poly, scale, scale));
+  return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0));
+}
+
+VPCS_ATTR
+v_f32_t
+V_NAME(expf) (v_f32_t x)
+{
+  v_f32_t n, r, r2, scale, p, q, poly, absn, z;
+  v_u32_t cmp, e;
+
+  /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+     x = ln2*n + r, with r in [-ln2/2, ln2/2].  */
+  z = v_fma_f32 (x, InvLn2, Shift);
+  n = z - Shift;
+  r = v_fma_f32 (n, -Ln2hi, x);
+  r = v_fma_f32 (n, -Ln2lo, r);
+  e = v_as_u32_f32 (z) << 23;
+  scale = v_as_f32_u32 (e + v_u32 (0x3f800000));
+  absn = v_abs_f32 (n);
+  cmp = v_cond_u32 (absn > v_f32 (126.0f));
+  r2 = r * r;
+  p = v_fma_f32 (C0, r, C1);
+  q = v_fma_f32 (C2, r, C3);
+  q = v_fma_f32 (p, r2, q);
+  p = C4 * r;
+  poly = v_fma_f32 (q, r2, p);
+  if (unlikely (v_any_u32 (cmp)))
+    return specialcase (poly, n, e, absn, cmp, scale);
+  return v_fma_f32 (poly, scale, scale);
+}
+VPCS_ALIAS
+#endif
diff --git a/math/v_expf_1u.c b/math/v_expf_1u.c
new file mode 100644
index 0000000..37d3d1e
--- /dev/null
+++ b/math/v_expf_1u.c
@@ -0,0 +1,72 @@
+/*
+ * Single-precision vector e^x function.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#if V_SUPPORTED
+
+static const float Poly[] = {
+  /*  maxerr: 0.36565 +0.5 ulp.  */
+  0x1.6a6000p-10f,
+  0x1.12718ep-7f,
+  0x1.555af0p-5f,
+  0x1.555430p-3f,
+  0x1.fffff4p-2f,
+};
+#define C0 v_f32 (Poly[0])
+#define C1 v_f32 (Poly[1])
+#define C2 v_f32 (Poly[2])
+#define C3 v_f32 (Poly[3])
+#define C4 v_f32 (Poly[4])
+
+#define Shift v_f32 (0x1.8p23f)
+#define InvLn2 v_f32 (0x1.715476p+0f)
+#define Ln2hi v_f32 (0x1.62e4p-1f)
+#define Ln2lo v_f32 (0x1.7f7d1cp-20f)
+
+VPCS_ATTR
+static v_f32_t
+specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn)
+{
+  /* 2^n may overflow, break it up into s1*s2.  */
+  v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x83000000);
+  v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b);
+  v_f32_t s2 = v_as_f32_u32 (e - b);
+  v_u32_t cmp = v_cond_u32 (absn > v_f32 (192.0f));
+  v_f32_t r1 = s1 * s1;
+  v_f32_t r0 = poly * s1 * s2;
+  return v_as_f32_u32 ((cmp & v_as_u32_f32 (r1)) | (~cmp & v_as_u32_f32 (r0)));
+}
+
+VPCS_ATTR
+v_f32_t
+V_NAME(expf_1u) (v_f32_t x)
+{
+  v_f32_t n, r, scale, poly, absn, z;
+  v_u32_t cmp, e;
+
+  /* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)]
+     x = ln2*n + r, with r in [-ln2/2, ln2/2].  */
+  z = v_fma_f32 (x, InvLn2, Shift);
+  n = z - Shift;
+  r = v_fma_f32 (n, -Ln2hi, x);
+  r = v_fma_f32 (n, -Ln2lo, r);
+  e = v_as_u32_f32 (z) << 23;
+  scale = v_as_f32_u32 (e + v_u32 (0x3f800000));
+  absn = v_abs_f32 (n);
+  cmp = v_cond_u32 (absn > v_f32 (126.0f));
+  poly = v_fma_f32 (C0, r, C1);
+  poly = v_fma_f32 (poly, r, C2);
+  poly = v_fma_f32 (poly, r, C3);
+  poly = v_fma_f32 (poly, r, C4);
+  poly = v_fma_f32 (poly, r, v_f32 (1.0f));
+  poly = v_fma_f32 (poly, r, v_f32 (1.0f));
+  if (unlikely (v_any_u32 (cmp)))
+    return specialcase (poly, n, e, absn);
+  return scale * poly;
+}
+#endif
diff --git a/math/v_math.h b/math/v_math.h
new file mode 100644
index 0000000..f2bf24e
--- /dev/null
+++ b/math/v_math.h
@@ -0,0 +1,604 @@
+/*
+ * Vector math abstractions.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef _V_MATH_H
+#define _V_MATH_H
+
+/* The goal of this header is to allow vector and scalar
+   build of the same algorithm, the provided intrinsic
+   wrappers are also vector length agnostic so they can
+   be implemented for SVE too (or other simd architectures)
+   and then the code should work on those targets too.  */
+
+#if SCALAR
+#define V_NAME(x) __s_##x
+#elif VPCS && __aarch64__
+#define V_NAME(x) __vn_##x
+#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))
+#else
+#define V_NAME(x) __v_##x
+#endif
+
+#ifndef VPCS_ATTR
+#define VPCS_ATTR
+#endif
+#ifndef VPCS_ALIAS
+#define VPCS_ALIAS
+#endif
+
+#include <stdint.h>
+#include "math_config.h"
+
+typedef float f32_t;
+typedef uint32_t u32_t;
+typedef int32_t s32_t;
+typedef double f64_t;
+typedef uint64_t u64_t;
+typedef int64_t s64_t;
+
+/* reinterpret as type1 from type2.  */
+static inline u32_t
+as_u32_f32 (f32_t x)
+{
+  union { f32_t f; u32_t u; } r = {x};
+  return r.u;
+}
+static inline f32_t
+as_f32_u32 (u32_t x)
+{
+  union { u32_t u; f32_t f; } r = {x};
+  return r.f;
+}
+static inline s32_t
+as_s32_u32 (u32_t x)
+{
+  union { u32_t u; s32_t i; } r = {x};
+  return r.i;
+}
+static inline u32_t
+as_u32_s32 (s32_t x)
+{
+  union { s32_t i; u32_t u; } r = {x};
+  return r.u;
+}
+static inline u64_t
+as_u64_f64 (f64_t x)
+{
+  union { f64_t f; u64_t u; } r = {x};
+  return r.u;
+}
+static inline f64_t
+as_f64_u64 (u64_t x)
+{
+  union { u64_t u; f64_t f; } r = {x};
+  return r.f;
+}
+static inline s64_t
+as_s64_u64 (u64_t x)
+{
+  union { u64_t u; s64_t i; } r = {x};
+  return r.i;
+}
+static inline u64_t
+as_u64_s64 (s64_t x)
+{
+  union { s64_t i; u64_t u; } r = {x};
+  return r.u;
+}
+
+#if SCALAR
+#define V_SUPPORTED 1
+typedef f32_t v_f32_t;
+typedef u32_t v_u32_t;
+typedef s32_t v_s32_t;
+typedef f64_t v_f64_t;
+typedef u64_t v_u64_t;
+typedef s64_t v_s64_t;
+
+static inline int
+v_lanes32 (void)
+{
+  return 1;
+}
+
+static inline v_f32_t
+v_f32 (f32_t x)
+{
+  return x;
+}
+static inline v_u32_t
+v_u32 (u32_t x)
+{
+  return x;
+}
+static inline v_s32_t
+v_s32 (s32_t x)
+{
+  return x;
+}
+
+static inline f32_t
+v_get_f32 (v_f32_t x, int i)
+{
+  return x;
+}
+static inline u32_t
+v_get_u32 (v_u32_t x, int i)
+{
+  return x;
+}
+static inline s32_t
+v_get_s32 (v_s32_t x, int i)
+{
+  return x;
+}
+
+static inline void
+v_set_f32 (v_f32_t *x, int i, f32_t v)
+{
+  *x = v;
+}
+static inline void
+v_set_u32 (v_u32_t *x, int i, u32_t v)
+{
+  *x = v;
+}
+static inline void
+v_set_s32 (v_s32_t *x, int i, s32_t v)
+{
+  *x = v;
+}
+
+/* true if any elements of a v_cond result is non-zero.  */
+static inline int
+v_any_u32 (v_u32_t x)
+{
+  return x != 0;
+}
+/* to wrap the result of relational operators.  */
+static inline v_u32_t
+v_cond_u32 (v_u32_t x)
+{
+  return x ? -1 : 0;
+}
+static inline v_f32_t
+v_abs_f32 (v_f32_t x)
+{
+  return __builtin_fabsf (x);
+}
+static inline v_f32_t
+v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z)
+{
+  return __builtin_fmaf (x, y, z);
+}
+static inline v_f32_t
+v_round_f32 (v_f32_t x)
+{
+  return __builtin_roundf (x);
+}
+static inline v_s32_t
+v_round_s32 (v_f32_t x)
+{
+  return __builtin_lroundf (x); /* relies on -fno-math-errno.  */
+}
+/* convert to type1 from type2.  */
+static inline v_f32_t
+v_to_f32_s32 (v_s32_t x)
+{
+  return x;
+}
+static inline v_f32_t
+v_to_f32_u32 (v_u32_t x)
+{
+  return x;
+}
+/* reinterpret as type1 from type2.  */
+static inline v_u32_t
+v_as_u32_f32 (v_f32_t x)
+{
+  union { v_f32_t f; v_u32_t u; } r = {x};
+  return r.u;
+}
+static inline v_f32_t
+v_as_f32_u32 (v_u32_t x)
+{
+  union { v_u32_t u; v_f32_t f; } r = {x};
+  return r.f;
+}
+static inline v_s32_t
+v_as_s32_u32 (v_u32_t x)
+{
+  union { v_u32_t u; v_s32_t i; } r = {x};
+  return r.i;
+}
+static inline v_u32_t
+v_as_u32_s32 (v_s32_t x)
+{
+  union { v_s32_t i; v_u32_t u; } r = {x};
+  return r.u;
+}
+static inline v_f32_t
+v_lookup_f32 (const f32_t *tab, v_u32_t idx)
+{
+  return tab[idx];
+}
+static inline v_u32_t
+v_lookup_u32 (const u32_t *tab, v_u32_t idx)
+{
+  return tab[idx];
+}
+static inline v_f32_t
+v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p)
+{
+  return f (x);
+}
+static inline v_f32_t
+v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y,
+	     v_u32_t p)
+{
+  return f (x1, x2);
+}
+
+static inline v_f64_t
+v_f64 (f64_t x)
+{
+  return x;
+}
+static inline v_u64_t
+v_u64 (u64_t x)
+{
+  return x;
+}
+static inline v_s64_t
+v_s64 (s64_t x)
+{
+  return x;
+}
+/* true if any elements of a v_cond result is non-zero.  */
+static inline int
+v_any_u64 (v_u64_t x)
+{
+  return x != 0;
+}
+/* to wrap the result of relational operators.  */
+static inline v_u64_t
+v_cond_u64 (v_u64_t x)
+{
+  return x ? -1 : 0;
+}
+static inline v_f64_t
+v_abs_f64 (v_f64_t x)
+{
+  return __builtin_fabs (x);
+}
+static inline v_f64_t
+v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z)
+{
+  return __builtin_fma (x, y, z);
+}
+static inline v_f64_t
+v_round_f64 (v_f64_t x)
+{
+  return __builtin_round (x);
+}
+static inline v_s64_t
+v_round_s64 (v_f64_t x)
+{
+  return __builtin_lround (x); /* relies on -fno-math-errno.  */
+}
+/* convert to type1 from type2.  */
+static inline v_f64_t
+v_to_f64_s64 (v_s64_t x)
+{
+  return x;
+}
+static inline v_f64_t
+v_to_f64_u64 (v_u64_t x)
+{
+  return x;
+}
+/* reinterpret as type1 from type2.  */
+static inline v_u64_t
+v_as_u64_f64 (v_f64_t x)
+{
+  union { v_f64_t f; v_u64_t u; } r = {x};
+  return r.u;
+}
+static inline v_f64_t
+v_as_f64_u64 (v_u64_t x)
+{
+  union { v_u64_t u; v_f64_t f; } r = {x};
+  return r.f;
+}
+static inline v_s64_t
+v_as_s64_u64 (v_u64_t x)
+{
+  union { v_u64_t u; v_s64_t i; } r = {x};
+  return r.i;
+}
+static inline v_u64_t
+v_as_u64_s64 (v_s64_t x)
+{
+  union { v_s64_t i; v_u64_t u; } r = {x};
+  return r.u;
+}
+static inline v_f64_t
+v_lookup_f64 (const f64_t *tab, v_u64_t idx)
+{
+  return tab[idx];
+}
+static inline v_u64_t
+v_lookup_u64 (const u64_t *tab, v_u64_t idx)
+{
+  return tab[idx];
+}
+static inline v_f64_t
+v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p)
+{
+  return f (x);
+}
+
+#elif __aarch64__
+#define V_SUPPORTED 1
+#include <arm_neon.h>
+typedef float32x4_t v_f32_t;
+typedef uint32x4_t v_u32_t;
+typedef int32x4_t v_s32_t;
+typedef float64x2_t v_f64_t;
+typedef uint64x2_t v_u64_t;
+typedef int64x2_t v_s64_t;
+
+static inline int
+v_lanes32 (void)
+{
+  return 4;
+}
+
+static inline v_f32_t
+v_f32 (f32_t x)
+{
+  return (v_f32_t){x, x, x, x};
+}
+static inline v_u32_t
+v_u32 (u32_t x)
+{
+  return (v_u32_t){x, x, x, x};
+}
+static inline v_s32_t
+v_s32 (s32_t x)
+{
+  return (v_s32_t){x, x, x, x};
+}
+
+static inline f32_t
+v_get_f32 (v_f32_t x, int i)
+{
+  return x[i];
+}
+static inline u32_t
+v_get_u32 (v_u32_t x, int i)
+{
+  return x[i];
+}
+static inline s32_t
+v_get_s32 (v_s32_t x, int i)
+{
+  return x[i];
+}
+
+static inline void
+v_set_f32 (v_f32_t *x, int i, f32_t v)
+{
+  (*x)[i] = v;
+}
+static inline void
+v_set_u32 (v_u32_t *x, int i, u32_t v)
+{
+  (*x)[i] = v;
+}
+static inline void
+v_set_s32 (v_s32_t *x, int i, s32_t v)
+{
+  (*x)[i] = v;
+}
+
+/* true if any elements of a v_cond result is non-zero.  */
+static inline int
+v_any_u32 (v_u32_t x)
+{
+  /* assume elements in x are either 0 or -1u.  */
+  return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0;
+}
+/* to wrap the result of relational operators.  */
+static inline v_u32_t
+v_cond_u32 (v_u32_t x)
+{
+  return x;
+}
+static inline v_f32_t
+v_abs_f32 (v_f32_t x)
+{
+  return vabsq_f32 (x);
+}
+static inline v_f32_t
+v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z)
+{
+  return vfmaq_f32 (z, x, y);
+}
+static inline v_f32_t
+v_round_f32 (v_f32_t x)
+{
+  return vrndaq_f32 (x);
+}
+static inline v_s32_t
+v_round_s32 (v_f32_t x)
+{
+  return vcvtaq_s32_f32 (x);
+}
+/* convert to type1 from type2.  */
+static inline v_f32_t
+v_to_f32_s32 (v_s32_t x)
+{
+  return (v_f32_t){x[0], x[1], x[2], x[3]};
+}
+static inline v_f32_t
+v_to_f32_u32 (v_u32_t x)
+{
+  return (v_f32_t){x[0], x[1], x[2], x[3]};
+}
+/* reinterpret as type1 from type2.  */
+static inline v_u32_t
+v_as_u32_f32 (v_f32_t x)
+{
+  union { v_f32_t f; v_u32_t u; } r = {x};
+  return r.u;
+}
+static inline v_f32_t
+v_as_f32_u32 (v_u32_t x)
+{
+  union { v_u32_t u; v_f32_t f; } r = {x};
+  return r.f;
+}
+static inline v_s32_t
+v_as_s32_u32 (v_u32_t x)
+{
+  union { v_u32_t u; v_s32_t i; } r = {x};
+  return r.i;
+}
+static inline v_u32_t
+v_as_u32_s32 (v_s32_t x)
+{
+  union { v_s32_t i; v_u32_t u; } r = {x};
+  return r.u;
+}
+static inline v_f32_t
+v_lookup_f32 (const f32_t *tab, v_u32_t idx)
+{
+  return (v_f32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]};
+}
+static inline v_u32_t
+v_lookup_u32 (const u32_t *tab, v_u32_t idx)
+{
+  return (v_u32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]};
+}
+static inline v_f32_t
+v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p)
+{
+  return (v_f32_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1],
+		   p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]};
+}
+static inline v_f32_t
+v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y,
+	     v_u32_t p)
+{
+  return (
+    v_f32_t){p[0] ? f (x1[0], x2[0]) : y[0], p[1] ? f (x1[1], x2[1]) : y[1],
+	     p[2] ? f (x1[2], x2[2]) : y[2], p[3] ? f (x1[3], x2[3]) : y[3]};
+}
+
+static inline v_f64_t
+v_f64 (f64_t x)
+{
+  return (v_f64_t){x, x};
+}
+static inline v_u64_t
+v_u64 (u64_t x)
+{
+  return (v_u64_t){x, x};
+}
+static inline v_s64_t
+v_s64 (s64_t x)
+{
+  return (v_s64_t){x, x};
+}
+/* true if any elements of a v_cond result is non-zero.  */
+static inline int
+v_any_u64 (v_u64_t x)
+{
+  /* assume elements in x are either 0 or -1u.  */
+  return vpaddd_u64 (x) != 0;
+}
+/* to wrap the result of relational operators.  */
+static inline v_u64_t
+v_cond_u64 (v_u64_t x)
+{
+  return x;
+}
+static inline v_f64_t
+v_abs_f64 (v_f64_t x)
+{
+  return vabsq_f64 (x);
+}
+static inline v_f64_t
+v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z)
+{
+  return vfmaq_f64 (z, x, y);
+}
+static inline v_f64_t
+v_round_f64 (v_f64_t x)
+{
+  return vrndaq_f64 (x);
+}
+static inline v_s64_t
+v_round_s64 (v_f64_t x)
+{
+  return vcvtaq_s64_f64 (x);
+}
+/* convert to type1 from type2.  */
+static inline v_f64_t
+v_to_f64_s64 (v_s64_t x)
+{
+  return (v_f64_t){x[0], x[1]};
+}
+static inline v_f64_t
+v_to_f64_u64 (v_u64_t x)
+{
+  return (v_f64_t){x[0], x[1]};
+}
+/* reinterpret as type1 from type2.  */
+static inline v_u64_t
+v_as_u64_f64 (v_f64_t x)
+{
+  union { v_f64_t f; v_u64_t u; } r = {x};
+  return r.u;
+}
+static inline v_f64_t
+v_as_f64_u64 (v_u64_t x)
+{
+  union { v_u64_t u; v_f64_t f; } r = {x};
+  return r.f;
+}
+static inline v_s64_t
+v_as_s64_u64 (v_u64_t x)
+{
+  union {  v_u64_t u; v_s64_t i; } r = {x};
+  return r.i;
+}
+static inline v_u64_t
+v_as_u64_s64 (v_s64_t x)
+{
+  union { v_s64_t i; v_u64_t u; } r = {x};
+  return r.u;
+}
+static inline v_f64_t
+v_lookup_f64 (const f64_t *tab, v_u64_t idx)
+{
+  return (v_f64_t){tab[idx[0]], tab[idx[1]]};
+}
+static inline v_u64_t
+v_lookup_u64 (const u64_t *tab, v_u64_t idx)
+{
+  return (v_u64_t){tab[idx[0]], tab[idx[1]]};
+}
+static inline v_f64_t
+v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p)
+{
+  return (v_f64_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1]};
+}
+#endif
+
+#endif
diff --git a/math/vn_exp.c b/math/vn_exp.c
new file mode 100644
index 0000000..06e269d
--- /dev/null
+++ b/math/vn_exp.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_exp.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#include "mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS strong_alias (__vn_exp, _ZGVnN2v_exp)
+#include "v_exp.c"
+#endif
diff --git a/math/vn_expf.c b/math/vn_expf.c
new file mode 100644
index 0000000..0652907
--- /dev/null
+++ b/math/vn_expf.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_expf.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#include "mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS strong_alias (__vn_expf, _ZGVnN4v_expf)
+#include "v_expf.c"
+#endif
diff --git a/math/vn_expf_1u.c b/math/vn_expf_1u.c
new file mode 100644
index 0000000..3be7768
--- /dev/null
+++ b/math/vn_expf_1u.c
@@ -0,0 +1,11 @@
+/*
+ * AdvSIMD vector PCS variant of __v_expf_1u.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#include "mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#include "v_expf_1u.c"
+#endif