author     Szabolcs Nagy <szabolcs.nagy@arm.com>   2019-07-18 12:51:41 +0100
committer  Szabolcs Nagy <szabolcs.nagy@arm.com>   2019-10-14 11:58:46 +0100
commit     7a1f4cfdecb2060158bff1c9428ed2bb5442cc6f (patch)
tree       a2ce7dee1d1dd0af37d23f6263f6a1538baffbf5 /math
parent     a88f3f60c4b9891f12522586a9d445175611e2c8 (diff)
download   arm-optimized-routines-7a1f4cfdecb2060158bff1c9428ed2bb5442cc6f.tar.gz
Add vector exp, expf and related vector math support code
Vector math routines are added to the same libmathlib library as the scalar ones. The difficulty is that they are not always available: the external ABI depends on the compiler version used for the build.

Currently only aarch64 AdvSIMD is supported, with four new sets of symbols: __s_foo is a scalar function with a result identical to the vector one, __v_foo is a vector function using the base PCS, __vn_foo uses the vector PCS, and _ZGV*_foo is the vector ABI symbol alias of __vn_foo, for a scalar math function foo.

The test and benchmark code got extended to handle vector functions.

Vector functions aim for < 5 ulp worst-case error, only support the nearest rounding mode and don't support floating-point exceptions. Vector functions may call scalar functions to handle special cases, but for a single value they should return the same result independently of the values in other vector lanes or of the position of the value in the vector.

The __v_expf and __v_expf_1u polynomials were produced by searching the coefficient space with some heuristics and ideas from https://arxiv.org/abs/1508.03211; their worst-case errors are 1.95 and 0.866 ulp respectively. The exp polynomial was produced by Sollya; it uses a 128-element (1KB) lookup table and has 2.38 ulp worst-case error.
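A minimal caller sketch for the new entry points declared in math/include/mathlib.h below, assuming an AArch64 toolchain new enough that mathlib.h defines __vpcs (lane 0 of each vector variant must agree with the matching __s_ scalar):

#include <stdio.h>
#include "mathlib.h"

int
main (void)
{
  __f32x4_t x = {0.5f, 1.0f, 2.0f, 4.0f};
  __f32x4_t y = __v_expf (x);    /* Base PCS vector call.  */
#ifdef __vpcs
  y = _ZGVnN4v_expf (x);         /* Vector ABI alias of __vn_expf.  */
#endif
  /* Lane 0 matches the scalar variant of the same algorithm.  */
  printf ("%a %a\n", (double) y[0], (double) __s_expf (0.5f));
  return 0;
}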
Diffstat (limited to 'math')
-rw-r--r--  math/include/mathlib.h    |  42
-rw-r--r--  math/s_exp.c              |   6
-rw-r--r--  math/s_expf.c             |   6
-rw-r--r--  math/s_expf_1u.c          |   6
-rw-r--r--  math/test/mathbench.c     | 242
-rwxr-xr-x  math/test/runulp.sh       |  64
-rw-r--r--  math/test/ulp.c           |  64
-rw-r--r--  math/test/ulp.h           |  27
-rw-r--r--  math/tools/v_exp.sollya   |  30
-rw-r--r--  math/v_exp.c              |  94
-rw-r--r--  math/v_exp.h              |  12
-rw-r--r--  math/v_exp_data.c         | 401
-rw-r--r--  math/v_expf.c             |  75
-rw-r--r--  math/v_expf_1u.c          |  72
-rw-r--r--  math/v_math.h             | 604
-rw-r--r--  math/vn_exp.c             |  12
-rw-r--r--  math/vn_expf.c            |  12
-rw-r--r--  math/vn_expf_1u.c         |  11
18 files changed, 1749 insertions, 31 deletions
diff --git a/math/include/mathlib.h b/math/include/mathlib.h
index eed294b..6d62610 100644
--- a/math/include/mathlib.h
+++ b/math/include/mathlib.h
@@ -1,10 +1,13 @@
/*
* Public API.
*
- * Copyright (c) 2015-2018, Arm Limited.
+ * Copyright (c) 2015-2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
+#ifndef _MATHLIB_H
+#define _MATHLIB_H
+
float expf (float);
float exp2f (float);
float logf (float);
@@ -19,3 +22,40 @@ double exp2 (double);
double log (double);
double log2 (double);
double pow (double, double);
+
+/* Scalar functions using the vector algorithm with identical result. */
+float __s_expf (float);
+float __s_expf_1u (float);
+double __s_exp (double);
+
+#if __aarch64__
+#if __GNUC__ >= 5
+typedef __Float32x4_t __f32x4_t;
+typedef __Float64x2_t __f64x2_t;
+#elif __clang_major__*100+__clang_minor__ >= 305
+typedef __attribute__((__neon_vector_type__(4))) float __f32x4_t;
+typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t;
+#else
+#error Unsupported compiler
+#endif
+
+/* Vector functions following the base PCS. */
+__f32x4_t __v_expf (__f32x4_t);
+__f32x4_t __v_expf_1u (__f32x4_t);
+__f64x2_t __v_exp (__f64x2_t);
+
+#if __GNUC__ >= 9 || __clang_major__ >= 8
+#define __vpcs __attribute__((__aarch64_vector_pcs__))
+
+/* Vector functions following the vector PCS. */
+__vpcs __f32x4_t __vn_expf (__f32x4_t);
+__vpcs __f32x4_t __vn_expf_1u (__f32x4_t);
+__vpcs __f64x2_t __vn_exp (__f64x2_t);
+
+/* Vector functions following the vector PCS using ABI names. */
+__vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t);
+__vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t);
+#endif
+#endif
+
+#endif
diff --git a/math/s_exp.c b/math/s_exp.c
new file mode 100644
index 0000000..ac7246b
--- /dev/null
+++ b/math/s_exp.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#define SCALAR 1
+#include "v_exp.c"
diff --git a/math/s_expf.c b/math/s_expf.c
new file mode 100644
index 0000000..3492c46
--- /dev/null
+++ b/math/s_expf.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#define SCALAR 1
+#include "v_expf.c"
diff --git a/math/s_expf_1u.c b/math/s_expf_1u.c
new file mode 100644
index 0000000..eb7bbcb
--- /dev/null
+++ b/math/s_expf_1u.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#define SCALAR 1
+#include "v_expf_1u.c"
diff --git a/math/test/mathbench.c b/math/test/mathbench.c
index 1266d11..be7e439 100644
--- a/math/test/mathbench.c
+++ b/math/test/mathbench.c
@@ -29,6 +29,50 @@ static float Af[N];
static long measurecount = MEASURE;
static long itercount = ITER;
+#if __aarch64__
+typedef __f64x2_t v_double;
+
+#define v_double_len() 2
+
+static inline v_double
+v_double_load (const double *p)
+{
+ return (v_double){p[0], p[1]};
+}
+
+static inline v_double
+v_double_dup (double x)
+{
+ return (v_double){x, x};
+}
+
+typedef __f32x4_t v_float;
+
+#define v_float_len() 4
+
+static inline v_float
+v_float_load (const float *p)
+{
+ return (v_float){p[0], p[1], p[2], p[3]};
+}
+
+static inline v_float
+v_float_dup (float x)
+{
+ return (v_float){x, x, x, x};
+}
+#else
+/* dummy definitions to make things compile. */
+typedef double v_double;
+typedef float v_float;
+#define v_double_len(x) 1
+#define v_double_load(x) (x)[0]
+#define v_double_dup(x) (x)
+#define v_float_len(x) 1
+#define v_float_load(x) (x)[0]
+#define v_float_dup(x) (x)
+#endif
+
static double
dummy (double x)
{
@@ -41,6 +85,32 @@ dummyf (float x)
return x;
}
+static v_double
+__v_dummy (v_double x)
+{
+ return x;
+}
+
+static v_float
+__v_dummyf (v_float x)
+{
+ return x;
+}
+
+#ifdef __vpcs
+__vpcs static v_double
+__vn_dummy (v_double x)
+{
+ return x;
+}
+
+__vpcs static v_float
+__vn_dummyf (v_float x)
+{
+ return x;
+}
+#endif
+
static double
xypow (double x)
{
@@ -89,42 +159,56 @@ static const struct fun
{
const char *name;
int prec;
+ int vec;
double lo;
double hi;
union
{
double (*d) (double);
float (*f) (float);
+ v_double (*vd) (v_double);
+ v_float (*vf) (v_float);
+#ifdef __vpcs
+ __vpcs v_double (*vnd) (v_double);
+ __vpcs v_float (*vnf) (v_float);
+#endif
} fun;
} funtab[] = {
-#define D(func, lo, hi) {#func, 'd', lo, hi, {.d = func}},
-#define F(func, lo, hi) {#func, 'f', lo, hi, {.f = func}},
+#define D(func, lo, hi) {#func, 'd', 0, lo, hi, {.d = func}},
+#define F(func, lo, hi) {#func, 'f', 0, lo, hi, {.f = func}},
+#define VD(func, lo, hi) {#func, 'd', 'v', lo, hi, {.vd = func}},
+#define VF(func, lo, hi) {#func, 'f', 'v', lo, hi, {.vf = func}},
+#define VND(func, lo, hi) {#func, 'd', 'n', lo, hi, {.vnd = func}},
+#define VNF(func, lo, hi) {#func, 'f', 'n', lo, hi, {.vnf = func}},
D (dummy, 1.0, 2.0)
D (exp, -9.9, 9.9)
D (exp, 0.5, 1.0)
+D (__s_exp, -9.9, 9.9)
D (exp2, -9.9, 9.9)
D (log, 0.01, 11.1)
D (log, 0.999, 1.001)
D (log2, 0.01, 11.1)
D (log2, 0.999, 1.001)
-{"pow", 'd', 0.01, 11.1, {.d = xypow}},
+{"pow", 'd', 0, 0.01, 11.1, {.d = xypow}},
D (xpow, 0.01, 11.1)
D (ypow, -9.9, 9.9)
F (dummyf, 1.0, 2.0)
F (expf, -9.9, 9.9)
+F (__s_expf, -9.9, 9.9)
+F (__s_expf_1u, -9.9, 9.9)
F (exp2f, -9.9, 9.9)
F (logf, 0.01, 11.1)
F (log2f, 0.01, 11.1)
-{"powf", 'f', 0.01, 11.1, {.f = xypowf}},
+{"powf", 'f', 0, 0.01, 11.1, {.f = xypowf}},
F (xpowf, 0.01, 11.1)
F (ypowf, -9.9, 9.9)
-{"sincosf", 'f', 0.1, 0.7, {.f = sincosf_wrap}},
-{"sincosf", 'f', 0.8, 3.1, {.f = sincosf_wrap}},
-{"sincosf", 'f', -3.1, 3.1, {.f = sincosf_wrap}},
-{"sincosf", 'f', 3.3, 33.3, {.f = sincosf_wrap}},
-{"sincosf", 'f', 100, 1000, {.f = sincosf_wrap}},
-{"sincosf", 'f', 1e6, 1e32, {.f = sincosf_wrap}},
+{"sincosf", 'f', 0, 0.1, 0.7, {.f = sincosf_wrap}},
+{"sincosf", 'f', 0, 0.8, 3.1, {.f = sincosf_wrap}},
+{"sincosf", 'f', 0, -3.1, 3.1, {.f = sincosf_wrap}},
+{"sincosf", 'f', 0, 3.3, 33.3, {.f = sincosf_wrap}},
+{"sincosf", 'f', 0, 100, 1000, {.f = sincosf_wrap}},
+{"sincosf", 'f', 0, 1e6, 1e32, {.f = sincosf_wrap}},
F (sinf, 0.1, 0.7)
F (sinf, 0.8, 3.1)
F (sinf, -3.1, 3.1)
@@ -137,9 +221,29 @@ F (cosf, -3.1, 3.1)
F (cosf, 3.3, 33.3)
F (cosf, 100, 1000)
F (cosf, 1e6, 1e32)
+#if __aarch64__
+VD (__v_dummy, 1.0, 2.0)
+VD (__v_exp, -9.9, 9.9)
+VF (__v_dummyf, 1.0, 2.0)
+VF (__v_expf, -9.9, 9.9)
+VF (__v_expf_1u, -9.9, 9.9)
+#ifdef __vpcs
+VND (__vn_dummy, 1.0, 2.0)
+VND (__vn_exp, -9.9, 9.9)
+VND (_ZGVnN2v_exp, -9.9, 9.9)
+VNF (__vn_dummyf, 1.0, 2.0)
+VNF (__vn_expf, -9.9, 9.9)
+VNF (_ZGVnN4v_expf, -9.9, 9.9)
+VNF (__vn_expf_1u, -9.9, 9.9)
+#endif
+#endif
{0},
#undef F
#undef D
+#undef VF
+#undef VD
+#undef VNF
+#undef VND
};
static void
@@ -238,6 +342,72 @@ runf_latency (float f (float))
prev = f (Af[i] + prev * z);
}
+static void
+run_v_thruput (v_double f (v_double))
+{
+ for (int i = 0; i < N; i += v_double_len ())
+ f (v_double_load (A+i));
+}
+
+static void
+runf_v_thruput (v_float f (v_float))
+{
+ for (int i = 0; i < N; i += v_float_len ())
+ f (v_float_load (Af+i));
+}
+
+static void
+run_v_latency (v_double f (v_double))
+{
+ v_double z = v_double_dup (zero);
+ v_double prev = z;
+ for (int i = 0; i < N; i += v_double_len ())
+ prev = f (v_double_load (A+i) + prev * z);
+}
+
+static void
+runf_v_latency (v_float f (v_float))
+{
+ v_float z = v_float_dup (zero);
+ v_float prev = z;
+ for (int i = 0; i < N; i += v_float_len ())
+ prev = f (v_float_load (Af+i) + prev * z);
+}
+
+#ifdef __vpcs
+static void
+run_vn_thruput (__vpcs v_double f (v_double))
+{
+ for (int i = 0; i < N; i += v_double_len ())
+ f (v_double_load (A+i));
+}
+
+static void
+runf_vn_thruput (__vpcs v_float f (v_float))
+{
+ for (int i = 0; i < N; i += v_float_len ())
+ f (v_float_load (Af+i));
+}
+
+static void
+run_vn_latency (__vpcs v_double f (v_double))
+{
+ v_double z = v_double_dup (zero);
+ v_double prev = z;
+ for (int i = 0; i < N; i += v_double_len ())
+ prev = f (v_double_load (A+i) + prev * z);
+}
+
+static void
+runf_vn_latency (__vpcs v_float f (v_float))
+{
+ v_float z = v_float_dup (zero);
+ v_float prev = z;
+ for (int i = 0; i < N; i += v_float_len ())
+ prev = f (v_float_load (Af+i) + prev * z);
+}
+#endif
+
static uint64_t
tic (void)
{
@@ -267,20 +437,54 @@ bench1 (const struct fun *f, int type, double lo, double hi)
uint64_t dt = 0;
uint64_t ns100;
const char *s = type == 't' ? "rthruput" : "latency";
+ int vlen = 1;
- if (f->prec == 'd' && type == 't')
+ if (f->vec && f->prec == 'd')
+ vlen = v_double_len();
+ else if (f->vec && f->prec == 'f')
+ vlen = v_float_len();
+
+ if (f->prec == 'd' && type == 't' && f->vec == 0)
TIMEIT (run_thruput, f->fun.d);
- else if (f->prec == 'd' && type == 'l')
+ else if (f->prec == 'd' && type == 'l' && f->vec == 0)
TIMEIT (run_latency, f->fun.d);
- else if (f->prec == 'f' && type == 't')
+ else if (f->prec == 'f' && type == 't' && f->vec == 0)
TIMEIT (runf_thruput, f->fun.f);
- else if (f->prec == 'f' && type == 'l')
+ else if (f->prec == 'f' && type == 'l' && f->vec == 0)
TIMEIT (runf_latency, f->fun.f);
-
- ns100 = (100 * dt + itercount * N / 2) / (itercount * N);
- printf ("%7s %8s: %4u.%02u ns/elem %10llu ns in [%g %g]\n", f->name, s,
- (unsigned) (ns100 / 100), (unsigned) (ns100 % 100),
- (unsigned long long) dt, lo, hi);
+ else if (f->prec == 'd' && type == 't' && f->vec == 'v')
+ TIMEIT (run_v_thruput, f->fun.vd);
+ else if (f->prec == 'd' && type == 'l' && f->vec == 'v')
+ TIMEIT (run_v_latency, f->fun.vd);
+ else if (f->prec == 'f' && type == 't' && f->vec == 'v')
+ TIMEIT (runf_v_thruput, f->fun.vf);
+ else if (f->prec == 'f' && type == 'l' && f->vec == 'v')
+ TIMEIT (runf_v_latency, f->fun.vf);
+#ifdef __vpcs
+ else if (f->prec == 'd' && type == 't' && f->vec == 'n')
+ TIMEIT (run_vn_thruput, f->fun.vnd);
+ else if (f->prec == 'd' && type == 'l' && f->vec == 'n')
+ TIMEIT (run_vn_latency, f->fun.vnd);
+ else if (f->prec == 'f' && type == 't' && f->vec == 'n')
+ TIMEIT (runf_vn_thruput, f->fun.vnf);
+ else if (f->prec == 'f' && type == 'l' && f->vec == 'n')
+ TIMEIT (runf_vn_latency, f->fun.vnf);
+#endif
+
+ if (type == 't')
+ {
+ ns100 = (100 * dt + itercount * N / 2) / (itercount * N);
+ printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g]\n", f->name, s,
+ (unsigned) (ns100 / 100), (unsigned) (ns100 % 100),
+ (unsigned long long) dt, lo, hi);
+ }
+ else if (type == 'l')
+ {
+ ns100 = (100 * dt + itercount * N / vlen / 2) / (itercount * N / vlen);
+ printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g]\n", f->name, s,
+ (unsigned) (ns100 / 100), (unsigned) (ns100 % 100),
+ (unsigned long long) dt, lo, hi);
+ }
fflush (stdout);
}
diff --git a/math/test/runulp.sh b/math/test/runulp.sh
index 64db6f3..b2c1827 100755
--- a/math/test/runulp.sh
+++ b/math/test/runulp.sh
@@ -24,6 +24,10 @@ t() {
$emu ./ulp -r $r -e $Lt $flags "$@" && PASS=$((PASS+1)) || FAIL=$((FAIL+1))
}
+check() {
+ $emu ./ulp -f -q "$@" >/dev/null
+}
+
Ldir=0.5
for r in $rmodes
do
@@ -87,6 +91,66 @@ t powf 0x1.ep-1 0x1.1p0 x 0x1p8 0x1p14 50000
t powf 0x1.ep-1 0x1.1p0 x -0x1p8 -0x1p14 50000
done
+# vector functions
+Ldir=0.5
+r='n'
+flags="${ULPFLAGS:--q} -f"
+runv=
+check __v_exp 1 && runv=1
+runvn=
+check __vn_exp 1 && runvn=1
+
+range_exp='
+ 0 0xffff000000000000 10000
+ 0x1p-6 0x1p6 400000
+ -0x1p-6 -0x1p6 400000
+ 633.3 733.3 10000
+ -633.3 -777.3 10000
+'
+
+range_expf='
+ 0 0xffff0000 10000
+ 0x1p-14 0x1p8 500000
+ -0x1p-14 -0x1p8 500000
+'
+
+range_expf_1u="$range_expf"
+
+# error limits
+L_exp=1.9
+L_expf=1.49
+L_expf_1u=0.4
+
+# columns: group (range/limit key), symbol (function to test), run (enable flag)
+echo "
+exp __s_exp 1
+exp __v_exp $runv
+exp __vn_exp $runvn
+exp _ZGVnN2v_exp $runvn
+
+expf __s_expf 1
+expf __v_expf $runv
+expf __vn_expf $runvn
+expf _ZGVnN4v_expf $runvn
+
+expf_1u __s_expf_1u 1
+expf_1u __v_expf_1u $runv
+expf_1u __vn_expf_1u $runvn
+
+" | while read G F R
+do
+ [ "$R" = 1 ] || continue
+ case "$G" in \#*) continue ;; esac
+ eval range="\${range_$G}"
+ eval L="\${L_$G}"
+ echo "$range" |while read X
+ do
+ [ -n "$X" ] || continue
+ case "$X" in \#*) continue ;; esac
+ t $F $X
+ done
+done
+
[ 0 -eq $FAIL ] || {
echo "FAILED $FAIL PASSED $PASS"
exit 1
diff --git a/math/test/ulp.c b/math/test/ulp.c
index cd15dec..aec8aa5 100644
--- a/math/test/ulp.c
+++ b/math/test/ulp.c
@@ -13,6 +13,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include "mathlib.h"
/* Don't depend on mpfr by default. */
#ifndef USE_MPFR
@@ -208,11 +209,38 @@ struct conf
double errlim;
};
+/* A bit of a hack: call vector functions twice with the same
+   input in lane 0 but a different value in the other lanes: once
+   with an in-range value and once with a special case value.  */
+static int secondcall;
+
+/* Wrappers for vector functions. */
+#if __aarch64__
+typedef __f32x4_t v_float;
+typedef __f64x2_t v_double;
+static const float fv[2] = {1.0f, -INFINITY};
+static const double dv[2] = {1.0, -INFINITY};
+static inline v_float argf(float x) { return (v_float){x,x,x,fv[secondcall]}; }
+static inline v_double argd(double x) { return (v_double){x,dv[secondcall]}; }
+
+static float v_expf_1u(float x) { return __v_expf_1u(argf(x))[0]; }
+static float v_expf(float x) { return __v_expf(argf(x))[0]; }
+static double v_exp(double x) { return __v_exp(argd(x))[0]; }
+#ifdef __vpcs
+static float vn_expf_1u(float x) { return __vn_expf_1u(argf(x))[0]; }
+static float vn_expf(float x) { return __vn_expf(argf(x))[0]; }
+static double vn_exp(double x) { return __vn_exp(argd(x))[0]; }
+static float Z_expf(float x) { return _ZGVnN4v_expf(argf(x))[0]; }
+static double Z_exp(double x) { return _ZGVnN2v_exp(argd(x))[0]; }
+#endif
+#endif
+
struct fun
{
const char *name;
int arity;
int singleprec;
+ int twice;
union
{
float (*f1) (float);
@@ -240,16 +268,16 @@ struct fun
static const struct fun fun[] = {
#if USE_MPFR
-# define F(x, x_long, x_mpfr, a, s, t) \
- {#x, a, s, {.t = x}, {.t = x_long}, {.t = x_mpfr}},
+# define F(x, x_wrap, x_long, x_mpfr, a, s, t, twice) \
+ {#x, a, s, twice, {.t = x_wrap}, {.t = x_long}, {.t = x_mpfr}},
#else
-# define F(x, x_long, x_mpfr, a, s, t) \
- {#x, a, s, {.t = x}, {.t = x_long}},
+# define F(x, x_wrap, x_long, x_mpfr, a, s, t, twice) \
+ {#x, a, s, twice, {.t = x_wrap}, {.t = x_long}},
#endif
-#define F1(x) F (x##f, x, mpfr_##x, 1, 1, f1)
-#define F2(x) F (x##f, x, mpfr_##x, 2, 1, f2)
-#define D1(x) F (x, x##l, mpfr_##x, 1, 0, d1)
-#define D2(x) F (x, x##l, mpfr_##x, 2, 0, d2)
+#define F1(x) F (x##f, x##f, x, mpfr_##x, 1, 1, f1, 0)
+#define F2(x) F (x##f, x##f, x, mpfr_##x, 2, 1, f2, 0)
+#define D1(x) F (x, x, x##l, mpfr_##x, 1, 0, d1, 0)
+#define D2(x) F (x, x, x##l, mpfr_##x, 2, 0, d2, 0)
F1 (sin)
F1 (cos)
F1 (exp)
@@ -262,6 +290,26 @@ static const struct fun fun[] = {
D1 (log)
D1 (log2)
D2 (pow)
+ F (__s_expf_1u, __s_expf_1u, exp, mpfr_exp, 1, 1, f1, 0)
+ F (__s_expf, __s_expf, exp, mpfr_exp, 1, 1, f1, 0)
+ F (__s_exp, __s_exp, expl, mpfr_exp, 1, 0, d1, 0)
+#if __aarch64__
+ F (__v_expf_1u, v_expf_1u, exp, mpfr_exp, 1, 1, f1, 1)
+ F (__v_expf, v_expf, exp, mpfr_exp, 1, 1, f1, 1)
+ F (__v_exp, v_exp, expl, mpfr_exp, 1, 0, d1, 1)
+#ifdef __vpcs
+ F (__vn_expf_1u, vn_expf_1u, exp, mpfr_exp, 1, 1, f1, 1)
+ F (__vn_expf, vn_expf, exp, mpfr_exp, 1, 1, f1, 1)
+ F (__vn_exp, vn_exp, expl, mpfr_exp, 1, 0, d1, 1)
+ F (_ZGVnN4v_expf, Z_expf, exp, mpfr_exp, 1, 1, f1, 1)
+ F (_ZGVnN2v_exp, Z_exp, expl, mpfr_exp, 1, 0, d1, 1)
+#endif
+#endif
+#undef F
+#undef F1
+#undef F2
+#undef D1
+#undef D2
{0}};
/* Boilerplate for generic calls. */
diff --git a/math/test/ulp.h b/math/test/ulp.h
index 1f2fe98..a0c3016 100644
--- a/math/test/ulp.h
+++ b/math/test/ulp.h
@@ -257,11 +257,29 @@ static int T(cmp) (const struct fun *f, struct gen *gen,
struct RT(ret) want;
struct T(args) a = T(next) (gen);
int exgot;
+ int exgot2;
RT(float) ygot;
+ RT(float) ygot2;
+ int fail = 0;
if (fenv)
T(call_fenv) (f, a, r, &ygot, &exgot);
else
T(call_nofenv) (f, a, r, &ygot, &exgot);
+ if (f->twice) {
+ secondcall = 1;
+ if (fenv)
+ T(call_fenv) (f, a, r, &ygot2, &exgot2);
+ else
+ T(call_nofenv) (f, a, r, &ygot2, &exgot2);
+ secondcall = 0;
+ if (RT(asuint) (ygot) != RT(asuint) (ygot2))
+ {
+ fail = 1;
+ cntfail++;
+ T(printcall) (f, a);
+ printf (" got %a then %a for same input\n", ygot, ygot2);
+ }
+ }
cnt++;
int ok = use_mpfr
? T(call_mpfr_fix) (f, a, r, &want, ygot, exgot)
@@ -270,7 +288,6 @@ static int T(cmp) (const struct fun *f, struct gen *gen,
if (!ok)
{
int print = 0;
- int fail = 0;
double err = RT(ulperr) (ygot, &want, r);
double abserr = fabs (err);
// TODO: count errors below accuracy limit.
@@ -280,8 +297,12 @@ static int T(cmp) (const struct fun *f, struct gen *gen,
cnt2++;
if (abserr > conf->errlim)
{
- print = fail = 1;
- cntfail++;
+ print = 1;
+ if (!fail)
+ {
+ fail = 1;
+ cntfail++;
+ }
}
if (abserr > maxerr)
{
diff --git a/math/tools/v_exp.sollya b/math/tools/v_exp.sollya
new file mode 100644
index 0000000..c0abb63
--- /dev/null
+++ b/math/tools/v_exp.sollya
@@ -0,0 +1,30 @@
+// polynomial for approximating e^x
+//
+// Copyright (c) 2019, Arm Limited.
+// SPDX-License-Identifier: MIT
+
+deg = 4; // poly degree
+N = 128; // table entries
+b = log(2)/(2*N); // interval
+a = -b;
+
+// find polynomial with minimal abs error
+
+// return p that minimizes |exp(x) - poly(x) - x^d*p(x)|
+approx = proc(poly,d) {
+ return remez(exp(x)-poly(x), deg-d, [a;b], x^d, 1e-10);
+};
+
+// first 2 coeffs are fixed, iteratively find optimal double prec coeffs
+poly = 1 + x;
+for i from 2 to deg do {
+ p = roundcoefficients(approx(poly,i), [|D ...|]);
+ poly = poly + x^i*coeff(p,0);
+};
+
+display = hexadecimal;
+print("rel error:", accurateinfnorm(1-poly(x)/exp(x), [a;b], 30));
+print("abs error:", accurateinfnorm(exp(x)-poly(x), [a;b], 30));
+print("in [",a,b,"]");
+print("coeffs:");
+for i from 0 to deg do coeff(poly,i);
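Spelling out the loop above: with poly(x) = 1 + x fixed, step i solves a weighted Remez problem for the residual and keeps only the constant term of the minimizer, rounded to double precision, as the degree-i coefficient (notation ours, not the script's):

\[
  p_i = \operatorname*{arg\,min}_{\deg p \le \mathrm{deg}-i} \; \max_{x \in [a,b]} \bigl| e^x - \mathrm{poly}_{i-1}(x) - x^i\, p(x) \bigr|,
  \qquad
  \mathrm{poly}_i(x) = \mathrm{poly}_{i-1}(x) + \mathrm{fl}_{64}\bigl(p_i(0)\bigr)\, x^i .
\]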
diff --git a/math/v_exp.c b/math/v_exp.c
new file mode 100644
index 0000000..e459d53
--- /dev/null
+++ b/math/v_exp.c
@@ -0,0 +1,94 @@
+/*
+ * Double-precision vector e^x function.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#if V_SUPPORTED
+#include "v_exp.h"
+
+#if V_EXP_TABLE_BITS == 7
+/* maxerr: 1.88 +0.5 ulp
+ rel error: 1.4337*2^-53
+ abs error: 1.4299*2^-53 in [ -ln2/256, ln2/256 ]. */
+#define C1 v_f64 (0x1.ffffffffffd43p-2)
+#define C2 v_f64 (0x1.55555c75adbb2p-3)
+#define C3 v_f64 (0x1.55555da646206p-5)
+#define InvLn2 v_f64 (0x1.71547652b82fep7) /* N/ln2. */
+#define Ln2hi v_f64 (0x1.62e42fefa39efp-8) /* ln2/N. */
+#define Ln2lo v_f64 (0x1.abc9e3b39803f3p-63)
+#elif V_EXP_TABLE_BITS == 8
+/* maxerr: 0.54 +0.5 ulp
+ rel error: 1.4318*2^-58
+ abs error: 1.4299*2^-58 in [ -ln2/512, ln2/512 ]. */
+#define C1 v_f64 (0x1.fffffffffffd4p-2)
+#define C2 v_f64 (0x1.5555571d6b68cp-3)
+#define C3 v_f64 (0x1.5555576a59599p-5)
+#define InvLn2 v_f64 (0x1.71547652b82fep8)
+#define Ln2hi v_f64 (0x1.62e42fefa39efp-9)
+#define Ln2lo v_f64 (0x1.abc9e3b39803f3p-64)
+#endif
+
+#define N (1 << V_EXP_TABLE_BITS)
+#define Tab __v_exp_data
+#define IndexMask v_u64 (N - 1)
+#define Shift v_f64 (0x1.8p+52)
+#define Thres v_f64 (704.0)
+
+VPCS_ATTR
+static v_f64_t
+specialcase (v_f64_t s, v_f64_t y, v_f64_t n)
+{
+ v_f64_t absn = v_abs_f64 (n);
+
+ /* 2^(n/N) may overflow, break it up into s1*s2. */
+ v_u64_t b = v_cond_u64 (n <= v_f64 (0.0)) & v_u64 (0x6000000000000000);
+ v_f64_t s1 = v_as_f64_u64 (v_u64 (0x7000000000000000) - b);
+ v_f64_t s2 = v_as_f64_u64 (v_as_u64_f64 (s) - v_u64 (0x3010000000000000) + b);
+ v_u64_t cmp = v_cond_u64 (absn > v_f64 (1280.0 * N));
+ v_f64_t r1 = s1 * s1;
+ v_f64_t r0 = v_fma_f64 (y, s2, s2) * s1;
+ return v_as_f64_u64 ((cmp & v_as_u64_f64 (r1)) | (~cmp & v_as_u64_f64 (r0)));
+}
+
+VPCS_ATTR
+v_f64_t
+V_NAME(exp) (v_f64_t x)
+{
+ v_f64_t n, r, r2, s, y, z;
+ v_u64_t cmp, u, e, i;
+
+ cmp = v_cond_u64 (v_abs_f64 (x) > Thres);
+
+ /* n = round(x/(ln2/N)). */
+ z = v_fma_f64 (x, InvLn2, Shift);
+ u = v_as_u64_f64 (z);
+ n = z - Shift;
+
+ /* r = x - n*ln2/N. */
+ r = x;
+ r = v_fma_f64 (-Ln2hi, n, r);
+ r = v_fma_f64 (-Ln2lo, n, r);
+
+ e = u << (52 - V_EXP_TABLE_BITS);
+ i = u & IndexMask;
+
+ /* y = exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */
+ r2 = r * r;
+ y = v_fma_f64 (C2, r, C1);
+ y = v_fma_f64 (C3, r2, y);
+ y = v_fma_f64 (y, r2, r);
+
+ /* s = 2^(n/N). */
+ u = v_lookup_u64 (Tab, i);
+ s = v_as_f64_u64 (u + e);
+
+ if (unlikely (v_any_u64 (cmp)))
+ return specialcase (s, y, n);
+ return v_fma_f64 (y, s, s);
+}
+VPCS_ALIAS
+#endif
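The core of __v_exp above: write x = n*(ln2/N) + r, so exp(x) = 2^(n/N) * exp(r), and split 2^(n/N) into a table entry for n mod N plus an exponent increment of n/N applied directly to the bit pattern. A scalar sketch of the same steps, with the Ln2lo correction and the special-case path elided and round-to-nearest assumed (illustration only, not the patch's code):

#include <stdint.h>
#include <string.h>

#define TBITS 7                           /* V_EXP_TABLE_BITS.  */
#define TN (1 << TBITS)
extern const uint64_t __v_exp_data[TN];   /* 2^(j/N) entries from this patch.  */

static double
exp_sketch (double x)
{
  /* n = round(x*N/ln2) via the 0x1.8p52 shift trick; the integer n
     also lands in the low mantissa bits of z.  */
  double shift = 0x1.8p52;
  double z = x * 0x1.71547652b82fep7 + shift;   /* x*N/ln2 + Shift.  */
  uint64_t u;
  memcpy (&u, &z, sizeof u);
  double n = z - shift;
  /* r = x - n*ln2/N, in roughly [-ln2/256, ln2/256].  */
  double r = x - n * 0x1.62e42fefa39efp-8;
  /* exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4, coefficients as above.  */
  double r2 = r * r;
  double y = r + r2 * (0x1.ffffffffffd43p-2
                       + r * 0x1.55555c75adbb2p-3
                       + r2 * 0x1.55555da646206p-5);
  /* s = 2^(n/N): table lookup for n mod N, exponent bits from n/N.  */
  uint64_t sbits = __v_exp_data[u & (TN - 1)] + (u << (52 - TBITS));
  double s;
  memcpy (&s, &sbits, sizeof s);
  return s + s * y;                       /* s*(1 + y) ~= s*exp(r).  */
}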
diff --git a/math/v_exp.h b/math/v_exp.h
new file mode 100644
index 0000000..38713d6
--- /dev/null
+++ b/math/v_exp.h
@@ -0,0 +1,12 @@
+/*
+ * Declarations for double-precision e^x vector function.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "v_math.h"
+
+#define V_EXP_TABLE_BITS 7
+
+extern const u64_t __v_exp_data[1 << V_EXP_TABLE_BITS] HIDDEN;
diff --git a/math/v_exp_data.c b/math/v_exp_data.c
new file mode 100644
index 0000000..19b2701
--- /dev/null
+++ b/math/v_exp_data.c
@@ -0,0 +1,401 @@
+/*
+ * Lookup table for double-precision e^x vector function.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "v_exp.h"
+
+#define N (1 << V_EXP_TABLE_BITS)
+
+/* 2^(j/N), j=0..N-1. */
+const u64_t __v_exp_data[] = {
+#if N == 128
+0x3ff0000000000000,
+0x3feff63da9fb3335,
+0x3fefec9a3e778061,
+0x3fefe315e86e7f85,
+0x3fefd9b0d3158574,
+0x3fefd06b29ddf6de,
+0x3fefc74518759bc8,
+0x3fefbe3ecac6f383,
+0x3fefb5586cf9890f,
+0x3fefac922b7247f7,
+0x3fefa3ec32d3d1a2,
+0x3fef9b66affed31b,
+0x3fef9301d0125b51,
+0x3fef8abdc06c31cc,
+0x3fef829aaea92de0,
+0x3fef7a98c8a58e51,
+0x3fef72b83c7d517b,
+0x3fef6af9388c8dea,
+0x3fef635beb6fcb75,
+0x3fef5be084045cd4,
+0x3fef54873168b9aa,
+0x3fef4d5022fcd91d,
+0x3fef463b88628cd6,
+0x3fef3f49917ddc96,
+0x3fef387a6e756238,
+0x3fef31ce4fb2a63f,
+0x3fef2b4565e27cdd,
+0x3fef24dfe1f56381,
+0x3fef1e9df51fdee1,
+0x3fef187fd0dad990,
+0x3fef1285a6e4030b,
+0x3fef0cafa93e2f56,
+0x3fef06fe0a31b715,
+0x3fef0170fc4cd831,
+0x3feefc08b26416ff,
+0x3feef6c55f929ff1,
+0x3feef1a7373aa9cb,
+0x3feeecae6d05d866,
+0x3feee7db34e59ff7,
+0x3feee32dc313a8e5,
+0x3feedea64c123422,
+0x3feeda4504ac801c,
+0x3feed60a21f72e2a,
+0x3feed1f5d950a897,
+0x3feece086061892d,
+0x3feeca41ed1d0057,
+0x3feec6a2b5c13cd0,
+0x3feec32af0d7d3de,
+0x3feebfdad5362a27,
+0x3feebcb299fddd0d,
+0x3feeb9b2769d2ca7,
+0x3feeb6daa2cf6642,
+0x3feeb42b569d4f82,
+0x3feeb1a4ca5d920f,
+0x3feeaf4736b527da,
+0x3feead12d497c7fd,
+0x3feeab07dd485429,
+0x3feea9268a5946b7,
+0x3feea76f15ad2148,
+0x3feea5e1b976dc09,
+0x3feea47eb03a5585,
+0x3feea34634ccc320,
+0x3feea23882552225,
+0x3feea155d44ca973,
+0x3feea09e667f3bcd,
+0x3feea012750bdabf,
+0x3fee9fb23c651a2f,
+0x3fee9f7df9519484,
+0x3fee9f75e8ec5f74,
+0x3fee9f9a48a58174,
+0x3fee9feb564267c9,
+0x3feea0694fde5d3f,
+0x3feea11473eb0187,
+0x3feea1ed0130c132,
+0x3feea2f336cf4e62,
+0x3feea427543e1a12,
+0x3feea589994cce13,
+0x3feea71a4623c7ad,
+0x3feea8d99b4492ed,
+0x3feeaac7d98a6699,
+0x3feeace5422aa0db,
+0x3feeaf3216b5448c,
+0x3feeb1ae99157736,
+0x3feeb45b0b91ffc6,
+0x3feeb737b0cdc5e5,
+0x3feeba44cbc8520f,
+0x3feebd829fde4e50,
+0x3feec0f170ca07ba,
+0x3feec49182a3f090,
+0x3feec86319e32323,
+0x3feecc667b5de565,
+0x3feed09bec4a2d33,
+0x3feed503b23e255d,
+0x3feed99e1330b358,
+0x3feede6b5579fdbf,
+0x3feee36bbfd3f37a,
+0x3feee89f995ad3ad,
+0x3feeee07298db666,
+0x3feef3a2b84f15fb,
+0x3feef9728de5593a,
+0x3feeff76f2fb5e47,
+0x3fef05b030a1064a,
+0x3fef0c1e904bc1d2,
+0x3fef12c25bd71e09,
+0x3fef199bdd85529c,
+0x3fef20ab5fffd07a,
+0x3fef27f12e57d14b,
+0x3fef2f6d9406e7b5,
+0x3fef3720dcef9069,
+0x3fef3f0b555dc3fa,
+0x3fef472d4a07897c,
+0x3fef4f87080d89f2,
+0x3fef5818dcfba487,
+0x3fef60e316c98398,
+0x3fef69e603db3285,
+0x3fef7321f301b460,
+0x3fef7c97337b9b5f,
+0x3fef864614f5a129,
+0x3fef902ee78b3ff6,
+0x3fef9a51fbc74c83,
+0x3fefa4afa2a490da,
+0x3fefaf482d8e67f1,
+0x3fefba1bee615a27,
+0x3fefc52b376bba97,
+0x3fefd0765b6e4540,
+0x3fefdbfdad9cbe14,
+0x3fefe7c1819e90d8,
+0x3feff3c22b8f71f1,
+#elif N == 256
+0x3ff0000000000000,
+0x3feffb1afa5abcbf,
+0x3feff63da9fb3335,
+0x3feff168143b0281,
+0x3fefec9a3e778061,
+0x3fefe7d42e11bbcc,
+0x3fefe315e86e7f85,
+0x3fefde5f72f654b1,
+0x3fefd9b0d3158574,
+0x3fefd50a0e3c1f89,
+0x3fefd06b29ddf6de,
+0x3fefcbd42b72a836,
+0x3fefc74518759bc8,
+0x3fefc2bdf66607e0,
+0x3fefbe3ecac6f383,
+0x3fefb9c79b1f3919,
+0x3fefb5586cf9890f,
+0x3fefb0f145e46c85,
+0x3fefac922b7247f7,
+0x3fefa83b23395dec,
+0x3fefa3ec32d3d1a2,
+0x3fef9fa55fdfa9c5,
+0x3fef9b66affed31b,
+0x3fef973028d7233e,
+0x3fef9301d0125b51,
+0x3fef8edbab5e2ab6,
+0x3fef8abdc06c31cc,
+0x3fef86a814f204ab,
+0x3fef829aaea92de0,
+0x3fef7e95934f312e,
+0x3fef7a98c8a58e51,
+0x3fef76a45471c3c2,
+0x3fef72b83c7d517b,
+0x3fef6ed48695bbc0,
+0x3fef6af9388c8dea,
+0x3fef672658375d2f,
+0x3fef635beb6fcb75,
+0x3fef5f99f8138a1c,
+0x3fef5be084045cd4,
+0x3fef582f95281c6b,
+0x3fef54873168b9aa,
+0x3fef50e75eb44027,
+0x3fef4d5022fcd91d,
+0x3fef49c18438ce4d,
+0x3fef463b88628cd6,
+0x3fef42be3578a819,
+0x3fef3f49917ddc96,
+0x3fef3bdda27912d1,
+0x3fef387a6e756238,
+0x3fef351ffb82140a,
+0x3fef31ce4fb2a63f,
+0x3fef2e85711ece75,
+0x3fef2b4565e27cdd,
+0x3fef280e341ddf29,
+0x3fef24dfe1f56381,
+0x3fef21ba7591bb70,
+0x3fef1e9df51fdee1,
+0x3fef1b8a66d10f13,
+0x3fef187fd0dad990,
+0x3fef157e39771b2f,
+0x3fef1285a6e4030b,
+0x3fef0f961f641589,
+0x3fef0cafa93e2f56,
+0x3fef09d24abd886b,
+0x3fef06fe0a31b715,
+0x3fef0432edeeb2fd,
+0x3fef0170fc4cd831,
+0x3feefeb83ba8ea32,
+0x3feefc08b26416ff,
+0x3feef96266e3fa2d,
+0x3feef6c55f929ff1,
+0x3feef431a2de883b,
+0x3feef1a7373aa9cb,
+0x3feeef26231e754a,
+0x3feeecae6d05d866,
+0x3feeea401b7140ef,
+0x3feee7db34e59ff7,
+0x3feee57fbfec6cf4,
+0x3feee32dc313a8e5,
+0x3feee0e544ede173,
+0x3feedea64c123422,
+0x3feedc70df1c5175,
+0x3feeda4504ac801c,
+0x3feed822c367a024,
+0x3feed60a21f72e2a,
+0x3feed3fb2709468a,
+0x3feed1f5d950a897,
+0x3feecffa3f84b9d4,
+0x3feece086061892d,
+0x3feecc2042a7d232,
+0x3feeca41ed1d0057,
+0x3feec86d668b3237,
+0x3feec6a2b5c13cd0,
+0x3feec4e1e192aed2,
+0x3feec32af0d7d3de,
+0x3feec17dea6db7d7,
+0x3feebfdad5362a27,
+0x3feebe41b817c114,
+0x3feebcb299fddd0d,
+0x3feebb2d81d8abff,
+0x3feeb9b2769d2ca7,
+0x3feeb8417f4531ee,
+0x3feeb6daa2cf6642,
+0x3feeb57de83f4eef,
+0x3feeb42b569d4f82,
+0x3feeb2e2f4f6ad27,
+0x3feeb1a4ca5d920f,
+0x3feeb070dde910d2,
+0x3feeaf4736b527da,
+0x3feeae27dbe2c4cf,
+0x3feead12d497c7fd,
+0x3feeac0827ff07cc,
+0x3feeab07dd485429,
+0x3feeaa11fba87a03,
+0x3feea9268a5946b7,
+0x3feea84590998b93,
+0x3feea76f15ad2148,
+0x3feea6a320dceb71,
+0x3feea5e1b976dc09,
+0x3feea52ae6cdf6f4,
+0x3feea47eb03a5585,
+0x3feea3dd1d1929fd,
+0x3feea34634ccc320,
+0x3feea2b9febc8fb7,
+0x3feea23882552225,
+0x3feea1c1c70833f6,
+0x3feea155d44ca973,
+0x3feea0f4b19e9538,
+0x3feea09e667f3bcd,
+0x3feea052fa75173e,
+0x3feea012750bdabf,
+0x3fee9fdcddd47645,
+0x3fee9fb23c651a2f,
+0x3fee9f9298593ae5,
+0x3fee9f7df9519484,
+0x3fee9f7466f42e87,
+0x3fee9f75e8ec5f74,
+0x3fee9f8286ead08a,
+0x3fee9f9a48a58174,
+0x3fee9fbd35d7cbfd,
+0x3fee9feb564267c9,
+0x3feea024b1ab6e09,
+0x3feea0694fde5d3f,
+0x3feea0b938ac1cf6,
+0x3feea11473eb0187,
+0x3feea17b0976cfdb,
+0x3feea1ed0130c132,
+0x3feea26a62ff86f0,
+0x3feea2f336cf4e62,
+0x3feea3878491c491,
+0x3feea427543e1a12,
+0x3feea4d2add106d9,
+0x3feea589994cce13,
+0x3feea64c1eb941f7,
+0x3feea71a4623c7ad,
+0x3feea7f4179f5b21,
+0x3feea8d99b4492ed,
+0x3feea9cad931a436,
+0x3feeaac7d98a6699,
+0x3feeabd0a478580f,
+0x3feeace5422aa0db,
+0x3feeae05bad61778,
+0x3feeaf3216b5448c,
+0x3feeb06a5e0866d9,
+0x3feeb1ae99157736,
+0x3feeb2fed0282c8a,
+0x3feeb45b0b91ffc6,
+0x3feeb5c353aa2fe2,
+0x3feeb737b0cdc5e5,
+0x3feeb8b82b5f98e5,
+0x3feeba44cbc8520f,
+0x3feebbdd9a7670b3,
+0x3feebd829fde4e50,
+0x3feebf33e47a22a2,
+0x3feec0f170ca07ba,
+0x3feec2bb4d53fe0d,
+0x3feec49182a3f090,
+0x3feec674194bb8d5,
+0x3feec86319e32323,
+0x3feeca5e8d07f29e,
+0x3feecc667b5de565,
+0x3feece7aed8eb8bb,
+0x3feed09bec4a2d33,
+0x3feed2c980460ad8,
+0x3feed503b23e255d,
+0x3feed74a8af46052,
+0x3feed99e1330b358,
+0x3feedbfe53c12e59,
+0x3feede6b5579fdbf,
+0x3feee0e521356eba,
+0x3feee36bbfd3f37a,
+0x3feee5ff3a3c2774,
+0x3feee89f995ad3ad,
+0x3feeeb4ce622f2ff,
+0x3feeee07298db666,
+0x3feef0ce6c9a8952,
+0x3feef3a2b84f15fb,
+0x3feef68415b749b1,
+0x3feef9728de5593a,
+0x3feefc6e29f1c52a,
+0x3feeff76f2fb5e47,
+0x3fef028cf22749e4,
+0x3fef05b030a1064a,
+0x3fef08e0b79a6f1f,
+0x3fef0c1e904bc1d2,
+0x3fef0f69c3f3a207,
+0x3fef12c25bd71e09,
+0x3fef16286141b33d,
+0x3fef199bdd85529c,
+0x3fef1d1cd9fa652c,
+0x3fef20ab5fffd07a,
+0x3fef244778fafb22,
+0x3fef27f12e57d14b,
+0x3fef2ba88988c933,
+0x3fef2f6d9406e7b5,
+0x3fef33405751c4db,
+0x3fef3720dcef9069,
+0x3fef3b0f2e6d1675,
+0x3fef3f0b555dc3fa,
+0x3fef43155b5bab74,
+0x3fef472d4a07897c,
+0x3fef4b532b08c968,
+0x3fef4f87080d89f2,
+0x3fef53c8eacaa1d6,
+0x3fef5818dcfba487,
+0x3fef5c76e862e6d3,
+0x3fef60e316c98398,
+0x3fef655d71ff6075,
+0x3fef69e603db3285,
+0x3fef6e7cd63a8315,
+0x3fef7321f301b460,
+0x3fef77d5641c0658,
+0x3fef7c97337b9b5f,
+0x3fef81676b197d17,
+0x3fef864614f5a129,
+0x3fef8b333b16ee12,
+0x3fef902ee78b3ff6,
+0x3fef953924676d76,
+0x3fef9a51fbc74c83,
+0x3fef9f7977cdb740,
+0x3fefa4afa2a490da,
+0x3fefa9f4867cca6e,
+0x3fefaf482d8e67f1,
+0x3fefb4aaa2188510,
+0x3fefba1bee615a27,
+0x3fefbf9c1cb6412a,
+0x3fefc52b376bba97,
+0x3fefcac948dd7274,
+0x3fefd0765b6e4540,
+0x3fefd632798844f8,
+0x3fefdbfdad9cbe14,
+0x3fefe1d802243c89,
+0x3fefe7c1819e90d8,
+0x3fefedba3692d514,
+0x3feff3c22b8f71f1,
+0x3feff9d96b2a23d9,
+#endif
+};
diff --git a/math/v_expf.c b/math/v_expf.c
new file mode 100644
index 0000000..f536701
--- /dev/null
+++ b/math/v_expf.c
@@ -0,0 +1,75 @@
+/*
+ * Single-precision vector e^x function.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#if V_SUPPORTED
+
+static const float Poly[] = {
+ /* maxerr: 1.45358 +0.5 ulp. */
+ 0x1.0e4020p-7f,
+ 0x1.573e2ep-5f,
+ 0x1.555e66p-3f,
+ 0x1.fffdb6p-2f,
+ 0x1.ffffecp-1f,
+};
+#define C0 v_f32 (Poly[0])
+#define C1 v_f32 (Poly[1])
+#define C2 v_f32 (Poly[2])
+#define C3 v_f32 (Poly[3])
+#define C4 v_f32 (Poly[4])
+
+#define Shift v_f32 (0x1.8p23f)
+#define InvLn2 v_f32 (0x1.715476p+0f)
+#define Ln2hi v_f32 (0x1.62e4p-1f)
+#define Ln2lo v_f32 (0x1.7f7d1cp-20f)
+
+VPCS_ATTR
+static v_f32_t
+specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale)
+{
+ /* 2^n may overflow, break it up into s1*s2. */
+ v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x82000000);
+ v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b);
+ v_f32_t s2 = v_as_f32_u32 (e - b);
+ v_u32_t cmp2 = v_cond_u32 (absn > v_f32 (192.0f));
+ v_u32_t r2 = v_as_u32_f32 (s1 * s1);
+ v_u32_t r1 = v_as_u32_f32 (v_fma_f32 (poly, s2, s2) * s1);
+ /* Similar to r1 but avoids double rounding in the subnormal range. */
+ v_u32_t r0 = v_as_u32_f32 (v_fma_f32 (poly, scale, scale));
+ return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0));
+}
+
+VPCS_ATTR
+v_f32_t
+V_NAME(expf) (v_f32_t x)
+{
+ v_f32_t n, r, r2, scale, p, q, poly, absn, z;
+ v_u32_t cmp, e;
+
+ /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+ x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
+ z = v_fma_f32 (x, InvLn2, Shift);
+ n = z - Shift;
+ r = v_fma_f32 (n, -Ln2hi, x);
+ r = v_fma_f32 (n, -Ln2lo, r);
+ e = v_as_u32_f32 (z) << 23;
+ scale = v_as_f32_u32 (e + v_u32 (0x3f800000));
+ absn = v_abs_f32 (n);
+ cmp = v_cond_u32 (absn > v_f32 (126.0f));
+ r2 = r * r;
+ p = v_fma_f32 (C0, r, C1);
+ q = v_fma_f32 (C2, r, C3);
+ q = v_fma_f32 (p, r2, q);
+ p = C4 * r;
+ poly = v_fma_f32 (q, r2, p);
+ if (unlikely (v_any_u32 (cmp)))
+ return specialcase (poly, n, e, absn, cmp, scale);
+ return v_fma_f32 (poly, scale, scale);
+}
+VPCS_ALIAS
+#endif
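Both expf variants recover n = round(x/ln2) without an explicit round instruction: adding Shift = 0x1.8p23f (1.5*2^23) to a small float rounds the fraction away in round-to-nearest mode, so z - Shift is the rounded value and the low mantissa bits of z hold the same integer in two's complement, which is why e can be formed by shifting the bits of z left by 23. A standalone demonstration of the trick (assumes the default rounding mode):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

int
main (void)
{
  float shift = 0x1.8p23f;
  float x = -2.3f;
  float z = x + shift;          /* Fraction rounded away.  */
  uint32_t u;
  memcpy (&u, &z, sizeof u);
  /* Prints: round(-2.3) = -2, low byte 0xfe (two's complement -2).  */
  printf ("round(%g) = %g, low byte 0x%02x\n",
          (double) x, (double) (z - shift), u & 0xff);
  return 0;
}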
diff --git a/math/v_expf_1u.c b/math/v_expf_1u.c
new file mode 100644
index 0000000..37d3d1e
--- /dev/null
+++ b/math/v_expf_1u.c
@@ -0,0 +1,72 @@
+/*
+ * Single-precision vector e^x function.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#if V_SUPPORTED
+
+static const float Poly[] = {
+ /* maxerr: 0.36565 +0.5 ulp. */
+ 0x1.6a6000p-10f,
+ 0x1.12718ep-7f,
+ 0x1.555af0p-5f,
+ 0x1.555430p-3f,
+ 0x1.fffff4p-2f,
+};
+#define C0 v_f32 (Poly[0])
+#define C1 v_f32 (Poly[1])
+#define C2 v_f32 (Poly[2])
+#define C3 v_f32 (Poly[3])
+#define C4 v_f32 (Poly[4])
+
+#define Shift v_f32 (0x1.8p23f)
+#define InvLn2 v_f32 (0x1.715476p+0f)
+#define Ln2hi v_f32 (0x1.62e4p-1f)
+#define Ln2lo v_f32 (0x1.7f7d1cp-20f)
+
+VPCS_ATTR
+static v_f32_t
+specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn)
+{
+ /* 2^n may overflow, break it up into s1*s2. */
+ v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x83000000);
+ v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b);
+ v_f32_t s2 = v_as_f32_u32 (e - b);
+ v_u32_t cmp = v_cond_u32 (absn > v_f32 (192.0f));
+ v_f32_t r1 = s1 * s1;
+ v_f32_t r0 = poly * s1 * s2;
+ return v_as_f32_u32 ((cmp & v_as_u32_f32 (r1)) | (~cmp & v_as_u32_f32 (r0)));
+}
+
+VPCS_ATTR
+v_f32_t
+V_NAME(expf_1u) (v_f32_t x)
+{
+ v_f32_t n, r, scale, poly, absn, z;
+ v_u32_t cmp, e;
+
+ /* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)]
+ x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
+ z = v_fma_f32 (x, InvLn2, Shift);
+ n = z - Shift;
+ r = v_fma_f32 (n, -Ln2hi, x);
+ r = v_fma_f32 (n, -Ln2lo, r);
+ e = v_as_u32_f32 (z) << 23;
+ scale = v_as_f32_u32 (e + v_u32 (0x3f800000));
+ absn = v_abs_f32 (n);
+ cmp = v_cond_u32 (absn > v_f32 (126.0f));
+ poly = v_fma_f32 (C0, r, C1);
+ poly = v_fma_f32 (poly, r, C2);
+ poly = v_fma_f32 (poly, r, C3);
+ poly = v_fma_f32 (poly, r, C4);
+ poly = v_fma_f32 (poly, r, v_f32 (1.0f));
+ poly = v_fma_f32 (poly, r, v_f32 (1.0f));
+ if (unlikely (v_any_u32 (cmp)))
+ return specialcase (poly, n, e, absn);
+ return scale * poly;
+}
+#endif
diff --git a/math/v_math.h b/math/v_math.h
new file mode 100644
index 0000000..f2bf24e
--- /dev/null
+++ b/math/v_math.h
@@ -0,0 +1,604 @@
+/*
+ * Vector math abstractions.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef _V_MATH_H
+#define _V_MATH_H
+
+/* The goal of this header is to allow a vector and a scalar
+   build of the same algorithm.  The provided intrinsic
+   wrappers are also vector length agnostic, so they can
+   be implemented for SVE too (or other SIMD architectures)
+   and the code should then work on those targets as well.  */
+
+#if SCALAR
+#define V_NAME(x) __s_##x
+#elif VPCS && __aarch64__
+#define V_NAME(x) __vn_##x
+#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))
+#else
+#define V_NAME(x) __v_##x
+#endif
+
+#ifndef VPCS_ATTR
+#define VPCS_ATTR
+#endif
+#ifndef VPCS_ALIAS
+#define VPCS_ALIAS
+#endif
+
+#include <stdint.h>
+#include "math_config.h"
+
+typedef float f32_t;
+typedef uint32_t u32_t;
+typedef int32_t s32_t;
+typedef double f64_t;
+typedef uint64_t u64_t;
+typedef int64_t s64_t;
+
+/* reinterpret as type1 from type2. */
+static inline u32_t
+as_u32_f32 (f32_t x)
+{
+ union { f32_t f; u32_t u; } r = {x};
+ return r.u;
+}
+static inline f32_t
+as_f32_u32 (u32_t x)
+{
+ union { u32_t u; f32_t f; } r = {x};
+ return r.f;
+}
+static inline s32_t
+as_s32_u32 (u32_t x)
+{
+ union { u32_t u; s32_t i; } r = {x};
+ return r.i;
+}
+static inline u32_t
+as_u32_s32 (s32_t x)
+{
+ union { s32_t i; u32_t u; } r = {x};
+ return r.u;
+}
+static inline u64_t
+as_u64_f64 (f64_t x)
+{
+ union { f64_t f; u64_t u; } r = {x};
+ return r.u;
+}
+static inline f64_t
+as_f64_u64 (u64_t x)
+{
+ union { u64_t u; f64_t f; } r = {x};
+ return r.f;
+}
+static inline s64_t
+as_s64_u64 (u64_t x)
+{
+ union { u64_t u; s64_t i; } r = {x};
+ return r.i;
+}
+static inline u64_t
+as_u64_s64 (s64_t x)
+{
+ union { s64_t i; u64_t u; } r = {x};
+ return r.u;
+}
+
+#if SCALAR
+#define V_SUPPORTED 1
+typedef f32_t v_f32_t;
+typedef u32_t v_u32_t;
+typedef s32_t v_s32_t;
+typedef f64_t v_f64_t;
+typedef u64_t v_u64_t;
+typedef s64_t v_s64_t;
+
+static inline int
+v_lanes32 (void)
+{
+ return 1;
+}
+
+static inline v_f32_t
+v_f32 (f32_t x)
+{
+ return x;
+}
+static inline v_u32_t
+v_u32 (u32_t x)
+{
+ return x;
+}
+static inline v_s32_t
+v_s32 (s32_t x)
+{
+ return x;
+}
+
+static inline f32_t
+v_get_f32 (v_f32_t x, int i)
+{
+ return x;
+}
+static inline u32_t
+v_get_u32 (v_u32_t x, int i)
+{
+ return x;
+}
+static inline s32_t
+v_get_s32 (v_s32_t x, int i)
+{
+ return x;
+}
+
+static inline void
+v_set_f32 (v_f32_t *x, int i, f32_t v)
+{
+ *x = v;
+}
+static inline void
+v_set_u32 (v_u32_t *x, int i, u32_t v)
+{
+ *x = v;
+}
+static inline void
+v_set_s32 (v_s32_t *x, int i, s32_t v)
+{
+ *x = v;
+}
+
+/* true if any element of a v_cond result is non-zero. */
+static inline int
+v_any_u32 (v_u32_t x)
+{
+ return x != 0;
+}
+/* to wrap the result of relational operators. */
+static inline v_u32_t
+v_cond_u32 (v_u32_t x)
+{
+ return x ? -1 : 0;
+}
+static inline v_f32_t
+v_abs_f32 (v_f32_t x)
+{
+ return __builtin_fabsf (x);
+}
+static inline v_f32_t
+v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z)
+{
+ return __builtin_fmaf (x, y, z);
+}
+static inline v_f32_t
+v_round_f32 (v_f32_t x)
+{
+ return __builtin_roundf (x);
+}
+static inline v_s32_t
+v_round_s32 (v_f32_t x)
+{
+ return __builtin_lroundf (x); /* relies on -fno-math-errno. */
+}
+/* convert to type1 from type2. */
+static inline v_f32_t
+v_to_f32_s32 (v_s32_t x)
+{
+ return x;
+}
+static inline v_f32_t
+v_to_f32_u32 (v_u32_t x)
+{
+ return x;
+}
+/* reinterpret as type1 from type2. */
+static inline v_u32_t
+v_as_u32_f32 (v_f32_t x)
+{
+ union { v_f32_t f; v_u32_t u; } r = {x};
+ return r.u;
+}
+static inline v_f32_t
+v_as_f32_u32 (v_u32_t x)
+{
+ union { v_u32_t u; v_f32_t f; } r = {x};
+ return r.f;
+}
+static inline v_s32_t
+v_as_s32_u32 (v_u32_t x)
+{
+ union { v_u32_t u; v_s32_t i; } r = {x};
+ return r.i;
+}
+static inline v_u32_t
+v_as_u32_s32 (v_s32_t x)
+{
+ union { v_s32_t i; v_u32_t u; } r = {x};
+ return r.u;
+}
+static inline v_f32_t
+v_lookup_f32 (const f32_t *tab, v_u32_t idx)
+{
+ return tab[idx];
+}
+static inline v_u32_t
+v_lookup_u32 (const u32_t *tab, v_u32_t idx)
+{
+ return tab[idx];
+}
+static inline v_f32_t
+v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p)
+{
+ return f (x);
+}
+static inline v_f32_t
+v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y,
+ v_u32_t p)
+{
+ return f (x1, x2);
+}
+
+static inline v_f64_t
+v_f64 (f64_t x)
+{
+ return x;
+}
+static inline v_u64_t
+v_u64 (u64_t x)
+{
+ return x;
+}
+static inline v_s64_t
+v_s64 (s64_t x)
+{
+ return x;
+}
+/* true if any element of a v_cond result is non-zero. */
+static inline int
+v_any_u64 (v_u64_t x)
+{
+ return x != 0;
+}
+/* to wrap the result of relational operators. */
+static inline v_u64_t
+v_cond_u64 (v_u64_t x)
+{
+ return x ? -1 : 0;
+}
+static inline v_f64_t
+v_abs_f64 (v_f64_t x)
+{
+ return __builtin_fabs (x);
+}
+static inline v_f64_t
+v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z)
+{
+ return __builtin_fma (x, y, z);
+}
+static inline v_f64_t
+v_round_f64 (v_f64_t x)
+{
+ return __builtin_round (x);
+}
+static inline v_s64_t
+v_round_s64 (v_f64_t x)
+{
+ return __builtin_lround (x); /* relies on -fno-math-errno. */
+}
+/* convert to type1 from type2. */
+static inline v_f64_t
+v_to_f64_s64 (v_s64_t x)
+{
+ return x;
+}
+static inline v_f64_t
+v_to_f64_u64 (v_u64_t x)
+{
+ return x;
+}
+/* reinterpret as type1 from type2. */
+static inline v_u64_t
+v_as_u64_f64 (v_f64_t x)
+{
+ union { v_f64_t f; v_u64_t u; } r = {x};
+ return r.u;
+}
+static inline v_f64_t
+v_as_f64_u64 (v_u64_t x)
+{
+ union { v_u64_t u; v_f64_t f; } r = {x};
+ return r.f;
+}
+static inline v_s64_t
+v_as_s64_u64 (v_u64_t x)
+{
+ union { v_u64_t u; v_s64_t i; } r = {x};
+ return r.i;
+}
+static inline v_u64_t
+v_as_u64_s64 (v_s64_t x)
+{
+ union { v_s64_t i; v_u64_t u; } r = {x};
+ return r.u;
+}
+static inline v_f64_t
+v_lookup_f64 (const f64_t *tab, v_u64_t idx)
+{
+ return tab[idx];
+}
+static inline v_u64_t
+v_lookup_u64 (const u64_t *tab, v_u64_t idx)
+{
+ return tab[idx];
+}
+static inline v_f64_t
+v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p)
+{
+ return f (x);
+}
+
+#elif __aarch64__
+#define V_SUPPORTED 1
+#include <arm_neon.h>
+typedef float32x4_t v_f32_t;
+typedef uint32x4_t v_u32_t;
+typedef int32x4_t v_s32_t;
+typedef float64x2_t v_f64_t;
+typedef uint64x2_t v_u64_t;
+typedef int64x2_t v_s64_t;
+
+static inline int
+v_lanes32 (void)
+{
+ return 4;
+}
+
+static inline v_f32_t
+v_f32 (f32_t x)
+{
+ return (v_f32_t){x, x, x, x};
+}
+static inline v_u32_t
+v_u32 (u32_t x)
+{
+ return (v_u32_t){x, x, x, x};
+}
+static inline v_s32_t
+v_s32 (s32_t x)
+{
+ return (v_s32_t){x, x, x, x};
+}
+
+static inline f32_t
+v_get_f32 (v_f32_t x, int i)
+{
+ return x[i];
+}
+static inline u32_t
+v_get_u32 (v_u32_t x, int i)
+{
+ return x[i];
+}
+static inline s32_t
+v_get_s32 (v_s32_t x, int i)
+{
+ return x[i];
+}
+
+static inline void
+v_set_f32 (v_f32_t *x, int i, f32_t v)
+{
+ (*x)[i] = v;
+}
+static inline void
+v_set_u32 (v_u32_t *x, int i, u32_t v)
+{
+ (*x)[i] = v;
+}
+static inline void
+v_set_s32 (v_s32_t *x, int i, s32_t v)
+{
+ (*x)[i] = v;
+}
+
+/* true if any element of a v_cond result is non-zero. */
+static inline int
+v_any_u32 (v_u32_t x)
+{
+ /* assume elements in x are either 0 or -1u. */
+ return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0;
+}
+/* to wrap the result of relational operators. */
+static inline v_u32_t
+v_cond_u32 (v_u32_t x)
+{
+ return x;
+}
+static inline v_f32_t
+v_abs_f32 (v_f32_t x)
+{
+ return vabsq_f32 (x);
+}
+static inline v_f32_t
+v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z)
+{
+ return vfmaq_f32 (z, x, y);
+}
+static inline v_f32_t
+v_round_f32 (v_f32_t x)
+{
+ return vrndaq_f32 (x);
+}
+static inline v_s32_t
+v_round_s32 (v_f32_t x)
+{
+ return vcvtaq_s32_f32 (x);
+}
+/* convert to type1 from type2. */
+static inline v_f32_t
+v_to_f32_s32 (v_s32_t x)
+{
+ return (v_f32_t){x[0], x[1], x[2], x[3]};
+}
+static inline v_f32_t
+v_to_f32_u32 (v_u32_t x)
+{
+ return (v_f32_t){x[0], x[1], x[2], x[3]};
+}
+/* reinterpret as type1 from type2. */
+static inline v_u32_t
+v_as_u32_f32 (v_f32_t x)
+{
+ union { v_f32_t f; v_u32_t u; } r = {x};
+ return r.u;
+}
+static inline v_f32_t
+v_as_f32_u32 (v_u32_t x)
+{
+ union { v_u32_t u; v_f32_t f; } r = {x};
+ return r.f;
+}
+static inline v_s32_t
+v_as_s32_u32 (v_u32_t x)
+{
+ union { v_u32_t u; v_s32_t i; } r = {x};
+ return r.i;
+}
+static inline v_u32_t
+v_as_u32_s32 (v_s32_t x)
+{
+ union { v_s32_t i; v_u32_t u; } r = {x};
+ return r.u;
+}
+static inline v_f32_t
+v_lookup_f32 (const f32_t *tab, v_u32_t idx)
+{
+ return (v_f32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]};
+}
+static inline v_u32_t
+v_lookup_u32 (const u32_t *tab, v_u32_t idx)
+{
+ return (v_u32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]};
+}
+static inline v_f32_t
+v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p)
+{
+ return (v_f32_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1],
+ p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]};
+}
+static inline v_f32_t
+v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y,
+ v_u32_t p)
+{
+ return (
+ v_f32_t){p[0] ? f (x1[0], x2[0]) : y[0], p[1] ? f (x1[1], x2[1]) : y[1],
+ p[2] ? f (x1[2], x2[2]) : y[2], p[3] ? f (x1[3], x2[3]) : y[3]};
+}
+
+static inline v_f64_t
+v_f64 (f64_t x)
+{
+ return (v_f64_t){x, x};
+}
+static inline v_u64_t
+v_u64 (u64_t x)
+{
+ return (v_u64_t){x, x};
+}
+static inline v_s64_t
+v_s64 (s64_t x)
+{
+ return (v_s64_t){x, x};
+}
+/* true if any element of a v_cond result is non-zero. */
+static inline int
+v_any_u64 (v_u64_t x)
+{
+ /* assume elements in x are either 0 or -1u. */
+ return vpaddd_u64 (x) != 0;
+}
+/* to wrap the result of relational operators. */
+static inline v_u64_t
+v_cond_u64 (v_u64_t x)
+{
+ return x;
+}
+static inline v_f64_t
+v_abs_f64 (v_f64_t x)
+{
+ return vabsq_f64 (x);
+}
+static inline v_f64_t
+v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z)
+{
+ return vfmaq_f64 (z, x, y);
+}
+static inline v_f64_t
+v_round_f64 (v_f64_t x)
+{
+ return vrndaq_f64 (x);
+}
+static inline v_s64_t
+v_round_s64 (v_f64_t x)
+{
+ return vcvtaq_s64_f64 (x);
+}
+/* convert to type1 from type2. */
+static inline v_f64_t
+v_to_f64_s64 (v_s64_t x)
+{
+ return (v_f64_t){x[0], x[1]};
+}
+static inline v_f64_t
+v_to_f64_u64 (v_u64_t x)
+{
+ return (v_f64_t){x[0], x[1]};
+}
+/* reinterpret as type1 from type2. */
+static inline v_u64_t
+v_as_u64_f64 (v_f64_t x)
+{
+ union { v_f64_t f; v_u64_t u; } r = {x};
+ return r.u;
+}
+static inline v_f64_t
+v_as_f64_u64 (v_u64_t x)
+{
+ union { v_u64_t u; v_f64_t f; } r = {x};
+ return r.f;
+}
+static inline v_s64_t
+v_as_s64_u64 (v_u64_t x)
+{
+ union { v_u64_t u; v_s64_t i; } r = {x};
+ return r.i;
+}
+static inline v_u64_t
+v_as_u64_s64 (v_s64_t x)
+{
+ union { v_s64_t i; v_u64_t u; } r = {x};
+ return r.u;
+}
+static inline v_f64_t
+v_lookup_f64 (const f64_t *tab, v_u64_t idx)
+{
+ return (v_f64_t){tab[idx[0]], tab[idx[1]]};
+}
+static inline v_u64_t
+v_lookup_u64 (const u64_t *tab, v_u64_t idx)
+{
+ return (v_u64_t){tab[idx[0]], tab[idx[1]]};
+}
+static inline v_f64_t
+v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p)
+{
+ return (v_f64_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1]};
+}
+#endif
+
+#endif
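v_math.h is what lets one source file produce all three symbol sets: SCALAR=1 maps the v_* types and wrappers to plain scalars (yielding __s_foo), the default AArch64 build uses NEON (yielding __v_foo), and VPCS=1 adds the vector PCS attribute plus the ABI alias (yielding __vn_foo and _ZGV*_foo). A toy routine written against the wrappers, using the illustrative name axpb which is not part of the patch, compiles unchanged in every mode:

#include "v_math.h"
#if V_SUPPORTED

/* Computes 2*x + 1 across all lanes; builds as __s_axpb, __v_axpb
   or __vn_axpb depending on SCALAR/VPCS, like v_exp.c above.  */
VPCS_ATTR
v_f64_t
V_NAME(axpb) (v_f64_t x)
{
  return v_fma_f64 (v_f64 (2.0), x, v_f64 (1.0));
}
VPCS_ALIAS
#endif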
diff --git a/math/vn_exp.c b/math/vn_exp.c
new file mode 100644
index 0000000..06e269d
--- /dev/null
+++ b/math/vn_exp.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_exp.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#include "mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS strong_alias (__vn_exp, _ZGVnN2v_exp)
+#include "v_exp.c"
+#endif
diff --git a/math/vn_expf.c b/math/vn_expf.c
new file mode 100644
index 0000000..0652907
--- /dev/null
+++ b/math/vn_expf.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_expf.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#include "mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS strong_alias (__vn_expf, _ZGVnN4v_expf)
+#include "v_expf.c"
+#endif
diff --git a/math/vn_expf_1u.c b/math/vn_expf_1u.c
new file mode 100644
index 0000000..3be7768
--- /dev/null
+++ b/math/vn_expf_1u.c
@@ -0,0 +1,11 @@
+/*
+ * AdvSIMD vector PCS variant of __v_expf_1u.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#include "mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#include "v_expf_1u.c"
+#endif