author     Joe Ramsay <Joe.Ramsay@arm.com>  2022-11-09 14:52:37 +0000
committer  Joe Ramsay <joe.ramsay@arm.com>  2022-11-09 14:52:37 +0000
commit     1721f53563004249849968a4f78a3ed162b5e8e1
tree       9a9d3d450343f4760c1258d16f89b6fe7e995cf4
parent     151020a369757cd33e13ca3dd9dfadfc2a15a905
pl/math: Add vector/Neon expm1
The new routine is a vector port of the scalar algorithm, falling back to the scalar variant for large and special input. This lets us simplify the parts of the algorithm that were only needed to handle large input. It also means that, provided we fall back to the scalar variant for tiny input as well (dependent on the value of WANT_ERRNO), the routine sets fenv flags correctly.
Diffstat (limited to 'pl/math')
-rw-r--r--  pl/math/include/mathlib.h      |   4
-rw-r--r--  pl/math/s_expm1_2u5.c          |   6
-rw-r--r--  pl/math/test/mathbench_funcs.h |   5
-rwxr-xr-x  pl/math/test/runulp.sh         |  14
-rw-r--r--  pl/math/test/ulp_funcs.h       |   3
-rw-r--r--  pl/math/test/ulp_wrappers.h    |   2
-rw-r--r--  pl/math/v_expm1_2u5.c          | 100
-rw-r--r--  pl/math/v_math.h               |  11
-rw-r--r--  pl/math/vn_expm1_2u5.c         |  12
9 files changed, 157 insertions, 0 deletions
diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h
index e0ad61a..9ebe539 100644
--- a/pl/math/include/mathlib.h
+++ b/pl/math/include/mathlib.h
@@ -46,6 +46,7 @@ double __s_atan (double);
double __s_atan2 (double, double);
double __s_erf (double);
double __s_erfc (double);
+double __s_expm1 (double);
double __s_log10 (double);
double __s_log1p (double);
double __s_log2 (double);
@@ -73,6 +74,7 @@ __f64x2_t __v_erf (__f64x2_t);
__f32x4_t __v_erfcf (__f32x4_t);
__f64x2_t __v_erfc (__f64x2_t);
__f32x4_t __v_expm1f (__f32x4_t);
+__f64x2_t __v_expm1 (__f64x2_t);
__f32x4_t __v_log10f (__f32x4_t);
__f64x2_t __v_log10 (__f64x2_t);
__f32x4_t __v_log1pf (__f32x4_t);
@@ -97,6 +99,7 @@ __vpcs __f64x2_t __vn_erf (__f64x2_t);
__vpcs __f32x4_t __vn_erfcf (__f32x4_t);
__vpcs __f64x2_t __vn_erfc (__f64x2_t);
__vpcs __f32x4_t __vn_expm1f (__f32x4_t);
+__vpcs __f64x2_t __vn_expm1 (__f64x2_t);
__vpcs __f32x4_t __vn_log10f (__f32x4_t);
__vpcs __f64x2_t __vn_log10 (__f64x2_t);
__vpcs __f32x4_t __vn_log1pf (__f32x4_t);
@@ -118,6 +121,7 @@ __vpcs __f64x2_t _ZGVnN2v_erf (__f64x2_t);
__vpcs __f32x4_t _ZGVnN4v_erfcf (__f32x4_t);
__vpcs __f64x2_t _ZGVnN2v_erfc (__f64x2_t);
__vpcs __f32x4_t _ZGVnN4v_expm1f (__f32x4_t);
+__vpcs __f64x2_t _ZGVnN2v_expm1 (__f64x2_t);
__vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t);
__vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t);
__vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t);
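
For orientation, a hypothetical caller of the new AdvSIMD entry point could look like the sketch below. This is not part of the patch: it assumes an AArch64 toolchain, a library build with the vector math routines enabled, and GCC/clang vector-extension lane access via []. __f64x2_t comes from mathlib.h itself.

#include <stdio.h>
#include "mathlib.h"

int
main (void)
{
  __f64x2_t x = {0.5, -0.25};
  /* Lanewise expm1 via the vector-PCS symbol declared above.  */
  __f64x2_t y = _ZGVnN2v_expm1 (x);
  printf ("expm1(%g) = %g\nexpm1(%g) = %g\n", x[0], y[0], x[1], y[1]);
  return 0;
}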
diff --git a/pl/math/s_expm1_2u5.c b/pl/math/s_expm1_2u5.c
new file mode 100644
index 0000000..00827da
--- /dev/null
+++ b/pl/math/s_expm1_2u5.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_expm1_2u5.c"
diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h
index 751d5fc..e73206e 100644
--- a/pl/math/test/mathbench_funcs.h
+++ b/pl/math/test/mathbench_funcs.h
@@ -47,6 +47,7 @@ D (__s_erf, -6.0, 6.0)
F (__s_erfcf, -6.0, 28.0)
D (__s_erfc, -6.0, 28.0)
F (__s_expm1f, -9.9, 9.9)
+D (__s_expm1, -9.9, 9.9)
F (__s_log10f, 0.01, 11.1)
D (__s_log10, 0.01, 11.1)
F (__s_log1pf, -0.9, 10.0)
@@ -67,6 +68,7 @@ VD (__v_erf, -6.0, 6.0)
VF (__v_erfcf, -6.0, 28.0)
VD (__v_erfc, -6.0, 28.0)
VF (__v_expm1f, -9.9, 9.9)
+VD (__v_expm1, -9.9, 9.9)
VD (__v_log10, 0.01, 11.1)
VF (__v_log10f, 0.01, 11.1)
VF (__v_log1pf, -0.9, 10.0)
@@ -109,6 +111,9 @@ VND (_ZGVnN2v_erfc, -6.0, 28.0)
VNF (__vn_expm1f, -9.9, 9.9)
VNF (_ZGVnN4v_expm1f, -9.9, 9.9)
+VND (__vn_expm1, -9.9, 9.9)
+VND (_ZGVnN2v_expm1, -9.9, 9.9)
+
VNF (__vn_log10f, 0.01, 11.1)
VNF (_ZGVnN4v_log10f, 0.01, 11.1)
diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh
index 87911a7..c92892b 100755
--- a/pl/math/test/runulp.sh
+++ b/pl/math/test/runulp.sh
@@ -384,6 +384,15 @@ range_coshf='
-0x1.5a92d8p+6 -inf 2000
'
+range_expm1='
+ 0 0x1p-51 1000
+ -0 -0x1p-51 1000
+ 0x1p-51 0x1.63108c75a1937p+9 100000
+ -0x1p-51 -0x1.740bf7c0d927dp+9 100000
+ 0x1.63108c75a1937p+9 inf 100
+ -0x1.740bf7c0d927dp+9 -inf 100
+'
+
range_sve_cosf='
0 0xffff0000 10000
0x1p-4 0x1p4 500000
@@ -546,6 +555,7 @@ L_log1p=1.97
L_expm1f=1.02
L_sinhf=1.76
L_coshf=1.89
+L_expm1=1.68
L_sve_cosf=1.57
L_sve_cos=1.61
@@ -624,6 +634,10 @@ log2 __s_log2 $runs
log2 __v_log2 $runv
log2 __vn_log2 $runvn
log2 _ZGVnN2v_log2 $runvn
+expm1 __s_expm1 $runs fenv
+expm1 __v_expm1 $runv fenv
+expm1 __vn_expm1 $runvn fenv
+expm1 _ZGVnN2v_expm1 $runvn fenv
atanf __s_atanf $runs
atanf __v_atanf $runv
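
A note on the table format, as far as can be inferred from runulp.sh: the first field selects the L_expm1 limit defined above (the ULP error bound handed to the ulp tool), and the trailing fenv token asks the harness to also check fenv side-effects — which the tiny-input scalar fallback in this patch makes it possible to satisfy.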
diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h
index ff94e1d..98b63c8 100644
--- a/pl/math/test/ulp_funcs.h
+++ b/pl/math/test/ulp_funcs.h
@@ -34,6 +34,7 @@ SD1 (erf)
SF1 (erfc)
SD1 (erfc)
SF1 (expm1)
+SD1 (expm1)
SF1 (log10)
SD1 (log10)
SF1 (log1p)
@@ -54,6 +55,7 @@ VD1 (erf)
VF1 (erfc)
VD1 (erfc)
VF1 (expm1)
+VD1 (expm1)
VF1 (log10)
VD1 (log10)
VF1 (log1p)
@@ -74,6 +76,7 @@ ZVND1 (erf)
ZVNF1 (erfc)
ZVND1 (erfc)
ZVNF1 (expm1)
+ZVND1 (expm1)
ZVNF1 (log10)
ZVND1 (log10)
ZVNF1 (log1p)
diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h
index 93cf75e..9c639c1 100644
--- a/pl/math/test/ulp_wrappers.h
+++ b/pl/math/test/ulp_wrappers.h
@@ -113,6 +113,7 @@ VD1_WRAP(atan)
VD2_WRAP(atan2)
VD1_WRAP(erf)
VD1_WRAP(erfc)
+VD1_WRAP(expm1)
VD1_WRAP(log10)
VD1_WRAP(log1p)
VD1_WRAP(log2)
@@ -133,6 +134,7 @@ ZVND1_WRAP(atan)
ZVND2_WRAP(atan2)
ZVND1_WRAP(erf)
ZVND1_WRAP(erfc)
+ZVND1_WRAP(expm1)
ZVND1_WRAP(log10)
ZVND1_WRAP(log1p)
ZVND1_WRAP(log2)
diff --git a/pl/math/v_expm1_2u5.c b/pl/math/v_expm1_2u5.c
new file mode 100644
index 0000000..425ad88
--- /dev/null
+++ b/pl/math/v_expm1_2u5.c
@@ -0,0 +1,100 @@
+/*
+ * Double-precision vector exp(x) - 1 function.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+
+#if V_SUPPORTED
+
+#define InvLn2 v_f64 (0x1.71547652b82fep0)
+#define MLn2hi v_f64 (-0x1.62e42fefa39efp-1)
+#define MLn2lo v_f64 (-0x1.abc9e3b39803fp-56)
+#define Shift v_f64 (0x1.8p52)
+#define TinyBound \
+ 0x3cc0000000000000 /* 0x1p-51, below which expm1(x) is within 2 ULP of x. */
+#define SpecialBound \
+ 0x40862b7d369a5aa9 /* 0x1.62b7d369a5aa9p+9. For |x| > SpecialBound, the \
+ final stage of the algorithm overflows so fall back to \
+ scalar. */
+#define AbsMask 0x7fffffffffffffff
+#define One 0x3ff0000000000000
+
+#define C(i) v_f64 (__expm1_poly[i])
+
+static inline v_f64_t
+eval_poly (v_f64_t f, v_f64_t f2)
+{
+ /* Evaluate custom polynomial using Estrin scheme. */
+ v_f64_t p_01 = v_fma_f64 (f, C (1), C (0));
+ v_f64_t p_23 = v_fma_f64 (f, C (3), C (2));
+ v_f64_t p_45 = v_fma_f64 (f, C (5), C (4));
+ v_f64_t p_67 = v_fma_f64 (f, C (7), C (6));
+ v_f64_t p_89 = v_fma_f64 (f, C (9), C (8));
+
+ v_f64_t p_03 = v_fma_f64 (f2, p_23, p_01);
+ v_f64_t p_47 = v_fma_f64 (f2, p_67, p_45);
+ v_f64_t p_8a = v_fma_f64 (f2, C (10), p_89);
+
+ v_f64_t f4 = f2 * f2;
+ v_f64_t p_07 = v_fma_f64 (f4, p_47, p_03);
+ return v_fma_f64 (f4 * f4, p_8a, p_07);
+}
+
+/* Double-precision vector exp(x) - 1 function.
+ The maximum observed error is 2.18 ULP:
+ __v_expm1(0x1.634ba0c237d7bp-2) got 0x1.a8b9ea8d66e22p-2
+ want 0x1.a8b9ea8d66e2p-2. */
+VPCS_ATTR
+v_f64_t V_NAME (expm1) (v_f64_t x)
+{
+ v_u64_t ix = v_as_u64_f64 (x);
+ v_u64_t ax = ix & AbsMask;
+
+#if WANT_ERRNO
+ /* If errno is to be set correctly, fall back to the scalar variant for all
+ lanes if any of them should trigger an exception. */
+ v_u64_t special = v_cond_u64 ((ax >= SpecialBound) | (ax <= TinyBound));
+ if (unlikely (v_any_u64 (special)))
+ return v_call_f64 (expm1, x, x, v_u64 (-1));
+#else
+ /* Large input, NaNs and Infs. */
+ v_u64_t special = v_cond_u64 (ax >= SpecialBound);
+#endif
+
+ /* Reduce argument to smaller range:
+ Let i = round(x / ln2)
+ and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
+ exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+ where 2^i is exact because i is an integer. */
+ v_f64_t j = v_fma_f64 (InvLn2, x, Shift) - Shift;
+ v_s64_t i = v_to_s64_f64 (j);
+ v_f64_t f = v_fma_f64 (j, MLn2hi, x);
+ f = v_fma_f64 (j, MLn2lo, f);
+
+ /* Approximate expm1(f) using polynomial.
+ Taylor expansion for expm1(x) has the form:
+ x + ax^2 + bx^3 + cx^4 ....
+ So we calculate the polynomial P(f) = a + bf + cf^2 + ...
+ and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
+ v_f64_t f2 = f * f;
+ v_f64_t p = v_fma_f64 (f2, eval_poly (f, f2), f);
+
+ /* Assemble the result.
+ expm1(x) ~= 2^i * (p + 1) - 1
+ Let t = 2^i. */
+ v_f64_t t = v_as_f64_u64 (v_as_u64_s64 (i << 52) + One);
+ /* expm1(x) ~= p * t + (t - 1). */
+ v_f64_t y = v_fma_f64 (p, t, t - 1);
+
+#if !WANT_ERRNO
+ if (unlikely (v_any_u64 (special)))
+ return v_call_f64 (expm1, x, y, special);
+#endif
+
+ return y;
+}
+VPCS_ALIAS
+#endif
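
To make the reduction and reconstruction above easier to follow outside the vector framework, here is a minimal self-contained scalar sketch of the same scheme. It is illustration only: it substitutes a few Taylor terms for the tuned coefficients in __expm1_poly and omits the special-case and errno handling, so its accuracy is far below the routine in this patch.

/* Scalar sketch of the expm1 scheme above -- illustration only.  */
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static double
expm1_sketch (double x)
{
  const double InvLn2 = 0x1.71547652b82fep0;
  const double MLn2hi = -0x1.62e42fefa39efp-1;
  const double MLn2lo = -0x1.abc9e3b39803fp-56;
  const double Shift = 0x1.8p52; /* 2^52 + 2^51.  */

  /* j = round(x / ln2): adding Shift pushes the fractional bits out of
     the significand, so j is exactly integral after the subtraction.  */
  double j = fma (InvLn2, x, Shift) - Shift;
  int64_t i = (int64_t) j; /* Exact, since j is already an integer.  */

  /* f = x - j*ln2 in [-ln2/2, ln2/2], with ln2 split into high and low
     parts so the cancellation in the subtraction stays accurate.  */
  double f = fma (j, MLn2hi, x);
  f = fma (j, MLn2lo, f);

  /* Stand-in for P(f) = (expm1(f) - f) / f^2: a few Taylor terms, where
     the real routine evaluates the tuned __expm1_poly coefficients.  */
  double p = 0.5 + f * (1.0 / 6 + f * (1.0 / 24 + f * (1.0 / 120)));
  p = fma (f * f, p, f); /* expm1(f) ~= f + f^2 * P(f).  */

  /* t = 2^i built directly in the exponent field, then
     expm1(x) = t * (p + 1) - 1 = p * t + (t - 1).  */
  uint64_t tbits = ((uint64_t) i << 52) + 0x3ff0000000000000ULL;
  double t;
  memcpy (&t, &tbits, sizeof t);
  return fma (p, t, t - 1);
}

int
main (void)
{
  for (double x = -4; x <= 4; x += 1.25)
    printf ("x=% .2f  sketch=% .17g  libm=% .17g\n",
            x, expm1_sketch (x), expm1 (x));
  return 0;
}

Compile with e.g. cc sketch.c -lm. On small inputs it agrees with libm's expm1 to roughly five significant digits near the edges of the reduced range — as much as the truncated stand-in polynomial can deliver.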
diff --git a/pl/math/v_math.h b/pl/math/v_math.h
index a3f9c57..d4597c8 100644
--- a/pl/math/v_math.h
+++ b/pl/math/v_math.h
@@ -400,6 +400,12 @@ v_to_f64_u64 (v_u64_t x)
{
return x;
}
+
+static inline v_s64_t
+v_to_s64_f64 (v_f64_t x)
+{
+ return x;
+}
/* reinterpret as type1 from type2. */
static inline v_u64_t
v_as_u64_f64 (v_f64_t x)
@@ -761,6 +767,11 @@ v_to_f64_u64 (v_u64_t x)
{
return (v_f64_t){x[0], x[1]};
}
+static inline v_s64_t
+v_to_s64_f64 (v_f64_t x)
+{
+ return vcvtq_s64_f64 (x);
+}
/* reinterpret as type1 from type2. */
static inline v_u64_t
v_as_u64_f64 (v_f64_t x)
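
Both new v_to_s64_f64 definitions convert with truncation toward zero: the scalar emulation via a plain C conversion, the AdvSIMD build via vcvtq_s64_f64 (FCVTZS). In this use the operand j has already been rounded to an exact integer by the Shift trick, so the conversion is exact and the two builds agree.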
diff --git a/pl/math/vn_expm1_2u5.c b/pl/math/vn_expm1_2u5.c
new file mode 100644
index 0000000..fc88b06
--- /dev/null
+++ b/pl/math/vn_expm1_2u5.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_expm1.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS strong_alias (__vn_expm1, _ZGVnN2v_expm1)
+#include "v_expm1_2u5.c"
+#endif
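
For reference, the alias name follows the AArch64 vector function ABI mangling: _ZGV, then n for AdvSIMD, N for an unmasked variant, 2 for the two-lane vector length, v for a single vector argument, followed by the scalar name expm1.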