author     Joe Ramsay <Joe.Ramsay@arm.com>  2022-11-09 14:52:37 +0000
committer  Joe Ramsay <joe.ramsay@arm.com>  2022-11-09 14:52:37 +0000
commit     1721f53563004249849968a4f78a3ed162b5e8e1
tree       9a9d3d450343f4760c1258d16f89b6fe7e995cf4
parent     151020a369757cd33e13ca3dd9dfadfc2a15a905
pl/math: Add vector/Neon expm1
The new routine is a vector port of the scalar algorithm, falling back to the scalar variant for large and special input. This lets us simplify the parts of the algorithm that were only needed to handle large input. It also means that, provided we fall back to the scalar variant for tiny input as well (dependent on the value of WANT_ERRNO), the routine sets fenv flags correctly.
Diffstat (limited to 'pl/math')
-rw-r--r--  pl/math/include/mathlib.h      |   4
-rw-r--r--  pl/math/s_expm1_2u5.c          |   6
-rw-r--r--  pl/math/test/mathbench_funcs.h |   5
-rwxr-xr-x  pl/math/test/runulp.sh         |  14
-rw-r--r--  pl/math/test/ulp_funcs.h       |   3
-rw-r--r--  pl/math/test/ulp_wrappers.h    |   2
-rw-r--r--  pl/math/v_expm1_2u5.c          | 100
-rw-r--r--  pl/math/v_math.h               |  11
-rw-r--r--  pl/math/vn_expm1_2u5.c         |  12
9 files changed, 157 insertions, 0 deletions
diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h
index e0ad61a..9ebe539 100644
--- a/pl/math/include/mathlib.h
+++ b/pl/math/include/mathlib.h
@@ -46,6 +46,7 @@ double __s_atan (double);
double __s_atan2 (double, double);
double __s_erf (double);
double __s_erfc (double);
+double __s_expm1 (double);
double __s_log10 (double);
double __s_log1p (double);
double __s_log2 (double);
@@ -73,6 +74,7 @@ __f64x2_t __v_erf (__f64x2_t);
__f32x4_t __v_erfcf (__f32x4_t);
__f64x2_t __v_erfc (__f64x2_t);
__f32x4_t __v_expm1f (__f32x4_t);
+__f64x2_t __v_expm1 (__f64x2_t);
__f32x4_t __v_log10f (__f32x4_t);
__f64x2_t __v_log10 (__f64x2_t);
__f32x4_t __v_log1pf (__f32x4_t);
@@ -97,6 +99,7 @@ __vpcs __f64x2_t __vn_erf (__f64x2_t);
__vpcs __f32x4_t __vn_erfcf (__f32x4_t);
__vpcs __f64x2_t __vn_erfc (__f64x2_t);
__vpcs __f32x4_t __vn_expm1f (__f32x4_t);
+__vpcs __f64x2_t __vn_expm1 (__f64x2_t);
__vpcs __f32x4_t __vn_log10f (__f32x4_t);
__vpcs __f64x2_t __vn_log10 (__f64x2_t);
__vpcs __f32x4_t __vn_log1pf (__f32x4_t);
@@ -118,6 +121,7 @@ __vpcs __f64x2_t _ZGVnN2v_erf (__f64x2_t);
__vpcs __f32x4_t _ZGVnN4v_erfcf (__f32x4_t);
__vpcs __f64x2_t _ZGVnN2v_erfc (__f64x2_t);
__vpcs __f32x4_t _ZGVnN4v_expm1f (__f32x4_t);
+__vpcs __f64x2_t _ZGVnN2v_expm1 (__f64x2_t);
__vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t);
__vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t);
__vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t);
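
For orientation, a hypothetical caller of the new AdvSIMD entry point could look like the sketch below. This is not part of the patch: it assumes an AArch64 toolchain, a library build with the vector math routines enabled, and GCC/clang vector-extension lane access via []. __f64x2_t comes from mathlib.h itself.

#include <stdio.h>
#include "mathlib.h"

int
main (void)
{
  __f64x2_t x = {0.5, -0.25};
  /* Lanewise expm1 via the vector-PCS symbol declared above.  */
  __f64x2_t y = _ZGVnN2v_expm1 (x);
  printf ("expm1(%g) = %g\nexpm1(%g) = %g\n", x[0], y[0], x[1], y[1]);
  return 0;
}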
diff --git a/pl/math/s_expm1_2u5.c b/pl/math/s_expm1_2u5.c
new file mode 100644
index 0000000..00827da
--- /dev/null
+++ b/pl/math/s_expm1_2u5.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_expm1_2u5.c"
diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h
index 751d5fc..e73206e 100644
--- a/pl/math/test/mathbench_funcs.h
+++ b/pl/math/test/mathbench_funcs.h
@@ -47,6 +47,7 @@ D (__s_erf, -6.0, 6.0)
F (__s_erfcf, -6.0, 28.0)
D (__s_erfc, -6.0, 28.0)
F (__s_expm1f, -9.9, 9.9)
+D (__s_expm1, -9.9, 9.9)
F (__s_log10f, 0.01, 11.1)
D (__s_log10, 0.01, 11.1)
F (__s_log1pf, -0.9, 10.0)
@@ -67,6 +68,7 @@ VD (__v_erf, -6.0, 6.0)
VF (__v_erfcf, -6.0, 28.0)
VD (__v_erfc, -6.0, 28.0)
VF (__v_expm1f, -9.9, 9.9)
+VD (__v_expm1, -9.9, 9.9)
VD (__v_log10, 0.01, 11.1)
VF (__v_log10f, 0.01, 11.1)
VF (__v_log1pf, -0.9, 10.0)
@@ -109,6 +111,9 @@ VND (_ZGVnN2v_erfc, -6.0, 28.0)
VNF (__vn_expm1f, -9.9, 9.9)
VNF (_ZGVnN4v_expm1f, -9.9, 9.9)
+VND (__vn_expm1, -9.9, 9.9)
+VND (_ZGVnN2v_expm1, -9.9, 9.9)
+
VNF (__vn_log10f, 0.01, 11.1)
VNF (_ZGVnN4v_log10f, 0.01, 11.1)
diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh
index 87911a7..c92892b 100755
--- a/pl/math/test/runulp.sh
+++ b/pl/math/test/runulp.sh
@@ -384,6 +384,15 @@ range_coshf='
-0x1.5a92d8p+6 -inf 2000
'
+range_expm1='
+ 0 0x1p-51 1000
+ -0 -0x1p-51 1000
+ 0x1p-51 0x1.63108c75a1937p+9 100000
+ -0x1p-51 -0x1.740bf7c0d927dp+9 100000
+ 0x1.63108c75a1937p+9 inf 100
+ -0x1.740bf7c0d927dp+9 -inf 100
+'
+
range_sve_cosf='
0 0xffff0000 10000
0x1p-4 0x1p4 500000
@@ -546,6 +555,7 @@ L_log1p=1.97
L_expm1f=1.02
L_sinhf=1.76
L_coshf=1.89
+L_expm1=1.68
L_sve_cosf=1.57
L_sve_cos=1.61
@@ -624,6 +634,10 @@ log2 __s_log2 $runs
log2 __v_log2 $runv
log2 __vn_log2 $runvn
log2 _ZGVnN2v_log2 $runvn
+expm1 __s_expm1 $runs fenv
+expm1 __v_expm1 $runv fenv
+expm1 __vn_expm1 $runvn fenv
+expm1 _ZGVnN2v_expm1 $runvn fenv
atanf __s_atanf $runs
atanf __v_atanf $runv
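
A note on the table format, as far as can be inferred from runulp.sh: the first field selects the L_expm1 limit defined above (the ULP error bound handed to the ulp tool), and the trailing fenv token asks the harness to also check fenv side-effects — which the tiny-input scalar fallback in this patch makes it possible to satisfy.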
diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h
index ff94e1d..98b63c8 100644
--- a/pl/math/test/ulp_funcs.h
+++ b/pl/math/test/ulp_funcs.h
@@ -34,6 +34,7 @@ SD1 (erf)
SF1 (erfc)
SD1 (erfc)
SF1 (expm1)
+SD1 (expm1)
SF1 (log10)
SD1 (log10)
SF1 (log1p)
@@ -54,6 +55,7 @@ VD1 (erf)
VF1 (erfc)
VD1 (erfc)
VF1 (expm1)
+VD1 (expm1)
VF1 (log10)
VD1 (log10)
VF1 (log1p)
@@ -74,6 +76,7 @@ ZVND1 (erf)
ZVNF1 (erfc)
ZVND1 (erfc)
ZVNF1 (expm1)
+ZVND1 (expm1)
ZVNF1 (log10)
ZVND1 (log10)
ZVNF1 (log1p)
diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h
index 93cf75e..9c639c1 100644
--- a/pl/math/test/ulp_wrappers.h
+++ b/pl/math/test/ulp_wrappers.h
@@ -113,6 +113,7 @@ VD1_WRAP(atan)
VD2_WRAP(atan2)
VD1_WRAP(erf)
VD1_WRAP(erfc)
+VD1_WRAP(expm1)
VD1_WRAP(log10)
VD1_WRAP(log1p)
VD1_WRAP(log2)
@@ -133,6 +134,7 @@ ZVND1_WRAP(atan)
ZVND2_WRAP(atan2)
ZVND1_WRAP(erf)
ZVND1_WRAP(erfc)
+ZVND1_WRAP(expm1)
ZVND1_WRAP(log10)
ZVND1_WRAP(log1p)
ZVND1_WRAP(log2)
diff --git a/pl/math/v_expm1_2u5.c b/pl/math/v_expm1_2u5.c
new file mode 100644
index 0000000..425ad88
--- /dev/null
+++ b/pl/math/v_expm1_2u5.c
@@ -0,0 +1,100 @@
+/*
+ * Double-precision vector exp(x) - 1 function.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+
+#if V_SUPPORTED
+
+#define InvLn2 v_f64 (0x1.71547652b82fep0)
+#define MLn2hi v_f64 (-0x1.62e42fefa39efp-1)
+#define MLn2lo v_f64 (-0x1.abc9e3b39803fp-56)
+#define Shift v_f64 (0x1.8p52)
+#define TinyBound \
+ 0x3cc0000000000000 /* 0x1p-51, below which expm1(x) is within 2 ULP of x. */
+#define SpecialBound \
+ 0x40862b7d369a5aa9 /* 0x1.62b7d369a5aa9p+9. For |x| > SpecialBound, the \
+ final stage of the algorithm overflows so fall back to \
+ scalar. */
+#define AbsMask 0x7fffffffffffffff
+#define One 0x3ff0000000000000
+
+#define C(i) v_f64 (__expm1_poly[i])
+
+static inline v_f64_t
+eval_poly (v_f64_t f, v_f64_t f2)
+{
+ /* Evaluate custom polynomial using Estrin scheme. */
+ v_f64_t p_01 = v_fma_f64 (f, C (1), C (0));
+ v_f64_t p_23 = v_fma_f64 (f, C (3), C (2));
+ v_f64_t p_45 = v_fma_f64 (f, C (5), C (4));
+ v_f64_t p_67 = v_fma_f64 (f, C (7), C (6));
+ v_f64_t p_89 = v_fma_f64 (f, C (9), C (8));
+
+ v_f64_t p_03 = v_fma_f64 (f2, p_23, p_01);
+ v_f64_t p_47 = v_fma_f64 (f2, p_67, p_45);
+ v_f64_t p_8a = v_fma_f64 (f2, C (10), p_89);
+
+ v_f64_t f4 = f2 * f2;
+ v_f64_t p_07 = v_fma_f64 (f4, p_47, p_03);
+ return v_fma_f64 (f4 * f4, p_8a, p_07);
+}
+
+/* Double-precision vector exp(x) - 1 function.
+ The maximum observed error is 2.18 ULP:
+ __v_expm1(0x1.634ba0c237d7bp-2) got 0x1.a8b9ea8d66e22p-2
+ want 0x1.a8b9ea8d66e2p-2. */
+VPCS_ATTR
+v_f64_t V_NAME (expm1) (v_f64_t x)
+{
+ v_u64_t ix = v_as_u64_f64 (x);
+ v_u64_t ax = ix & AbsMask;
+
+#if WANT_ERRNO
+ /* If errno is to be set correctly, fall back to the scalar variant for all
+ lanes if any of them should trigger an exception. */
+ v_u64_t special = v_cond_u64 ((ax >= SpecialBound) | (ax <= TinyBound));
+ if (unlikely (v_any_u64 (special)))
+ return v_call_f64 (expm1, x, x, v_u64 (-1));
+#else
+ /* Large input, NaNs and Infs. */
+ v_u64_t special = v_cond_u64 (ax >= SpecialBound);
+#endif
+
+ /* Reduce argument to smaller range:
+ Let i = round(x / ln2)
+ and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
+ exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+ where 2^i is exact because i is an integer. */
+ v_f64_t j = v_fma_f64 (InvLn2, x, Shift) - Shift;
+ v_s64_t i = v_to_s64_f64 (j);
+ v_f64_t f = v_fma_f64 (j, MLn2hi, x);
+ f = v_fma_f64 (j, MLn2lo, f);
+
+ /* Approximate expm1(f) using polynomial.
+ Taylor expansion for expm1(x) has the form:
+ x + ax^2 + bx^3 + cx^4 ....
+ So we calculate the polynomial P(f) = a + bf + cf^2 + ...
+ and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
+ v_f64_t f2 = f * f;
+ v_f64_t p = v_fma_f64 (f2, eval_poly (f, f2), f);
+
+ /* Assemble the result.
+ expm1(x) ~= 2^i * (p + 1) - 1
+ Let t = 2^i. */
+ v_f64_t t = v_as_f64_u64 (v_as_u64_s64 (i << 52) + One);
+ /* expm1(x) ~= p * t + (t - 1). */
+ v_f64_t y = v_fma_f64 (p, t, t - 1);
+
+#if !WANT_ERRNO
+ if (unlikely (v_any_u64 (special)))
+ return v_call_f64 (expm1, x, y, special);
+#endif
+
+ return y;
+}
+VPCS_ALIAS
+#endif
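
To make the reduction and reconstruction above easier to follow outside the vector framework, here is a minimal self-contained scalar sketch of the same scheme. It is illustration only: it substitutes a few Taylor terms for the tuned coefficients in __expm1_poly and omits the special-case and errno handling, so its accuracy is far below the routine in this patch.

/* Scalar sketch of the expm1 scheme above -- illustration only.  */
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static double
expm1_sketch (double x)
{
  const double InvLn2 = 0x1.71547652b82fep0;
  const double MLn2hi = -0x1.62e42fefa39efp-1;
  const double MLn2lo = -0x1.abc9e3b39803fp-56;
  const double Shift = 0x1.8p52; /* 2^52 + 2^51.  */

  /* j = round(x / ln2): adding Shift pushes the fractional bits out of
     the significand, so j is exactly integral after the subtraction.  */
  double j = fma (InvLn2, x, Shift) - Shift;
  int64_t i = (int64_t) j; /* Exact, since j is already an integer.  */

  /* f = x - j*ln2 in [-ln2/2, ln2/2], with ln2 split into high and low
     parts so the cancellation in the subtraction stays accurate.  */
  double f = fma (j, MLn2hi, x);
  f = fma (j, MLn2lo, f);

  /* Stand-in for P(f) = (expm1(f) - f) / f^2: a few Taylor terms, where
     the real routine evaluates the tuned __expm1_poly coefficients.  */
  double p = 0.5 + f * (1.0 / 6 + f * (1.0 / 24 + f * (1.0 / 120)));
  p = fma (f * f, p, f); /* expm1(f) ~= f + f^2 * P(f).  */

  /* t = 2^i built directly in the exponent field, then
     expm1(x) = t * (p + 1) - 1 = p * t + (t - 1).  */
  uint64_t tbits = ((uint64_t) i << 52) + 0x3ff0000000000000ULL;
  double t;
  memcpy (&t, &tbits, sizeof t);
  return fma (p, t, t - 1);
}

int
main (void)
{
  for (double x = -4; x <= 4; x += 1.25)
    printf ("x=% .2f  sketch=% .17g  libm=% .17g\n",
            x, expm1_sketch (x), expm1 (x));
  return 0;
}

Compile with e.g. cc sketch.c -lm. On small inputs it agrees with libm's expm1 to roughly five significant digits near the edges of the reduced range — as much as the truncated stand-in polynomial can deliver.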
diff --git a/pl/math/v_math.h b/pl/math/v_math.h
index a3f9c57..d4597c8 100644
--- a/pl/math/v_math.h
+++ b/pl/math/v_math.h
@@ -400,6 +400,12 @@ v_to_f64_u64 (v_u64_t x)
{
return x;
}
+
+static inline v_s64_t
+v_to_s64_f64 (v_f64_t x)
+{
+ return x;
+}
/* reinterpret as type1 from type2. */
static inline v_u64_t
v_as_u64_f64 (v_f64_t x)
@@ -761,6 +767,11 @@ v_to_f64_u64 (v_u64_t x)
{
return (v_f64_t){x[0], x[1]};
}
+static inline v_s64_t
+v_to_s64_f64 (v_f64_t x)
+{
+ return vcvtq_s64_f64 (x);
+}
/* reinterpret as type1 from type2. */
static inline v_u64_t
v_as_u64_f64 (v_f64_t x)
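
Both new v_to_s64_f64 definitions convert with truncation toward zero: the scalar emulation via a plain C conversion, the AdvSIMD build via vcvtq_s64_f64 (FCVTZS). In this use the operand j has already been rounded to an exact integer by the Shift trick, so the conversion is exact and the two builds agree.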
diff --git a/pl/math/vn_expm1_2u5.c b/pl/math/vn_expm1_2u5.c
new file mode 100644
index 0000000..fc88b06
--- /dev/null
+++ b/pl/math/vn_expm1_2u5.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_expm1.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS strong_alias (__vn_expm1, _ZGVnN2v_expm1)
+#include "v_expm1_2u5.c"
+#endif
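
For reference, the alias name follows the AArch64 vector function ABI mangling: _ZGV, then n for AdvSIMD, N for an unmasked variant, 2 for the two-lane vector length, v for a single vector argument, followed by the scalar name expm1.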