aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJoe Ramsay <Joe.Ramsay@arm.com>2022-09-02 10:42:05 +0100
committerJoe Ramsay <joe.ramsay@arm.com>2022-09-02 10:42:05 +0100
commit6e0ada9b7edd52693fe8a684863796b7a309d6fe (patch)
treef66dfcafeba33e6ded6830d486112fef8ddb70fe
parent65e0232e36b784b09a72caf0df6845eec9e95057 (diff)
downloadarm-optimized-routines-6e0ada9b7edd52693fe8a684863796b7a309d6fe.tar.gz
pl/math: Add vector/SVE log10f
New routine shares polynomial with Neon variant and is accurate to 3.5 ULP.
-rw-r--r--pl/math/include/mathlib.h2
-rw-r--r--pl/math/math_config.h3
-rw-r--r--pl/math/sv_log10f_3u5.c78
-rw-r--r--pl/math/sv_math.h12
-rw-r--r--pl/math/test/mathbench_funcs.h2
-rwxr-xr-xpl/math/test/runulp.sh12
-rw-r--r--pl/math/test/ulp_funcs.h2
-rw-r--r--pl/math/test/ulp_wrappers.h6
-rw-r--r--pl/math/v_log10f_3u5.c27
-rw-r--r--pl/math/v_log10f_data.c13
10 files changed, 137 insertions, 20 deletions
diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h
index 0f004c0..4f3eba3 100644
--- a/pl/math/include/mathlib.h
+++ b/pl/math/include/mathlib.h
@@ -111,6 +111,7 @@ svfloat64_t __sv_atan_x (svfloat64_t, svbool_t);
svfloat64_t __sv_atan2_x (svfloat64_t, svfloat64_t, svbool_t);
svfloat32_t __sv_cosf_x (svfloat32_t, svbool_t);
svfloat64_t __sv_cos_x (svfloat64_t, svbool_t);
+svfloat32_t __sv_log10f_x (svfloat32_t, svbool_t);
svfloat64_t __sv_log10_x (svfloat64_t, svbool_t);
svfloat32_t __sv_sinf_x (svfloat32_t, svbool_t);
svfloat64_t __sv_sin_x (svfloat64_t, svbool_t);
@@ -121,6 +122,7 @@ svfloat64_t _ZGVsMxv_atan (svfloat64_t, svbool_t);
svfloat64_t _ZGVsMxvv_atan2 (svfloat64_t, svfloat64_t, svbool_t);
svfloat32_t _ZGVsMxv_cosf (svfloat32_t, svbool_t);
svfloat64_t _ZGVsMxv_cos (svfloat64_t, svbool_t);
+svfloat32_t _ZGVsMxv_log10f (svfloat32_t, svbool_t);
svfloat64_t _ZGVsMxv_log10 (svfloat64_t, svbool_t);
svfloat32_t _ZGVsMxv_sinf (svfloat32_t, svbool_t);
svfloat64_t _ZGVsMxv_sin (svfloat64_t, svbool_t);
diff --git a/pl/math/math_config.h b/pl/math/math_config.h
index d70a38c..10bbd9b 100644
--- a/pl/math/math_config.h
+++ b/pl/math/math_config.h
@@ -508,4 +508,7 @@ extern const struct v_log10_data
double invln10, log10_2;
} __v_log10_data HIDDEN;
+#define V_LOG10F_POLY_ORDER 9
+extern const float __v_log10f_poly[V_LOG10F_POLY_ORDER - 1] HIDDEN;
+
#endif
diff --git a/pl/math/sv_log10f_3u5.c b/pl/math/sv_log10f_3u5.c
new file mode 100644
index 0000000..fe8ecfd
--- /dev/null
+++ b/pl/math/sv_log10f_3u5.c
@@ -0,0 +1,78 @@
+/*
+ * Single-precision SVE log10 function.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+
+#if SV_SUPPORTED
+
+#define SpecialCaseMin 0x00800000
+#define SpecialCaseMax 0x7f800000
+#define Offset 0x3f2aaaab /* 0.666667. */
+#define Mask 0x007fffff
+#define Ln2 0x1.62e43p-1f /* 0x3f317218. */
+#define InvLn10 0x1.bcb7b2p-2f
+
+#define P(i) __v_log10f_poly[i]
+
+static NOINLINE sv_f32_t
+special_case (sv_f32_t x, sv_f32_t y, svbool_t special)
+{
+ return sv_call_f32 (log10f, x, y, special);
+}
+
+/* Optimised implementation of SVE log10f using the same algorithm and
+ polynomial as v_log10f. Maximum error is 3.31ulps:
+ __sv_log10f(0x1.555c16p+0) got 0x1.ffe2fap-4
+ want 0x1.ffe2f4p-4. */
+sv_f32_t
+__sv_log10f_x (sv_f32_t x, const svbool_t pg)
+{
+ sv_u32_t ix = sv_as_u32_f32 (x);
+ svbool_t special_cases
+ = svcmpge_n_u32 (pg, svsub_n_u32_x (pg, ix, SpecialCaseMin),
+ SpecialCaseMax - SpecialCaseMin);
+
+ /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
+ ix = svsub_n_u32_x (pg, ix, Offset);
+ sv_f32_t n = sv_to_f32_s32_x (pg, svasr_n_s32_x (pg, sv_as_s32_u32 (ix),
+ 23)); /* signextend. */
+ ix = svand_n_u32_x (pg, ix, Mask);
+ ix = svadd_n_u32_x (pg, ix, Offset);
+ sv_f32_t r = svsub_n_f32_x (pg, sv_as_f32_u32 (ix), 1.0f);
+
+ /* y = log10(1+r) + n*log10(2)
+ log10(1+r) ~ r * InvLn(10) + P(r)
+ where P(r) is a polynomial. Use order 9 for log10(1+x), i.e. order 8 for
+ log10(1+x)/x, with x in [-1/3, 1/3] (offset=2/3)
+
+ P(r) = r2 * (Q01 + r2 * (Q23 + r2 * (Q45 + r2 * Q67)))
+ and Qij = Pi + r * Pj. */
+ sv_f32_t q12 = sv_fma_n_f32_x (pg, P (1), r, sv_f32 (P (0)));
+ sv_f32_t q34 = sv_fma_n_f32_x (pg, P (3), r, sv_f32 (P (2)));
+ sv_f32_t q56 = sv_fma_n_f32_x (pg, P (5), r, sv_f32 (P (4)));
+ sv_f32_t q78 = sv_fma_n_f32_x (pg, P (7), r, sv_f32 (P (6)));
+
+ sv_f32_t r2 = svmul_f32_x (pg, r, r);
+ sv_f32_t y = sv_fma_f32_x (pg, q78, r2, q56);
+ y = sv_fma_f32_x (pg, y, r2, q34);
+ y = sv_fma_f32_x (pg, y, r2, q12);
+
+ /* Using p = Log10(2)*n + r*InvLn(10) is slightly faster but less
+ accurate. */
+ sv_f32_t p = sv_fma_n_f32_x (pg, Ln2, n, r);
+ y = sv_fma_f32_x (pg, y, r2, svmul_n_f32_x (pg, p, InvLn10));
+
+ if (unlikely (svptest_any (pg, special_cases)))
+ {
+ return special_case (x, y, special_cases);
+ }
+ return y;
+}
+
+strong_alias (__sv_log10f_x, _ZGVsMxv_log10f)
+
+#endif
diff --git a/pl/math/sv_math.h b/pl/math/sv_math.h
index 4164faa..3b318f1 100644
--- a/pl/math/sv_math.h
+++ b/pl/math/sv_math.h
@@ -183,6 +183,18 @@ sv_as_f32_u32 (sv_u32_t x)
return svreinterpret_f32_u32 (x);
}
+static inline sv_s32_t
+sv_as_s32_u32 (sv_u32_t x)
+{
+ return svreinterpret_s32_u32 (x);
+}
+
+static inline sv_f32_t
+sv_to_f32_s32_x (svbool_t pg, sv_s32_t s)
+{
+ return svcvt_f32_x (pg, s);
+}
+
static inline sv_f32_t
sv_call_f32 (f32_t (*f) (f32_t), sv_f32_t x, sv_f32_t y, svbool_t cmp)
{
diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h
index f89ee08..076340e 100644
--- a/pl/math/test/mathbench_funcs.h
+++ b/pl/math/test/mathbench_funcs.h
@@ -117,6 +117,8 @@ SVF (_ZGVsMxv_cosf, -3.1, 3.1)
SVF (__sv_sinf_x, -3.1, 3.1)
SVF (_ZGVsMxv_sinf, -3.1, 3.1)
+SVF (__sv_log10f_x, 0.01, 11.1)
+SVF (_ZGVsMxv_log10f, 0.01, 11.1)
SVD (__sv_log10_x, 0.01, 11.1)
SVD (_ZGVsMxv_log10, 0.01, 11.1)
diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh
index 993b7a3..3475adb 100755
--- a/pl/math/test/runulp.sh
+++ b/pl/math/test/runulp.sh
@@ -323,6 +323,15 @@ range_sve_log10='
100 inf 50000
'
+range_sve_log10f='
+ -0.0 -0x1p126 100
+ 0x1p-149 0x1p-126 4000
+ 0x1p-126 0x1p-23 50000
+ 0x1p-23 1.0 50000
+ 1.0 100 50000
+ 100 inf 50000
+'
+
# error limits
L_erfc=3.7
L_erfcf=1.0
@@ -349,6 +358,7 @@ L_sve_atan=2.5
L_sve_atan2f=3.0
L_sve_atan2=2.0
L_sve_log10=2.5
+L_sve_log10f=3.5
while read G F R
do
@@ -434,6 +444,8 @@ sve_atan2f __sv_atan2f $runsv
sve_atan2f _ZGVsMxvv_atan2f $runsv
sve_atanf __sv_atanf $runsv
sve_atanf _ZGVsMxv_atanf $runsv
+sve_log10f __sv_log10f $runsv
+sve_log10f _ZGVsMxv_log10f $runsv
sve_cos __sv_cos $runsv
sve_cos _ZGVsMxv_cos $runsv
diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h
index 695909f..4b12a1a 100644
--- a/pl/math/test/ulp_funcs.h
+++ b/pl/math/test/ulp_funcs.h
@@ -90,6 +90,8 @@ SVF1 (cos)
ZSVF1 (cos)
SVD1 (cos)
ZSVD1 (cos)
+SVF1 (log10)
+ZSVF1 (log10)
SVD1 (log10)
ZSVD1 (log10)
SVF1 (sin)
diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h
index 886dfde..4f11aec 100644
--- a/pl/math/test/ulp_wrappers.h
+++ b/pl/math/test/ulp_wrappers.h
@@ -83,6 +83,12 @@ static float sv_cosf(float x) {
static float Z_sv_cosf(float x) {
return svretf(_ZGVsMxv_cosf(svargf(x), svptrue_b32()));
}
+static float sv_log10f(float x) {
+ return svretf(__sv_log10f_x(svargf(x), svptrue_b32()));
+}
+static float Z_sv_log10f(float x) {
+ return svretf(_ZGVsMxv_log10f(svargf(x), svptrue_b32()));
+}
static float sv_sinf(float x) {
return svretf(__sv_sinf_x(svargf(x), svptrue_b32()));
}
diff --git a/pl/math/v_log10f_3u5.c b/pl/math/v_log10f_3u5.c
index c956d0c..4dede3d 100644
--- a/pl/math/v_log10f_3u5.c
+++ b/pl/math/v_log10f_3u5.c
@@ -9,22 +9,9 @@
#include "v_math.h"
#if V_SUPPORTED
-static const float Poly[] = {
- /* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in
- [-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. */
- -0x1.bcb79cp-3f, 0x1.2879c8p-3f, -0x1.bcd472p-4f, 0x1.6408f8p-4f,
- -0x1.246f8p-4f, 0x1.f0e514p-5f, -0x1.0fc92cp-4f, 0x1.f5f76ap-5f};
-#define P8 v_f32 (Poly[7])
-#define P7 v_f32 (Poly[6])
-#define P6 v_f32 (Poly[5])
-#define P5 v_f32 (Poly[4])
-#define P4 v_f32 (Poly[3])
-#define P3 v_f32 (Poly[2])
-#define P2 v_f32 (Poly[1])
-#define P1 v_f32 (Poly[0])
+#define P(i) v_f32 (__v_log10f_poly[i])
#define Ln2 v_f32 (0x1.62e43p-1f) /* 0x3f317218. */
-#define Log10_2 v_f32 (0x1.344136p-2f)
#define InvLn10 v_f32 (0x1.bcb7b2p-2f)
#define Min v_u32 (0x00800000)
#define Max v_u32 (0x7f800000)
@@ -64,12 +51,12 @@ v_f32_t V_NAME (log10f) (v_f32_t x)
/* y = log10(1+r) + n*log10(2). */
r2 = r * r;
- /* (n*ln2 + r)*InvLn10 + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 +
- r2*(P7+r*P8))). */
- o = v_fma_f32 (P8, r, P7);
- p = v_fma_f32 (P6, r, P5);
- q = v_fma_f32 (P4, r, P3);
- y = v_fma_f32 (P2, r, P1);
+ /* (n*ln2 + r)*InvLn10 + r2*(P0 + r*P1 + r2*(P2 + r*P3 + r2*(P4 + r*P5 +
+ r2*(P6+r*P7))). */
+ o = v_fma_f32 (P (7), r, P (6));
+ p = v_fma_f32 (P (5), r, P (4));
+ q = v_fma_f32 (P (3), r, P (2));
+ y = v_fma_f32 (P (1), r, P (0));
p = v_fma_f32 (o, r2, p);
q = v_fma_f32 (p, r2, q);
y = v_fma_f32 (q, r2, y);
diff --git a/pl/math/v_log10f_data.c b/pl/math/v_log10f_data.c
new file mode 100644
index 0000000..c95f38b
--- /dev/null
+++ b/pl/math/v_log10f_data.c
@@ -0,0 +1,13 @@
+/*
+ * Coefficients for single-precision vector log10 function.
+ *
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "math_config.h"
+
+const float __v_log10f_poly[] = {
+ /* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in
+ [-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. */
+ -0x1.bcb79cp-3f, 0x1.2879c8p-3f, -0x1.bcd472p-4f, 0x1.6408f8p-4f,
+ -0x1.246f8p-4f, 0x1.f0e514p-5f, -0x1.0fc92cp-4f, 0x1.f5f76ap-5f};