pl/math: Add vector/SVE log10f

New routine shares polynomial with Neon variant and is accurate to 3.5 ULP.
author: Joe Ramsay <Joe.Ramsay@arm.com> 2022-09-02 10:42:05 +0100
committer: Joe Ramsay <joe.ramsay@arm.com> 2022-09-02 10:42:05 +0100
commit: 6e0ada9b7edd52693fe8a684863796b7a309d6fe (patch)
tree: f66dfcafeba33e6ded6830d486112fef8ddb70fe
parent: 65e0232e36b784b09a72caf0df6845eec9e95057 (diff)
download: arm-optimized-routines-6e0ada9b7edd52693fe8a684863796b7a309d6fe.tar.gz
10 files changed, 137 insertions, 20 deletions
diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h
index 0f004c0..4f3eba3 100644
--- a/pl/math/include/mathlib.h
+++ b/pl/math/include/mathlib.h
@@ -111,6 +111,7 @@ svfloat64_t __sv_atan_x (svfloat64_t, svbool_t);
 svfloat64_t __sv_atan2_x (svfloat64_t, svfloat64_t, svbool_t);
 svfloat32_t __sv_cosf_x (svfloat32_t, svbool_t);
 svfloat64_t __sv_cos_x (svfloat64_t, svbool_t);
+svfloat32_t __sv_log10f_x (svfloat32_t, svbool_t);
 svfloat64_t __sv_log10_x (svfloat64_t, svbool_t);
 svfloat32_t __sv_sinf_x (svfloat32_t, svbool_t);
 svfloat64_t __sv_sin_x (svfloat64_t, svbool_t);
@@ -121,6 +122,7 @@ svfloat64_t _ZGVsMxv_atan (svfloat64_t, svbool_t);
 svfloat64_t _ZGVsMxvv_atan2 (svfloat64_t, svfloat64_t, svbool_t);
 svfloat32_t _ZGVsMxv_cosf (svfloat32_t, svbool_t);
 svfloat64_t _ZGVsMxv_cos (svfloat64_t, svbool_t);
+svfloat32_t _ZGVsMxv_log10f (svfloat32_t, svbool_t);
 svfloat64_t _ZGVsMxv_log10 (svfloat64_t, svbool_t);
 svfloat32_t _ZGVsMxv_sinf (svfloat32_t, svbool_t);
 svfloat64_t _ZGVsMxv_sin (svfloat64_t, svbool_t);
diff --git a/pl/math/math_config.h b/pl/math/math_config.h
index d70a38c..10bbd9b 100644
--- a/pl/math/math_config.h
+++ b/pl/math/math_config.h
@@ -508,4 +508,7 @@ extern const struct v_log10_data
   double invln10, log10_2;
 } __v_log10_data HIDDEN;
 
+#define V_LOG10F_POLY_ORDER 9
+extern const float __v_log10f_poly[V_LOG10F_POLY_ORDER - 1] HIDDEN;
+
 #endif
diff --git a/pl/math/sv_log10f_3u5.c b/pl/math/sv_log10f_3u5.c
new file mode 100644
index 0000000..fe8ecfd
--- /dev/null
+++ b/pl/math/sv_log10f_3u5.c
@@ -0,0 +1,78 @@
+/*
+ * Single-precision SVE log10 function.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+
+#if SV_SUPPORTED
+
+#define SpecialCaseMin 0x00800000
+#define SpecialCaseMax 0x7f800000
+#define Offset 0x3f2aaaab /* 0.666667.  */
+#define Mask 0x007fffff
+#define Ln2 0x1.62e43p-1f /* 0x3f317218.  */
+#define InvLn10 0x1.bcb7b2p-2f
+
+#define P(i) __v_log10f_poly[i]
+
+static NOINLINE sv_f32_t
+special_case (sv_f32_t x, sv_f32_t y, svbool_t special)
+{
+  return sv_call_f32 (log10f, x, y, special);
+}
+
+/* Optimised implementation of SVE log10f using the same algorithm and
+   polynomial as v_log10f. Maximum error is 3.31ulps:
+   __sv_log10f(0x1.555c16p+0) got 0x1.ffe2fap-4
+			     want 0x1.ffe2f4p-4.  */
+sv_f32_t
+__sv_log10f_x (sv_f32_t x, const svbool_t pg)
+{
+  sv_u32_t ix = sv_as_u32_f32 (x);
+  svbool_t special_cases
+    = svcmpge_n_u32 (pg, svsub_n_u32_x (pg, ix, SpecialCaseMin),
+		     SpecialCaseMax - SpecialCaseMin);
+
+  /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3.  */
+  ix = svsub_n_u32_x (pg, ix, Offset);
+  sv_f32_t n = sv_to_f32_s32_x (pg, svasr_n_s32_x (pg, sv_as_s32_u32 (ix),
+						   23)); /* signextend.  */
+  ix = svand_n_u32_x (pg, ix, Mask);
+  ix = svadd_n_u32_x (pg, ix, Offset);
+  sv_f32_t r = svsub_n_f32_x (pg, sv_as_f32_u32 (ix), 1.0f);
+
+  /* y = log10(1+r) + n*log10(2)
+     log10(1+r) ~ r * InvLn(10) + P(r)
+     where P(r) is a polynomial. Use order 9 for log10(1+x), i.e. order 8 for
+     log10(1+x)/x, with x in [-1/3, 1/3] (offset=2/3)
+
+     P(r) = r2 * (Q01 + r2 * (Q23 + r2 * (Q45 + r2 * Q67)))
+     and Qij  = Pi + r * Pj.  */
+  sv_f32_t q12 = sv_fma_n_f32_x (pg, P (1), r, sv_f32 (P (0)));
+  sv_f32_t q34 = sv_fma_n_f32_x (pg, P (3), r, sv_f32 (P (2)));
+  sv_f32_t q56 = sv_fma_n_f32_x (pg, P (5), r, sv_f32 (P (4)));
+  sv_f32_t q78 = sv_fma_n_f32_x (pg, P (7), r, sv_f32 (P (6)));
+
+  sv_f32_t r2 = svmul_f32_x (pg, r, r);
+  sv_f32_t y = sv_fma_f32_x (pg, q78, r2, q56);
+  y = sv_fma_f32_x (pg, y, r2, q34);
+  y = sv_fma_f32_x (pg, y, r2, q12);
+
+  /* Using p = Log10(2)*n + r*InvLn(10) is slightly faster but less
+     accurate.  */
+  sv_f32_t p = sv_fma_n_f32_x (pg, Ln2, n, r);
+  y = sv_fma_f32_x (pg, y, r2, svmul_n_f32_x (pg, p, InvLn10));
+
+  if (unlikely (svptest_any (pg, special_cases)))
+    {
+      return special_case (x, y, special_cases);
+    }
+  return y;
+}
+
+strong_alias (__sv_log10f_x, _ZGVsMxv_log10f)
+
+#endif
diff --git a/pl/math/sv_math.h b/pl/math/sv_math.h
index 4164faa..3b318f1 100644
--- a/pl/math/sv_math.h
+++ b/pl/math/sv_math.h
@@ -183,6 +183,18 @@ sv_as_f32_u32 (sv_u32_t x)
   return svreinterpret_f32_u32 (x);
 }
 
+static inline sv_s32_t
+sv_as_s32_u32 (sv_u32_t x)
+{
+  return svreinterpret_s32_u32 (x);
+}
+
+static inline sv_f32_t
+sv_to_f32_s32_x (svbool_t pg, sv_s32_t s)
+{
+  return svcvt_f32_x (pg, s);
+}
+
 static inline sv_f32_t
 sv_call_f32 (f32_t (*f) (f32_t), sv_f32_t x, sv_f32_t y, svbool_t cmp)
 {
diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h
index f89ee08..076340e 100644
--- a/pl/math/test/mathbench_funcs.h
+++ b/pl/math/test/mathbench_funcs.h
@@ -117,6 +117,8 @@ SVF (_ZGVsMxv_cosf, -3.1, 3.1)
 SVF (__sv_sinf_x, -3.1, 3.1)
 SVF (_ZGVsMxv_sinf, -3.1, 3.1)
 
+SVF (__sv_log10f_x, 0.01, 11.1)
+SVF (_ZGVsMxv_log10f, 0.01, 11.1)
 SVD (__sv_log10_x, 0.01, 11.1)
 SVD (_ZGVsMxv_log10, 0.01, 11.1)
 
diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh
index 993b7a3..3475adb 100755
--- a/pl/math/test/runulp.sh
+++ b/pl/math/test/runulp.sh
@@ -323,6 +323,15 @@ range_sve_log10='
       100       inf  50000
 '
 
+range_sve_log10f='
+     -0.0  -0x1p126  100
+ 0x1p-149  0x1p-126  4000
+ 0x1p-126   0x1p-23  50000
+  0x1p-23       1.0  50000
+      1.0       100  50000
+      100       inf  50000
+'
+
 # error limits
 L_erfc=3.7
 L_erfcf=1.0
@@ -349,6 +358,7 @@ L_sve_atan=2.5
 L_sve_atan2f=3.0
 L_sve_atan2=2.0
 L_sve_log10=2.5
+L_sve_log10f=3.5
 
 while read G F R
 do
@@ -434,6 +444,8 @@ sve_atan2f   __sv_atan2f       $runsv
 sve_atan2f   _ZGVsMxvv_atan2f  $runsv
 sve_atanf    __sv_atanf        $runsv
 sve_atanf    _ZGVsMxv_atanf    $runsv
+sve_log10f   __sv_log10f       $runsv
+sve_log10f   _ZGVsMxv_log10f   $runsv
 
 sve_cos    __sv_cos        $runsv
 sve_cos    _ZGVsMxv_cos    $runsv
diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h
index 695909f..4b12a1a 100644
--- a/pl/math/test/ulp_funcs.h
+++ b/pl/math/test/ulp_funcs.h
@@ -90,6 +90,8 @@ SVF1 (cos)
 ZSVF1 (cos)
 SVD1 (cos)
 ZSVD1 (cos)
+SVF1 (log10)
+ZSVF1 (log10)
 SVD1 (log10)
 ZSVD1 (log10)
 SVF1 (sin)
diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h
index 886dfde..4f11aec 100644
--- a/pl/math/test/ulp_wrappers.h
+++ b/pl/math/test/ulp_wrappers.h
@@ -83,6 +83,12 @@ static float sv_cosf(float x) {
 static float Z_sv_cosf(float x) {
   return svretf(_ZGVsMxv_cosf(svargf(x), svptrue_b32()));
 }
+static float sv_log10f(float x) {
+  return svretf(__sv_log10f_x(svargf(x), svptrue_b32()));
+}
+static float Z_sv_log10f(float x) {
+  return svretf(_ZGVsMxv_log10f(svargf(x), svptrue_b32()));
+}
 static float sv_sinf(float x) {
   return svretf(__sv_sinf_x(svargf(x), svptrue_b32()));
 }
diff --git a/pl/math/v_log10f_3u5.c b/pl/math/v_log10f_3u5.c
index c956d0c..4dede3d 100644
--- a/pl/math/v_log10f_3u5.c
+++ b/pl/math/v_log10f_3u5.c
@@ -9,22 +9,9 @@
 #include "v_math.h"
 #if V_SUPPORTED
 
-static const float Poly[] = {
-  /* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in
-     [-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25.  */
-  -0x1.bcb79cp-3f, 0x1.2879c8p-3f, -0x1.bcd472p-4f, 0x1.6408f8p-4f,
-  -0x1.246f8p-4f,  0x1.f0e514p-5f, -0x1.0fc92cp-4f, 0x1.f5f76ap-5f};
-#define P8 v_f32 (Poly[7])
-#define P7 v_f32 (Poly[6])
-#define P6 v_f32 (Poly[5])
-#define P5 v_f32 (Poly[4])
-#define P4 v_f32 (Poly[3])
-#define P3 v_f32 (Poly[2])
-#define P2 v_f32 (Poly[1])
-#define P1 v_f32 (Poly[0])
+#define P(i) v_f32 (__v_log10f_poly[i])
 
 #define Ln2 v_f32 (0x1.62e43p-1f) /* 0x3f317218.  */
-#define Log10_2 v_f32 (0x1.344136p-2f)
 #define InvLn10 v_f32 (0x1.bcb7b2p-2f)
 #define Min v_u32 (0x00800000)
 #define Max v_u32 (0x7f800000)
@@ -64,12 +51,12 @@ v_f32_t V_NAME (log10f) (v_f32_t x)
 
   /* y = log10(1+r) + n*log10(2).  */
   r2 = r * r;
-  /* (n*ln2 + r)*InvLn10 + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 +
-     r2*(P7+r*P8))).  */
-  o = v_fma_f32 (P8, r, P7);
-  p = v_fma_f32 (P6, r, P5);
-  q = v_fma_f32 (P4, r, P3);
-  y = v_fma_f32 (P2, r, P1);
+  /* (n*ln2 + r)*InvLn10 + r2*(P0 + r*P1 + r2*(P2 + r*P3 + r2*(P4 + r*P5 +
+     r2*(P6+r*P7))).  */
+  o = v_fma_f32 (P (7), r, P (6));
+  p = v_fma_f32 (P (5), r, P (4));
+  q = v_fma_f32 (P (3), r, P (2));
+  y = v_fma_f32 (P (1), r, P (0));
   p = v_fma_f32 (o, r2, p);
   q = v_fma_f32 (p, r2, q);
   y = v_fma_f32 (q, r2, y);
diff --git a/pl/math/v_log10f_data.c b/pl/math/v_log10f_data.c
new file mode 100644
index 0000000..c95f38b
--- /dev/null
+++ b/pl/math/v_log10f_data.c
@@ -0,0 +1,13 @@
+/*
+ * Coefficients for single-precision vector log10 function.
+ *
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "math_config.h"
+
+const float __v_log10f_poly[] = {
+  /* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in
+     [-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25.  */
+  -0x1.bcb79cp-3f, 0x1.2879c8p-3f, -0x1.bcd472p-4f, 0x1.6408f8p-4f,
+  -0x1.246f8p-4f,  0x1.f0e514p-5f, -0x1.0fc92cp-4f, 0x1.f5f76ap-5f};
author	Joe Ramsay <Joe.Ramsay@arm.com>	2022-09-02 10:42:05 +0100
committer	Joe Ramsay <joe.ramsay@arm.com>	2022-09-02 10:42:05 +0100
commit	6e0ada9b7edd52693fe8a684863796b7a309d6fe (patch)
tree	f66dfcafeba33e6ded6830d486112fef8ddb70fe
parent	65e0232e36b784b09a72caf0df6845eec9e95057 (diff)
download	arm-optimized-routines-6e0ada9b7edd52693fe8a684863796b7a309d6fe.tar.gz