diff options
author | Joe Ramsay <Joe.Ramsay@arm.com> | 2022-09-02 10:42:05 +0100 |
---|---|---|
committer | Joe Ramsay <joe.ramsay@arm.com> | 2022-09-02 10:42:05 +0100 |
commit | 6e0ada9b7edd52693fe8a684863796b7a309d6fe (patch) | |
tree | f66dfcafeba33e6ded6830d486112fef8ddb70fe | |
parent | 65e0232e36b784b09a72caf0df6845eec9e95057 (diff) | |
download | arm-optimized-routines-6e0ada9b7edd52693fe8a684863796b7a309d6fe.tar.gz |
pl/math: Add vector/SVE log10f
New routine shares polynomial with Neon variant and is accurate to 3.5 ULP.
-rw-r--r-- | pl/math/include/mathlib.h | 2 | ||||
-rw-r--r-- | pl/math/math_config.h | 3 | ||||
-rw-r--r-- | pl/math/sv_log10f_3u5.c | 78 | ||||
-rw-r--r-- | pl/math/sv_math.h | 12 | ||||
-rw-r--r-- | pl/math/test/mathbench_funcs.h | 2 | ||||
-rwxr-xr-x | pl/math/test/runulp.sh | 12 | ||||
-rw-r--r-- | pl/math/test/ulp_funcs.h | 2 | ||||
-rw-r--r-- | pl/math/test/ulp_wrappers.h | 6 | ||||
-rw-r--r-- | pl/math/v_log10f_3u5.c | 27 | ||||
-rw-r--r-- | pl/math/v_log10f_data.c | 13 |
10 files changed, 137 insertions, 20 deletions
diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 0f004c0..4f3eba3 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -111,6 +111,7 @@ svfloat64_t __sv_atan_x (svfloat64_t, svbool_t); svfloat64_t __sv_atan2_x (svfloat64_t, svfloat64_t, svbool_t); svfloat32_t __sv_cosf_x (svfloat32_t, svbool_t); svfloat64_t __sv_cos_x (svfloat64_t, svbool_t); +svfloat32_t __sv_log10f_x (svfloat32_t, svbool_t); svfloat64_t __sv_log10_x (svfloat64_t, svbool_t); svfloat32_t __sv_sinf_x (svfloat32_t, svbool_t); svfloat64_t __sv_sin_x (svfloat64_t, svbool_t); @@ -121,6 +122,7 @@ svfloat64_t _ZGVsMxv_atan (svfloat64_t, svbool_t); svfloat64_t _ZGVsMxvv_atan2 (svfloat64_t, svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_cosf (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_cos (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_log10f (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_log10 (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_sinf (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_sin (svfloat64_t, svbool_t); diff --git a/pl/math/math_config.h b/pl/math/math_config.h index d70a38c..10bbd9b 100644 --- a/pl/math/math_config.h +++ b/pl/math/math_config.h @@ -508,4 +508,7 @@ extern const struct v_log10_data double invln10, log10_2; } __v_log10_data HIDDEN; +#define V_LOG10F_POLY_ORDER 9 +extern const float __v_log10f_poly[V_LOG10F_POLY_ORDER - 1] HIDDEN; + #endif diff --git a/pl/math/sv_log10f_3u5.c b/pl/math/sv_log10f_3u5.c new file mode 100644 index 0000000..fe8ecfd --- /dev/null +++ b/pl/math/sv_log10f_3u5.c @@ -0,0 +1,78 @@ +/* + * Single-precision SVE log10 function. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" + +#if SV_SUPPORTED + +#define SpecialCaseMin 0x00800000 +#define SpecialCaseMax 0x7f800000 +#define Offset 0x3f2aaaab /* 0.666667. */ +#define Mask 0x007fffff +#define Ln2 0x1.62e43p-1f /* 0x3f317218. */ +#define InvLn10 0x1.bcb7b2p-2f + +#define P(i) __v_log10f_poly[i] + +static NOINLINE sv_f32_t +special_case (sv_f32_t x, sv_f32_t y, svbool_t special) +{ + return sv_call_f32 (log10f, x, y, special); +} + +/* Optimised implementation of SVE log10f using the same algorithm and + polynomial as v_log10f. Maximum error is 3.31ulps: + __sv_log10f(0x1.555c16p+0) got 0x1.ffe2fap-4 + want 0x1.ffe2f4p-4. */ +sv_f32_t +__sv_log10f_x (sv_f32_t x, const svbool_t pg) +{ + sv_u32_t ix = sv_as_u32_f32 (x); + svbool_t special_cases + = svcmpge_n_u32 (pg, svsub_n_u32_x (pg, ix, SpecialCaseMin), + SpecialCaseMax - SpecialCaseMin); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ + ix = svsub_n_u32_x (pg, ix, Offset); + sv_f32_t n = sv_to_f32_s32_x (pg, svasr_n_s32_x (pg, sv_as_s32_u32 (ix), + 23)); /* signextend. */ + ix = svand_n_u32_x (pg, ix, Mask); + ix = svadd_n_u32_x (pg, ix, Offset); + sv_f32_t r = svsub_n_f32_x (pg, sv_as_f32_u32 (ix), 1.0f); + + /* y = log10(1+r) + n*log10(2) + log10(1+r) ~ r * InvLn(10) + P(r) + where P(r) is a polynomial. Use order 9 for log10(1+x), i.e. order 8 for + log10(1+x)/x, with x in [-1/3, 1/3] (offset=2/3) + + P(r) = r2 * (Q01 + r2 * (Q23 + r2 * (Q45 + r2 * Q67))) + and Qij = Pi + r * Pj. */ + sv_f32_t q12 = sv_fma_n_f32_x (pg, P (1), r, sv_f32 (P (0))); + sv_f32_t q34 = sv_fma_n_f32_x (pg, P (3), r, sv_f32 (P (2))); + sv_f32_t q56 = sv_fma_n_f32_x (pg, P (5), r, sv_f32 (P (4))); + sv_f32_t q78 = sv_fma_n_f32_x (pg, P (7), r, sv_f32 (P (6))); + + sv_f32_t r2 = svmul_f32_x (pg, r, r); + sv_f32_t y = sv_fma_f32_x (pg, q78, r2, q56); + y = sv_fma_f32_x (pg, y, r2, q34); + y = sv_fma_f32_x (pg, y, r2, q12); + + /* Using p = Log10(2)*n + r*InvLn(10) is slightly faster but less + accurate. */ + sv_f32_t p = sv_fma_n_f32_x (pg, Ln2, n, r); + y = sv_fma_f32_x (pg, y, r2, svmul_n_f32_x (pg, p, InvLn10)); + + if (unlikely (svptest_any (pg, special_cases))) + { + return special_case (x, y, special_cases); + } + return y; +} + +strong_alias (__sv_log10f_x, _ZGVsMxv_log10f) + +#endif diff --git a/pl/math/sv_math.h b/pl/math/sv_math.h index 4164faa..3b318f1 100644 --- a/pl/math/sv_math.h +++ b/pl/math/sv_math.h @@ -183,6 +183,18 @@ sv_as_f32_u32 (sv_u32_t x) return svreinterpret_f32_u32 (x); } +static inline sv_s32_t +sv_as_s32_u32 (sv_u32_t x) +{ + return svreinterpret_s32_u32 (x); +} + +static inline sv_f32_t +sv_to_f32_s32_x (svbool_t pg, sv_s32_t s) +{ + return svcvt_f32_x (pg, s); +} + static inline sv_f32_t sv_call_f32 (f32_t (*f) (f32_t), sv_f32_t x, sv_f32_t y, svbool_t cmp) { diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index f89ee08..076340e 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -117,6 +117,8 @@ SVF (_ZGVsMxv_cosf, -3.1, 3.1) SVF (__sv_sinf_x, -3.1, 3.1) SVF (_ZGVsMxv_sinf, -3.1, 3.1) +SVF (__sv_log10f_x, 0.01, 11.1) +SVF (_ZGVsMxv_log10f, 0.01, 11.1) SVD (__sv_log10_x, 0.01, 11.1) SVD (_ZGVsMxv_log10, 0.01, 11.1) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 993b7a3..3475adb 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -323,6 +323,15 @@ range_sve_log10=' 100 inf 50000 ' +range_sve_log10f=' + -0.0 -0x1p126 100 + 0x1p-149 0x1p-126 4000 + 0x1p-126 0x1p-23 50000 + 0x1p-23 1.0 50000 + 1.0 100 50000 + 100 inf 50000 +' + # error limits L_erfc=3.7 L_erfcf=1.0 @@ -349,6 +358,7 @@ L_sve_atan=2.5 L_sve_atan2f=3.0 L_sve_atan2=2.0 L_sve_log10=2.5 +L_sve_log10f=3.5 while read G F R do @@ -434,6 +444,8 @@ sve_atan2f __sv_atan2f $runsv sve_atan2f _ZGVsMxvv_atan2f $runsv sve_atanf __sv_atanf $runsv sve_atanf _ZGVsMxv_atanf $runsv +sve_log10f __sv_log10f $runsv +sve_log10f _ZGVsMxv_log10f $runsv sve_cos __sv_cos $runsv sve_cos _ZGVsMxv_cos $runsv diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 695909f..4b12a1a 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -90,6 +90,8 @@ SVF1 (cos) ZSVF1 (cos) SVD1 (cos) ZSVD1 (cos) +SVF1 (log10) +ZSVF1 (log10) SVD1 (log10) ZSVD1 (log10) SVF1 (sin) diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index 886dfde..4f11aec 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -83,6 +83,12 @@ static float sv_cosf(float x) { static float Z_sv_cosf(float x) { return svretf(_ZGVsMxv_cosf(svargf(x), svptrue_b32())); } +static float sv_log10f(float x) { + return svretf(__sv_log10f_x(svargf(x), svptrue_b32())); +} +static float Z_sv_log10f(float x) { + return svretf(_ZGVsMxv_log10f(svargf(x), svptrue_b32())); +} static float sv_sinf(float x) { return svretf(__sv_sinf_x(svargf(x), svptrue_b32())); } diff --git a/pl/math/v_log10f_3u5.c b/pl/math/v_log10f_3u5.c index c956d0c..4dede3d 100644 --- a/pl/math/v_log10f_3u5.c +++ b/pl/math/v_log10f_3u5.c @@ -9,22 +9,9 @@ #include "v_math.h" #if V_SUPPORTED -static const float Poly[] = { - /* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in - [-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. */ - -0x1.bcb79cp-3f, 0x1.2879c8p-3f, -0x1.bcd472p-4f, 0x1.6408f8p-4f, - -0x1.246f8p-4f, 0x1.f0e514p-5f, -0x1.0fc92cp-4f, 0x1.f5f76ap-5f}; -#define P8 v_f32 (Poly[7]) -#define P7 v_f32 (Poly[6]) -#define P6 v_f32 (Poly[5]) -#define P5 v_f32 (Poly[4]) -#define P4 v_f32 (Poly[3]) -#define P3 v_f32 (Poly[2]) -#define P2 v_f32 (Poly[1]) -#define P1 v_f32 (Poly[0]) +#define P(i) v_f32 (__v_log10f_poly[i]) #define Ln2 v_f32 (0x1.62e43p-1f) /* 0x3f317218. */ -#define Log10_2 v_f32 (0x1.344136p-2f) #define InvLn10 v_f32 (0x1.bcb7b2p-2f) #define Min v_u32 (0x00800000) #define Max v_u32 (0x7f800000) @@ -64,12 +51,12 @@ v_f32_t V_NAME (log10f) (v_f32_t x) /* y = log10(1+r) + n*log10(2). */ r2 = r * r; - /* (n*ln2 + r)*InvLn10 + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + - r2*(P7+r*P8))). */ - o = v_fma_f32 (P8, r, P7); - p = v_fma_f32 (P6, r, P5); - q = v_fma_f32 (P4, r, P3); - y = v_fma_f32 (P2, r, P1); + /* (n*ln2 + r)*InvLn10 + r2*(P0 + r*P1 + r2*(P2 + r*P3 + r2*(P4 + r*P5 + + r2*(P6+r*P7))). */ + o = v_fma_f32 (P (7), r, P (6)); + p = v_fma_f32 (P (5), r, P (4)); + q = v_fma_f32 (P (3), r, P (2)); + y = v_fma_f32 (P (1), r, P (0)); p = v_fma_f32 (o, r2, p); q = v_fma_f32 (p, r2, q); y = v_fma_f32 (q, r2, y); diff --git a/pl/math/v_log10f_data.c b/pl/math/v_log10f_data.c new file mode 100644 index 0000000..c95f38b --- /dev/null +++ b/pl/math/v_log10f_data.c @@ -0,0 +1,13 @@ +/* + * Coefficients for single-precision vector log10 function. + * + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "math_config.h" + +const float __v_log10f_poly[] = { + /* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in + [-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. */ + -0x1.bcb79cp-3f, 0x1.2879c8p-3f, -0x1.bcd472p-4f, 0x1.6408f8p-4f, + -0x1.246f8p-4f, 0x1.f0e514p-5f, -0x1.0fc92cp-4f, 0x1.f5f76ap-5f}; |