diff options
author | Pierre Blanchard <pierre.blanchard@arm.com> | 2022-12-08 10:37:53 +0000 |
---|---|---|
committer | Pierre Blanchard <pierre.blanchard@arm.com> | 2022-12-08 10:38:02 +0000 |
commit | 61056f3ccca43a5e79edee0e359713614a1efd3c (patch) | |
tree | 10f179f2b27e1a649cd115c481495115307d877d | |
parent | a5e45e4e299f5fe6b51601694cc3cb066a20723a (diff) | |
download | arm-optimized-routines-61056f3ccca43a5e79edee0e359713614a1efd3c.tar.gz |
pl/math: Fix vector/SVE erf
Fixing a bug that resulted in potentially random results
in boring domain by saturating index at an appropriate value.
-rw-r--r-- | pl/math/sv_erf_2u5.c | 18 |
1 files changed, 8 insertions, 10 deletions
diff --git a/pl/math/sv_erf_2u5.c b/pl/math/sv_erf_2u5.c index a68b9e3..1265047 100644 --- a/pl/math/sv_erf_2u5.c +++ b/pl/math/sv_erf_2u5.c @@ -8,7 +8,7 @@ #include "sv_math.h" #if SV_SUPPORTED -#define Scale (8.0f) +#define Scale (8.0) #define AbsMask (0x7fffffffffffffff) static NOINLINE sv_f64_t @@ -33,13 +33,14 @@ __sv_erf_x (sv_f64_t x, const svbool_t pg) = svcmpge_n_u64 (pg, svsub_n_u64_x (pg, atop, 0x3e30), 0x7ff0 - 0x3e30); /* Get sign and absolute value. */ - sv_f64_t a = svabs_f64_x (pg, x); + sv_f64_t a = sv_as_f64_u64 (svand_n_u64_x (pg, ix, AbsMask)); sv_u64_t sign = svand_n_u64_x (pg, ix, ~AbsMask); /* i = trunc(Scale*x). */ - sv_u64_t i = svcvt_u64_f64_x (pg, svmul_n_f64_x (pg, a, Scale)); + sv_f64_t a_scale = svmul_n_f64_x (pg, a, Scale); /* Saturate index of intervals. */ - i = svmin_u64_x (pg, i, sv_u64 (V_ERF_NINTS)); + svbool_t a_lt_6 = svcmplt_n_u64 (pg, atop, 0x4018); + sv_u64_t i = svcvt_u64_f64_m (sv_u64 (V_ERF_NINTS - 1), a_lt_6, a_scale); /* Load polynomial coefficients. */ sv_f64_t P_0 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[0], i); @@ -56,8 +57,9 @@ __sv_erf_x (sv_f64_t x, const svbool_t pg) /* Get shift and scale. */ sv_f64_t shift = sv_lookup_f64_x (pg, __v_erf_data.shifts, i); - /* Transform polynomial variable. */ - sv_f64_t z = sv_fma_n_f64_x (pg, Scale, a, shift); + /* Transform polynomial variable. + Set z = 0 in the boring domain to avoid overflow. */ + sv_f64_t z = svmla_f64_m (a_lt_6, shift, sv_f64 (Scale), a); /* Evaluate polynomial P(z) using level-2 Estrin. */ sv_f64_t r1 = sv_fma_f64_x (pg, z, P_1, P_0); @@ -75,10 +77,6 @@ __sv_erf_x (sv_f64_t x, const svbool_t pg) sv_f64_t y = sv_fma_f64_x (pg, z4, r5, q2); y = sv_fma_f64_x (pg, z4, y, q1); - /* Saturate y. This works because using the last interval on the boring domain - produces y > 1. */ - y = svmin_n_f64_x (pg, y, 1.0); - /* y = erf(x) if x > 0, -erf(-x) otherwise. */ y = sv_as_f64_u64 (sveor_u64_x (pg, sv_as_u64_f64 (y), sign)); |