aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPierre Blanchard <pierre.blanchard@arm.com>2022-12-08 10:37:53 +0000
committerPierre Blanchard <pierre.blanchard@arm.com>2022-12-08 10:38:02 +0000
commit61056f3ccca43a5e79edee0e359713614a1efd3c (patch)
tree10f179f2b27e1a649cd115c481495115307d877d
parenta5e45e4e299f5fe6b51601694cc3cb066a20723a (diff)
downloadarm-optimized-routines-61056f3ccca43a5e79edee0e359713614a1efd3c.tar.gz
pl/math: Fix vector/SVE erf
Fixing a bug that resulted in potentially random results in boring domain by saturating index at an appropriate value.
-rw-r--r--pl/math/sv_erf_2u5.c18
1 files changed, 8 insertions, 10 deletions
diff --git a/pl/math/sv_erf_2u5.c b/pl/math/sv_erf_2u5.c
index a68b9e3..1265047 100644
--- a/pl/math/sv_erf_2u5.c
+++ b/pl/math/sv_erf_2u5.c
@@ -8,7 +8,7 @@
#include "sv_math.h"
#if SV_SUPPORTED
-#define Scale (8.0f)
+#define Scale (8.0)
#define AbsMask (0x7fffffffffffffff)
static NOINLINE sv_f64_t
@@ -33,13 +33,14 @@ __sv_erf_x (sv_f64_t x, const svbool_t pg)
= svcmpge_n_u64 (pg, svsub_n_u64_x (pg, atop, 0x3e30), 0x7ff0 - 0x3e30);
/* Get sign and absolute value. */
- sv_f64_t a = svabs_f64_x (pg, x);
+ sv_f64_t a = sv_as_f64_u64 (svand_n_u64_x (pg, ix, AbsMask));
sv_u64_t sign = svand_n_u64_x (pg, ix, ~AbsMask);
/* i = trunc(Scale*x). */
- sv_u64_t i = svcvt_u64_f64_x (pg, svmul_n_f64_x (pg, a, Scale));
+ sv_f64_t a_scale = svmul_n_f64_x (pg, a, Scale);
/* Saturate index of intervals. */
- i = svmin_u64_x (pg, i, sv_u64 (V_ERF_NINTS));
+ svbool_t a_lt_6 = svcmplt_n_u64 (pg, atop, 0x4018);
+ sv_u64_t i = svcvt_u64_f64_m (sv_u64 (V_ERF_NINTS - 1), a_lt_6, a_scale);
/* Load polynomial coefficients. */
sv_f64_t P_0 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[0], i);
@@ -56,8 +57,9 @@ __sv_erf_x (sv_f64_t x, const svbool_t pg)
/* Get shift and scale. */
sv_f64_t shift = sv_lookup_f64_x (pg, __v_erf_data.shifts, i);
- /* Transform polynomial variable. */
- sv_f64_t z = sv_fma_n_f64_x (pg, Scale, a, shift);
+ /* Transform polynomial variable.
+ Set z = 0 in the boring domain to avoid overflow. */
+ sv_f64_t z = svmla_f64_m (a_lt_6, shift, sv_f64 (Scale), a);
/* Evaluate polynomial P(z) using level-2 Estrin. */
sv_f64_t r1 = sv_fma_f64_x (pg, z, P_1, P_0);
@@ -75,10 +77,6 @@ __sv_erf_x (sv_f64_t x, const svbool_t pg)
sv_f64_t y = sv_fma_f64_x (pg, z4, r5, q2);
y = sv_fma_f64_x (pg, z4, y, q1);
- /* Saturate y. This works because using the last interval on the boring domain
- produces y > 1. */
- y = svmin_n_f64_x (pg, y, 1.0);
-
/* y = erf(x) if x > 0, -erf(-x) otherwise. */
y = sv_as_f64_u64 (sveor_u64_x (pg, sv_as_u64_f64 (y), sign));