diff options
author | Elliott Hughes <enh@google.com> | 2023-01-25 21:31:42 +0000 |
---|---|---|
committer | Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com> | 2023-01-25 21:31:42 +0000 |
commit | 8a481eb48cf7ace04026cbe776d65492f89a6d7d (patch) | |
tree | 945eb1832687ad5eb97c433013617dbe107718e5 /pl/math/v_erfcf_1u.c | |
parent | 0bc581b0e4f509c96bffa7af329087ae2b4bfb97 (diff) | |
parent | 8783f524beaad825ac1bddeb93cef35b5f793513 (diff) | |
download | arm-optimized-routines-8a481eb48cf7ace04026cbe776d65492f89a6d7d.tar.gz |
Upgrade ARM-software/optimized-routines to v23.01 am: 62662f115a am: 8783f524be
Original change: https://android-review.googlesource.com/c/platform/external/arm-optimized-routines/+/2402215
Change-Id: Ief5438cf89d86c43e5f52d5e48b882bacb945170
Signed-off-by: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>
Diffstat (limited to 'pl/math/v_erfcf_1u.c')
-rw-r--r-- | pl/math/v_erfcf_1u.c | 183 |
1 files changed, 183 insertions, 0 deletions
diff --git a/pl/math/v_erfcf_1u.c b/pl/math/v_erfcf_1u.c new file mode 100644 index 0000000..963490d --- /dev/null +++ b/pl/math/v_erfcf_1u.c @@ -0,0 +1,183 @@ +/* + * Single-precision vector erfc(x) function. + * + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "erfcf.h" +#include "estrin.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#define P(ia12) __erfcf_poly_data.poly[interval_index (ia12)] + +VPCS_ATTR v_f64_t V_NAME (exp_tail) (v_f64_t, v_f64_t); + +static VPCS_ATTR NOINLINE v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_u32_t special) +{ + return v_call_f32 (erfcf, x, y, special); +} + +static inline uint32_t +interval_index (uint32_t ia12) +{ + // clang-format off + return (ia12 < 0x400 ? 0 : + (ia12 < 0x408 ? 1 : + (ia12 < 0x410 ? 2 : + 3))); + // clang-format on +} + +/* The C macro wraps the coeffs argument in order to make the + poynomial evaluation more readable. In the scalarised variant the + second pointer is ignored. */ +#ifdef SCALAR +#define C(i) coeff1[i] +#else +#define C(i) ((v_f64_t){coeff1[i], coeff2[i]}) +#endif + +static inline v_f64_t +v_approx_erfcf_poly_gauss (v_f64_t x, const double *coeff1, + const double *coeff2) +{ + v_f64_t x2 = x * x; + v_f64_t x4 = x2 * x2; + v_f64_t poly = ESTRIN_15 (x, x2, x4, x4 * x4, C); + v_f64_t gauss = V_NAME (exp_tail) (-(x * x), v_f64 (0.0)); + return poly * gauss; +} + +static inline float +approx_poly_gauss (float abs_x, const double *coeff) +{ + return (float) (eval_poly (abs_x, coeff) * eval_exp_mx2 (abs_x)); +} + +static v_f32_t +v_approx_erfcf (v_f32_t abs_x, v_u32_t sign, v_u32_t ia12, v_u32_t lanes) +{ +#ifdef SCALAR + float y = approx_poly_gauss (abs_x, P (ia12)); + return sign ? 2 - y : y; +#else + float32x2_t lo32 = {0, 0}; + float32x2_t hi32 = {0, 0}; + /* The polynomial and Gaussian components must be calculated in + double precision in order to meet the required ULP error. This + means we have to promote low and high halves of the + single-precision input vector to two separate double-precision + input vectors. This incurs some overhead, and there is also + overhead to loading the polynomial coefficients as this cannot be + done in a vector fashion. This would be wasted effort for + elements which lie in the 'boring' zone, as they will be + overwritten later. Hence we use the lanes parameter to only do + the promotion on a pair of lanes if both of those lanes are + interesting and not special cases. If one lane is inactive, we + use a scalar routine which is shared with the scalar variant. */ + if (lanes[0] & lanes[1]) + { + lo32 = vcvt_f32_f64 ( + v_approx_erfcf_poly_gauss (vcvt_f64_f32 (vget_low_f32 (abs_x)), + P (ia12[0]), P (ia12[1]))); + } + else if (lanes[0]) + { + lo32[0] = approx_poly_gauss (abs_x[0], P (ia12[0])); + } + else if (lanes[1]) + { + lo32[1] = approx_poly_gauss (abs_x[1], P (ia12[1])); + } + + if (lanes[2] & lanes[3]) + { + hi32 + = vcvt_f32_f64 (v_approx_erfcf_poly_gauss (vcvt_high_f64_f32 (abs_x), + P (ia12[2]), P (ia12[3]))); + } + else if (lanes[2]) + { + hi32[0] = approx_poly_gauss (abs_x[2], P (ia12[2])); + } + else if (lanes[3]) + { + hi32[1] = approx_poly_gauss (abs_x[3], P (ia12[3])); + } + + v_f32_t y = vcombine_f32 (lo32, hi32); + + if (v_any_u32 (sign)) + { + y = vbslq_f32 (vceqzq_u32 (sign), y, 2 - y); + } + + return y; +#endif +} + +/* Optimized single-precision vector complementary error function + erfcf. Max measured error: 0.750092 at various values between + -0x1.06521p-20 and -0x1.add1dap-17. For example: + __v_erfc(-0x1.08185p-18) got 0x1.00004cp+0 want 0x1.00004ap+0 + +0.249908 ulp err 0.250092. */ +VPCS_ATTR +v_f32_t V_NAME (erfcf) (v_f32_t x) +{ + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t ia = ix & 0x7fffffff; + v_u32_t ia12 = ia >> 20; + v_u32_t sign = ix >> 31; + v_u32_t inf_ia12 = v_u32 (0x7f8); + + v_u32_t special_cases + = v_cond_u32 ((ia12 - 0x328) >= ((inf_ia12 & 0x7f8) - 0x328)); + v_u32_t in_bounds + = v_cond_u32 ((ia < 0x408ccccd) | (~sign & (ix < 0x4120f5c3))); + v_f32_t boring_zone = v_as_f32_u32 (sign << 30); + +#ifdef SCALAR + if (unlikely (special_cases)) + { + if (ia12 >= 0x7f8) + return (float) (sign << 1) + 1.0f / x; /* Special cases. */ + else + return 1.0f - x; /* Small case. */ + } + else if (likely (!in_bounds)) + { + return sign ? boring_zone : __math_uflowf (boring_zone); + } +#endif + + v_f32_t y = v_approx_erfcf (v_as_f32_u32 (ia), sign, ia12, + in_bounds & ~special_cases); + +#ifndef SCALAR + y = vbslq_f32 (~in_bounds, boring_zone, y); + + if (unlikely (v_any_u32 (special_cases))) + { + return specialcase (x, y, special_cases); + } +#endif + + return y; +} +VPCS_ALIAS + +PL_SIG (V, F, 1, erfc, -6.0, 28.0) +PL_TEST_ULP (V_NAME (erfcf), 0.26) +PL_TEST_INTERVAL (V_NAME (erfcf), 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (V_NAME (erfcf), 0x1p-127, 0x1p-26, 40000) +PL_TEST_INTERVAL (V_NAME (erfcf), -0x1p-127, -0x1p-26, 40000) +PL_TEST_INTERVAL (V_NAME (erfcf), 0x1p-26, 0x1p5, 40000) +PL_TEST_INTERVAL (V_NAME (erfcf), -0x1p-26, -0x1p3, 40000) +PL_TEST_INTERVAL (V_NAME (erfcf), 0, inf, 40000) +#endif |