/* * Single-precision vector erfc(x) function. * * Copyright (c) 2021-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" #include "erfcf.h" #include "estrin.h" #include "pl_sig.h" #include "pl_test.h" #if V_SUPPORTED #define P(ia12) __erfcf_poly_data.poly[interval_index (ia12)] VPCS_ATTR v_f64_t V_NAME (exp_tail) (v_f64_t, v_f64_t); static VPCS_ATTR NOINLINE v_f32_t specialcase (v_f32_t x, v_f32_t y, v_u32_t special) { return v_call_f32 (erfcf, x, y, special); } static inline uint32_t interval_index (uint32_t ia12) { // clang-format off return (ia12 < 0x400 ? 0 : (ia12 < 0x408 ? 1 : (ia12 < 0x410 ? 2 : 3))); // clang-format on } /* The C macro wraps the coeffs argument in order to make the poynomial evaluation more readable. In the scalarised variant the second pointer is ignored. */ #ifdef SCALAR #define C(i) coeff1[i] #else #define C(i) ((v_f64_t){coeff1[i], coeff2[i]}) #endif static inline v_f64_t v_approx_erfcf_poly_gauss (v_f64_t x, const double *coeff1, const double *coeff2) { v_f64_t x2 = x * x; v_f64_t x4 = x2 * x2; v_f64_t poly = ESTRIN_15 (x, x2, x4, x4 * x4, C); v_f64_t gauss = V_NAME (exp_tail) (-(x * x), v_f64 (0.0)); return poly * gauss; } static inline float approx_poly_gauss (float abs_x, const double *coeff) { return (float) (eval_poly (abs_x, coeff) * eval_exp_mx2 (abs_x)); } static v_f32_t v_approx_erfcf (v_f32_t abs_x, v_u32_t sign, v_u32_t ia12, v_u32_t lanes) { #ifdef SCALAR float y = approx_poly_gauss (abs_x, P (ia12)); return sign ? 2 - y : y; #else float32x2_t lo32 = {0, 0}; float32x2_t hi32 = {0, 0}; /* The polynomial and Gaussian components must be calculated in double precision in order to meet the required ULP error. This means we have to promote low and high halves of the single-precision input vector to two separate double-precision input vectors. This incurs some overhead, and there is also overhead to loading the polynomial coefficients as this cannot be done in a vector fashion. This would be wasted effort for elements which lie in the 'boring' zone, as they will be overwritten later. Hence we use the lanes parameter to only do the promotion on a pair of lanes if both of those lanes are interesting and not special cases. If one lane is inactive, we use a scalar routine which is shared with the scalar variant. */ if (lanes[0] & lanes[1]) { lo32 = vcvt_f32_f64 ( v_approx_erfcf_poly_gauss (vcvt_f64_f32 (vget_low_f32 (abs_x)), P (ia12[0]), P (ia12[1]))); } else if (lanes[0]) { lo32[0] = approx_poly_gauss (abs_x[0], P (ia12[0])); } else if (lanes[1]) { lo32[1] = approx_poly_gauss (abs_x[1], P (ia12[1])); } if (lanes[2] & lanes[3]) { hi32 = vcvt_f32_f64 (v_approx_erfcf_poly_gauss (vcvt_high_f64_f32 (abs_x), P (ia12[2]), P (ia12[3]))); } else if (lanes[2]) { hi32[0] = approx_poly_gauss (abs_x[2], P (ia12[2])); } else if (lanes[3]) { hi32[1] = approx_poly_gauss (abs_x[3], P (ia12[3])); } v_f32_t y = vcombine_f32 (lo32, hi32); if (v_any_u32 (sign)) { y = vbslq_f32 (vceqzq_u32 (sign), y, 2 - y); } return y; #endif } /* Optimized single-precision vector complementary error function erfcf. Max measured error: 0.750092 at various values between -0x1.06521p-20 and -0x1.add1dap-17. For example: __v_erfc(-0x1.08185p-18) got 0x1.00004cp+0 want 0x1.00004ap+0 +0.249908 ulp err 0.250092. */ VPCS_ATTR v_f32_t V_NAME (erfcf) (v_f32_t x) { v_u32_t ix = v_as_u32_f32 (x); v_u32_t ia = ix & 0x7fffffff; v_u32_t ia12 = ia >> 20; v_u32_t sign = ix >> 31; v_u32_t inf_ia12 = v_u32 (0x7f8); v_u32_t special_cases = v_cond_u32 ((ia12 - 0x328) >= ((inf_ia12 & 0x7f8) - 0x328)); v_u32_t in_bounds = v_cond_u32 ((ia < 0x408ccccd) | (~sign & (ix < 0x4120f5c3))); v_f32_t boring_zone = v_as_f32_u32 (sign << 30); #ifdef SCALAR if (unlikely (special_cases)) { if (ia12 >= 0x7f8) return (float) (sign << 1) + 1.0f / x; /* Special cases. */ else return 1.0f - x; /* Small case. */ } else if (likely (!in_bounds)) { return sign ? boring_zone : __math_uflowf (boring_zone); } #endif v_f32_t y = v_approx_erfcf (v_as_f32_u32 (ia), sign, ia12, in_bounds & ~special_cases); #ifndef SCALAR y = vbslq_f32 (~in_bounds, boring_zone, y); if (unlikely (v_any_u32 (special_cases))) { return specialcase (x, y, special_cases); } #endif return y; } VPCS_ALIAS PL_SIG (V, F, 1, erfc, -6.0, 28.0) PL_TEST_ULP (V_NAME (erfcf), 0.26) PL_TEST_INTERVAL (V_NAME (erfcf), 0, 0xffff0000, 10000) PL_TEST_INTERVAL (V_NAME (erfcf), 0x1p-127, 0x1p-26, 40000) PL_TEST_INTERVAL (V_NAME (erfcf), -0x1p-127, -0x1p-26, 40000) PL_TEST_INTERVAL (V_NAME (erfcf), 0x1p-26, 0x1p5, 40000) PL_TEST_INTERVAL (V_NAME (erfcf), -0x1p-26, -0x1p3, 40000) PL_TEST_INTERVAL (V_NAME (erfcf), 0, inf, 40000) #endif