/*
 * Single-precision vector erfc(x) function.
 *
 * Copyright (c) 2021-2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

#include "v_math.h"
#include "erfcf.h"
#include "estrin.h"
#include "pl_sig.h"
#include "pl_test.h"

#if V_SUPPORTED

/* Select the row of polynomial coefficients in __erfcf_poly_data.poly
   for the interval that ia12 falls into; see interval_index.  */
#define P(ia12) __erfcf_poly_data.poly[interval_index (ia12)]

/* Forward declaration of the double-precision vector routine that is
   called below to produce the Gaussian factor.
   NOTE(review): the definition is not visible in this file; presumably
   it lives in another translation unit -- confirm.  */
VPCS_ATTR v_f64_t V_NAME (exp_tail) (v_f64_t, v_f64_t);

/* Out-of-line fallback for vectors that contain special lanes: hands
   the input x, the result y computed so far and the lane mask to
   v_call_f32 together with the scalar erfcf routine.  */
static VPCS_ATTR NOINLINE v_f32_t
specialcase (v_f32_t x, v_f32_t y, v_u32_t special)
{
  v_f32_t fixed = v_call_f32 (erfcf, x, y, special);
  return fixed;
}

/* Map ia12 onto one of four polynomial intervals:
     ia12 <  0x400           -> 0
     0x400 <= ia12 < 0x408   -> 1
     0x408 <= ia12 < 0x410   -> 2
     ia12 >= 0x410           -> 3.  */
static inline uint32_t
interval_index (uint32_t ia12)
{
  if (ia12 >= 0x410)
    return 3;
  if (ia12 >= 0x408)
    return 2;
  if (ia12 >= 0x400)
    return 1;
  return 0;
}

/* The C macro wraps the coeffs argument in order to make the
   polynomial evaluation more readable. In the scalarised variant the
   second pointer is ignored; in the vector variant lane 0 takes its
   coefficient from coeff1 and lane 1 from coeff2. The macro relies on
   coeff1 and coeff2 being in scope where it is expanded.  */
#ifdef SCALAR
#define C(i) coeff1[i]
#else
#define C(i) ((v_f64_t){coeff1[i], coeff2[i]})
#endif

/* Double-precision kernel: returns poly (x) * gauss, where poly is
   evaluated by ESTRIN_15 with coefficients fetched through the C macro
   (coeff1, plus coeff2 in the vector build) and gauss is the result of
   exp_tail applied to -x^2 with a zero second argument.  */
static inline v_f64_t
v_approx_erfcf_poly_gauss (v_f64_t x, const double *coeff1,
			   const double *coeff2)
{
  /* Powers of x shared by the polynomial and the Gaussian argument.  */
  v_f64_t x2 = x * x;
  v_f64_t x4 = x2 * x2;
  v_f64_t x8 = x4 * x4;

  v_f64_t poly = ESTRIN_15 (x, x2, x4, x8, C);
  v_f64_t gauss = V_NAME (exp_tail) (-x2, v_f64 (0.0));

  return poly * gauss;
}

/* Scalar evaluation of a single element: multiply
   eval_poly (abs_x, coeff) by eval_exp_mx2 (abs_x) and narrow the
   product to float.  */
static inline float
approx_poly_gauss (float abs_x, const double *coeff)
{
  double product = eval_poly (abs_x, coeff) * eval_exp_mx2 (abs_x);
  return (float) product;
}

/* Approximate erfc for the active lanes of abs_x.  sign flags the
   lanes whose original input had its sign bit set (those lanes return
   2 - y), ia12 selects the coefficient row per lane through P, and
   lanes marks which elements actually need computing.  Elements whose
   lanes entry is zero are left at 0 before the sign reflection.  */
static v_f32_t
v_approx_erfcf (v_f32_t abs_x, v_u32_t sign, v_u32_t ia12, v_u32_t lanes)
{
#ifdef SCALAR
  /* Scalar build: one evaluation, reflected when the sign flag is
     set.  The lanes argument is not consulted here.  */
  float res = approx_poly_gauss (abs_x, P (ia12));
  if (sign)
    return 2 - res;
  return res;
#else
  /* Meeting the ULP target requires the polynomial and the Gaussian
     factor to be evaluated in double precision, so each half of the
     single-precision input is widened into its own double-precision
     vector.  Widening and the per-lane (non-vectorisable) coefficient
     loads are not free, and would be wasted on elements that get
     overwritten later.  A pair is therefore only processed as a vector
     when both of its lanes are active; when exactly one is active, the
     scalar routine shared with the scalar variant is used for it.  */
  float32x2_t bottom = {0, 0};
  float32x2_t top = {0, 0};

  /* Elements 0 and 1.  */
  if (!(lanes[0] & lanes[1]))
    {
      if (lanes[0])
	bottom[0] = approx_poly_gauss (abs_x[0], P (ia12[0]));
      else if (lanes[1])
	bottom[1] = approx_poly_gauss (abs_x[1], P (ia12[1]));
    }
  else
    {
      bottom = vcvt_f32_f64 (
	v_approx_erfcf_poly_gauss (vcvt_f64_f32 (vget_low_f32 (abs_x)),
				   P (ia12[0]), P (ia12[1])));
    }

  /* Elements 2 and 3.  */
  if (!(lanes[2] & lanes[3]))
    {
      if (lanes[2])
	top[0] = approx_poly_gauss (abs_x[2], P (ia12[2]));
      else if (lanes[3])
	top[1] = approx_poly_gauss (abs_x[3], P (ia12[3]));
    }
  else
    {
      top = vcvt_f32_f64 (
	v_approx_erfcf_poly_gauss (vcvt_high_f64_f32 (abs_x), P (ia12[2]),
				   P (ia12[3])));
    }

  v_f32_t joined = vcombine_f32 (bottom, top);

  /* Nothing to reflect when no lane has its sign flag set.  */
  if (!v_any_u32 (sign))
    return joined;

  /* Keep joined where sign is zero, use 2 - joined elsewhere.  */
  return vbslq_f32 (vceqzq_u32 (sign), joined, 2 - joined);
#endif
}

/* Optimized single-precision vector complementary error function
   erfcf. Max measured error: 0.750092 at various values between
   -0x1.06521p-20 and -0x1.add1dap-17. For example:
   __v_erfc(-0x1.08185p-18) got 0x1.00004cp+0 want 0x1.00004ap+0
   +0.249908 ulp err 0.250092.

   Each element is classified as one of: a special case (very small
   |x|, or an infinity/NaN exponent), out of bounds (the 'boring' zone,
   where the result is a constant), or in bounds, in which case it is
   computed by v_approx_erfcf.  */
VPCS_ATTR
v_f32_t V_NAME (erfcf) (v_f32_t x)
{
  /* ix: x viewed as unsigned integers (presumably a bit-for-bit
     reinterpretation); ia: ix with the top bit cleared, i.e. |x|;
     ia12: the bits of ia above bit 19; sign: 1 where the top bit of ix
     is set, 0 otherwise.  */
  v_u32_t ix = v_as_u32_f32 (x);
  v_u32_t ia = ix & 0x7fffffff;
  v_u32_t ia12 = ia >> 20;
  v_u32_t sign = ix >> 31;
  v_u32_t inf_ia12 = v_u32 (0x7f8);

  /* One comparison covers both ends: the subtraction wraps around for
     ia12 < 0x328 (relies on unsigned arithmetic), so the test is true
     for ia12 < 0x328 as well as for ia12 >= 0x7f8.  */
  v_u32_t special_cases
    = v_cond_u32 ((ia12 - 0x328) >= ((inf_ia12 & 0x7f8) - 0x328));
  /* In bounds when |x| is below bit pattern 0x408ccccd (about 4.4f),
     or when the sign bit is clear and x is below 0x4120f5c3 (about
     10.06f).  */
  v_u32_t in_bounds
    = v_cond_u32 ((ia < 0x408ccccd) | (~sign & (ix < 0x4120f5c3)));
  /* Constant used outside the bounds: sign << 30 is 0x40000000 (2.0f)
     where the sign bit is set and 0 (0.0f) elsewhere.  */
  v_f32_t boring_zone = v_as_f32_u32 (sign << 30);

#ifdef SCALAR
  /* Scalar build: the special and out-of-bounds cases return directly
     and never reach the approximation.  */
  if (unlikely (special_cases))
    {
      /* ia12 >= 0x7f8: returns 2 or 0 (from the sign) plus 1 / x.
	 Otherwise ia12 < 0x328 and the result is 1 - x.  */
      if (ia12 >= 0x7f8)
	return (float) (sign << 1) + 1.0f / x; /* Special cases.  */
      else
	return 1.0f - x; /* Small case.  */
    }
  else if (likely (!in_bounds))
    {
      /* NOTE(review): __math_uflowf presumably produces an underflowed
	 result and raises the matching exception -- confirm.  */
      return sign ? boring_zone : __math_uflowf (boring_zone);
    }
#endif

  /* Only elements that are in bounds and not special are marked active
     for the approximation.  */
  v_f32_t y = v_approx_erfcf (v_as_f32_u32 (ia), sign, ia12,
			      in_bounds & ~special_cases);

#ifndef SCALAR
  /* Replace out-of-bounds elements with the boring-zone constant.  */
  y = vbslq_f32 (~in_bounds, boring_zone, y);

  /* Hand any special elements to the out-of-line fallback.  */
  if (unlikely (v_any_u32 (special_cases)))
    {
      return specialcase (x, y, special_cases);
    }
#endif

  return y;
}
VPCS_ALIAS

/* Signature and test declarations for this routine.
   NOTE(review): these macros presumably come from pl_sig.h and
   pl_test.h and are consumed by the project's test tooling; 0.26 looks
   like the permitted error in ULP beyond 0.5, and each PL_TEST_INTERVAL
   looks like (routine, start, end, number of samples) -- confirm
   against those headers.  */
PL_SIG (V, F, 1, erfc, -6.0, 28.0)
PL_TEST_ULP (V_NAME (erfcf), 0.26)
PL_TEST_INTERVAL (V_NAME (erfcf), 0, 0xffff0000, 10000)
PL_TEST_INTERVAL (V_NAME (erfcf), 0x1p-127, 0x1p-26, 40000)
PL_TEST_INTERVAL (V_NAME (erfcf), -0x1p-127, -0x1p-26, 40000)
PL_TEST_INTERVAL (V_NAME (erfcf), 0x1p-26, 0x1p5, 40000)
PL_TEST_INTERVAL (V_NAME (erfcf), -0x1p-26, -0x1p3, 40000)
PL_TEST_INTERVAL (V_NAME (erfcf), 0, inf, 40000)
#endif