path: root/pl/math/v_asinh_3u5.c
/*
 * Double-precision vector asinh(x) function.
 *
 * Copyright (c) 2022-2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

#include "v_math.h"
#include "estrin.h"
#include "pl_sig.h"
#include "pl_test.h"

#if V_SUPPORTED

#define OneTop 0x3ff	/* top12(asuint64(1.0)).  */
#define HugeBound 0x5fe /* top12(asuint64(0x1p511)).  */
#define TinyBound 0x3e5 /* top12(asuint64(0x1p-26)).  */
#define AbsMask v_u64 (0x7fffffffffffffff)
#define C(i) v_f64 (__asinh_data.poly[i])
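
/* For reference, a scalar sketch of the top12 classification used by the
   bounds above (a hypothetical helper, not used below; it assumes the
   stdint.h types pulled in via v_math.h). Returning the sign and biased
   exponent lets a single integer compare against OneTop, HugeBound or
   TinyBound bracket |x| without a floating-point comparison.  */
static inline uint64_t
example_top12 (double x)
{
  uint64_t u;
  __builtin_memcpy (&u, &x, sizeof (u)); /* asuint64 (x).  */
  return u >> 52;			 /* Sign bit and 11 exponent bits.  */
}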

/* Constants & data for log.  */
#define OFF 0x3fe6000000000000
#define Ln2 v_f64 (0x1.62e42fefa39efp-1)
#define A(i) v_f64 (__sv_log_data.poly[i])
#define T(i) __log_data.tab[i]
#define N (1 << LOG_TABLE_BITS)
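/* OFF recentres the reduction in log_inline below so that the significand z
   lands in a band starting at 0x1.6p-1, which the N-entry table of
   (invc, logc) pairs covers; this mirrors the scheme in math/v_log.c.  */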

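/* Fall back to the scalar asinh on lanes flagged in `special', keeping the
   fast-path result y on all other lanes (v_call_f64 performs the per-lane
   dispatch).  */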
static NOINLINE v_f64_t
special_case (v_f64_t x, v_f64_t y, v_u64_t special)
{
  return v_call_f64 (asinh, x, y, special);
}

struct entry
{
  v_f64_t invc;
  v_f64_t logc;
};

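/* Look up (1/c, log(c)) for each lane. The scalar build indexes the log
   table directly; the vector build emulates a gather with per-lane
   loads.  */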
static inline struct entry
lookup (v_u64_t i)
{
  struct entry e;
#ifdef SCALAR
  e.invc = T (i).invc;
  e.logc = T (i).logc;
#else
  e.invc[0] = T (i[0]).invc;
  e.logc[0] = T (i[0]).logc;
  e.invc[1] = T (i[1]).invc;
  e.logc[1] = T (i[1]).logc;
#endif
  return e;
}

static inline v_f64_t
log_inline (v_f64_t x)
{
  /* Double-precision vector log, copied from math/v_log.c with some cosmetic
     modification and special-cases removed. See that file for details of the
     algorithm used.  */
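  /* In outline: write x = 2^k * z with z close to 1, pick c near z from the
     table and let r = z * (1/c) - 1, so that
	log(x) = k * ln2 + log(c) + log1p(r),
     with log1p(r) approximated by the short polynomial in A(i).  */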
  v_u64_t ix = v_as_u64_f64 (x);
  v_u64_t tmp = ix - OFF;
  v_u64_t i = (tmp >> (52 - LOG_TABLE_BITS)) % N;
  v_s64_t k = v_as_s64_u64 (tmp) >> 52;
  v_u64_t iz = ix - (tmp & 0xfffULL << 52);
  v_f64_t z = v_as_f64_u64 (iz);
  struct entry e = lookup (i);
  v_f64_t r = v_fma_f64 (z, e.invc, v_f64 (-1.0));
  v_f64_t kd = v_to_f64_s64 (k);
  v_f64_t hi = v_fma_f64 (kd, Ln2, e.logc + r);
  v_f64_t r2 = r * r;
  v_f64_t y = v_fma_f64 (A (3), r, A (2));
  v_f64_t p = v_fma_f64 (A (1), r, A (0));
  y = v_fma_f64 (A (4), r2, y);
  y = v_fma_f64 (y, r2, p);
  y = v_fma_f64 (y, r2, hi);
  return y;
}

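/* A scalar sketch of the branch selection implemented below (hypothetical
   reference code, for exposition only; P stands for the shared polynomial
   and is not spelled out here):

     double asinh_ref (double x)
     {
       double ax = fabs (x);
       double y = ax >= 1.0 ? log (ax + sqrt (ax * ax + 1.0))
			    : ax + (ax * ax * ax) * P (ax * ax);
       return copysign (y, x);
     }

   The vector routine evaluates both branches on all lanes and selects per
   lane, since lanes of one vector may fall on either side of 1.  */
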
/* Double-precision implementation of vector asinh(x).
   asinh is very sensitive around 1, so it is impractical to devise a single
   low-cost algorithm which is sufficiently accurate on a wide range of input.
   Instead we use two different algorithms:
   asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1))    if |x| >= 1
	    = sign(x) * (|x| + |x|^3 * P(x^2))      otherwise
   where log(x) is an optimized log approximation, and P(x) is a polynomial
   shared with the scalar routine. The greatest observed error is 3.29 ULP,
   in the region |x| >= 1:
   __v_asinh(0x1.2cd9d717e2c9bp+0) got 0x1.ffffcfd0e234fp-1
				  want 0x1.ffffcfd0e2352p-1.  */
VPCS_ATTR v_f64_t V_NAME (asinh) (v_f64_t x)
{
  v_u64_t ix = v_as_u64_f64 (x);
  v_u64_t iax = ix & AbsMask;
  v_f64_t ax = v_as_f64_u64 (iax);
  v_u64_t top12 = iax >> 52;

  v_u64_t gt1 = v_cond_u64 (top12 >= OneTop);
  v_u64_t special = v_cond_u64 (top12 >= HugeBound);

#if WANT_SIMD_EXCEPT
  v_u64_t tiny = v_cond_u64 (top12 < TinyBound);
  special |= tiny;
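  /* Below TinyBound asinh(x) rounds to x, and evaluating the polynomial on
     the very smallest of those inputs could raise a spurious underflow, so
     these lanes are fenced off here and resolved later by selecting x or by
     the scalar fallback.  */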
#endif

  /* Option 1: |x| >= 1.
     Compute asinh(x) as log(x + sqrt(x^2 + 1)).
     If WANT_SIMD_EXCEPT is enabled, sidestep special values, which will
     overflow, by setting special lanes to 1. These will be fixed later.  */
  v_f64_t option_1 = v_f64 (0);
  if (likely (v_any_u64 (gt1)))
    {
#if WANT_SIMD_EXCEPT
      v_f64_t xm = v_sel_f64 (special, v_f64 (1), ax);
#else
      v_f64_t xm = ax;
#endif
      option_1 = log_inline (xm + v_sqrt_f64 (xm * xm + 1));
    }

  /* Option 2: |x| < 1.
     Compute asinh(x) using a polynomial.
     If WANT_SIMD_EXCEPT is enabled, sidestep special lanes, which will
     overflow, and tiny lanes, which will underflow, by setting them to 0. They
     will be fixed later, either by selecting x or falling back to the scalar
     special-case. The largest observed error in this region is 1.47 ULPs:
     __v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1
				    want 0x1.c1d6bf874019cp-1.  */
  v_f64_t option_2 = v_f64 (0);
  if (likely (v_any_u64 (~gt1)))
    {
#if WANT_SIMD_EXCEPT
      ax = v_sel_f64 (tiny | gt1, v_f64 (0), ax);
#endif
      v_f64_t x2 = ax * ax;
      v_f64_t z2 = x2 * x2;
      v_f64_t z4 = z2 * z2;
      v_f64_t z8 = z4 * z4;
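      /* ESTRIN_17 evaluates the degree-17 polynomial in x2 using Estrin's
	 scheme: independent low-order FMAs are combined with the precomputed
	 powers z2, z4, z8 and z8 * z8, giving a shorter dependency chain
	 than Horner's method.  */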
      v_f64_t p = ESTRIN_17 (x2, z2, z4, z8, z8 * z8, C);
      option_2 = v_fma_f64 (p, x2 * ax, ax);
#if WANT_SIMD_EXCEPT
      option_2 = v_sel_f64 (tiny, x, option_2);
#endif
    }

  /* Choose the right option for each lane.  */
  v_f64_t y = v_sel_f64 (gt1, option_1, option_2);
  /* Copy sign: take the magnitude bits of y under AbsMask and the sign bit
     from the original x.  */
  y = v_as_f64_u64 (v_bsl_u64 (AbsMask, v_as_u64_f64 (y), ix));

  if (unlikely (v_any_u64 (special)))
    return special_case (x, y, special);
  return y;
}
VPCS_ALIAS

PL_SIG (V, D, 1, asinh, -10.0, 10.0)
PL_TEST_ULP (V_NAME (asinh), 2.80)
PL_TEST_EXPECT_FENV (V_NAME (asinh), WANT_SIMD_EXCEPT)
/* Test vector asinh 3 times, with the control lane set to a value < 1, > 1
   and a special value, to ensure that v_sel chooses the right option in all
   cases.  */
#define V_ASINH_INTERVAL(lo, hi, n)                                            \
  PL_TEST_INTERVAL_C (V_NAME (asinh), lo, hi, n, 0.5)                          \
  PL_TEST_INTERVAL_C (V_NAME (asinh), lo, hi, n, 2)                            \
  PL_TEST_INTERVAL_C (V_NAME (asinh), lo, hi, n, 0x1p600)
V_ASINH_INTERVAL (0, 0x1p-26, 50000)
V_ASINH_INTERVAL (0x1p-26, 1, 50000)
V_ASINH_INTERVAL (1, 0x1p511, 50000)
V_ASINH_INTERVAL (0x1p511, inf, 40000)
V_ASINH_INTERVAL (-0, -0x1p-26, 50000)
V_ASINH_INTERVAL (-0x1p-26, -1, 50000)
V_ASINH_INTERVAL (-1, -0x1p511, 50000)
V_ASINH_INTERVAL (-0x1p511, -inf, 40000)
#endif