aboutsummaryrefslogtreecommitdiff
path: root/pl/math
diff options
context:
space:
mode:
Diffstat (limited to 'pl/math')
-rw-r--r--pl/math/asinh_2u5.c39
-rw-r--r--pl/math/asinhf_3u5.c13
-rw-r--r--pl/math/atan_common.h34
-rw-r--r--pl/math/atanf_common.h8
-rw-r--r--pl/math/cbrtf_1u5.c6
-rw-r--r--pl/math/erfc_4u5.c23
-rw-r--r--pl/math/erfcf.h38
-rw-r--r--pl/math/erfcf_2u.c4
-rw-r--r--pl/math/erff_1u5.c22
-rw-r--r--pl/math/estrin.h16
-rw-r--r--pl/math/estrin_wrap.h48
-rw-r--r--pl/math/estrinf.h14
-rw-r--r--pl/math/expm1_2u5.c23
-rw-r--r--pl/math/expm1f_1u6.c8
-rw-r--r--pl/math/horner.h14
-rw-r--r--pl/math/horner_wrap.h34
-rw-r--r--pl/math/hornerf.h14
-rw-r--r--pl/math/log1p_2u.c13
-rw-r--r--pl/math/log1p_common.h61
-rw-r--r--pl/math/log1pf_2u1.c15
-rw-r--r--pl/math/pairwise_horner.h14
-rw-r--r--pl/math/pairwise_horner_wrap.h36
-rw-r--r--pl/math/pairwise_hornerf.h14
-rwxr-xr-xpl/math/test/runulp.sh2
-rw-r--r--pl/math/v_asinh_2u5.c38
-rw-r--r--pl/math/v_erfc_4u.c26
-rw-r--r--pl/math/v_erfcf_1u.c36
-rw-r--r--pl/math/v_log1p_2u5.c12
-rw-r--r--pl/math/v_tanf_3u2.c10
-rw-r--r--pl/math/v_tanhf_2u6.c6
30 files changed, 299 insertions, 342 deletions
diff --git a/pl/math/asinh_2u5.c b/pl/math/asinh_2u5.c
index f22b342..9cbdd33 100644
--- a/pl/math/asinh_2u5.c
+++ b/pl/math/asinh_2u5.c
@@ -5,48 +5,17 @@
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
+#include "estrin.h"
#define AbsMask 0x7fffffffffffffff
#define ExpM26 0x3e50000000000000 /* asuint64(0x1.0p-26). */
#define One 0x3ff0000000000000 /* asuint64(1.0). */
#define Exp511 0x5fe0000000000000 /* asuint64(0x1.0p511). */
#define Ln2 0x1.62e42fefa39efp-1
-#define C(i) __asinh_data.poly[i]
double
optr_aor_log_f64 (double);
-static inline double
-eval_poly (double z)
-{
- /* Evaluate polynomial using Estrin scheme. */
- double p_01 = fma (z, C (1), C (0));
- double p_23 = fma (z, C (3), C (2));
- double p_45 = fma (z, C (5), C (4));
- double p_67 = fma (z, C (7), C (6));
- double p_89 = fma (z, C (9), C (8));
- double p_ab = fma (z, C (11), C (10));
- double p_cd = fma (z, C (13), C (12));
- double p_ef = fma (z, C (15), C (14));
- double p_gh = fma (z, C (17), C (16));
-
- double z2 = z * z;
- double p_03 = fma (z2, p_23, p_01);
- double p_47 = fma (z2, p_67, p_45);
- double p_8b = fma (z2, p_ab, p_89);
- double p_cf = fma (z2, p_ef, p_cd);
-
- double z4 = z2 * z2;
- double p_07 = fma (z4, p_47, p_03);
- double p_8f = fma (z4, p_cf, p_8b);
-
- double z8 = z4 * z4;
- double p_0f = fma (z8, p_8f, p_07);
-
- double z16 = z8 * z8;
- return fma (z16, p_gh, p_0f);
-}
-
/* Scalar double-precision asinh implementation. This routine uses different
approaches on different intervals:
@@ -86,7 +55,11 @@ asinh (double x)
if (ia < One)
{
double x2 = x * x;
- double p = eval_poly (x2);
+ double z2 = x2 * x2;
+ double z4 = z2 * z2;
+ double z8 = z4 * z4;
+#define C(i) __asinh_data.poly[i]
+ double p = ESTRIN_17 (x2, z2, z4, z8, z8 * z8, C);
double y = fma (p, x2 * ax, ax);
return asdouble (asuint64 (y) | sign);
}
diff --git a/pl/math/asinhf_3u5.c b/pl/math/asinhf_3u5.c
index 8aa62ad..48acdef 100644
--- a/pl/math/asinhf_3u5.c
+++ b/pl/math/asinhf_3u5.c
@@ -5,6 +5,7 @@
*/
#include "math_config.h"
+#include "estrinf.h"
#define AbsMask (0x7fffffff)
#define SqrtFltMax (0x1.749e96p+10f)
@@ -53,17 +54,7 @@ asinhf (float x)
if (ia12 < One)
{
float x2 = ax * ax;
- float x4 = x2 * x2;
-
- float p_01 = fmaf (ax, C (1), C (0));
- float p_23 = fmaf (ax, C (3), C (2));
- float p_45 = fmaf (ax, C (5), C (4));
- float p_67 = fmaf (ax, C (7), C (6));
-
- float p_03 = fmaf (x2, p_23, p_01);
- float p_47 = fmaf (x2, p_67, p_45);
-
- float p = fmaf (x4, p_47, p_03);
+ float p = ESTRIN_7 (ax, x2, x2 * x2, C);
float y = fmaf (x2, p, ax);
return asfloat (asuint (y) | sign);
}
diff --git a/pl/math/atan_common.h b/pl/math/atan_common.h
index 1690e7e..331c1bb 100644
--- a/pl/math/atan_common.h
+++ b/pl/math/atan_common.h
@@ -7,19 +7,18 @@
*/
#include "math_config.h"
+#include "estrin.h"
#if V_SUPPORTED
#include "v_math.h"
#define DBL_T v_f64_t
-#define FMA v_fma_f64
#define P(i) v_f64 (__atan_poly_data.poly[i])
#else
#define DBL_T double
-#define FMA fma
#define P(i) __atan_poly_data.poly[i]
#endif
@@ -29,37 +28,14 @@
static inline DBL_T
eval_poly (DBL_T z, DBL_T az, DBL_T shift)
{
- /* Use full Estrin scheme for P(z^2) with deg(P)=19. */
+ /* Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of
+ full scheme to avoid underflow in x^16. */
DBL_T z2 = z * z;
- /* Level 1. */
- DBL_T P_1_0 = FMA (P (1), z2, P (0));
- DBL_T P_3_2 = FMA (P (3), z2, P (2));
- DBL_T P_5_4 = FMA (P (5), z2, P (4));
- DBL_T P_7_6 = FMA (P (7), z2, P (6));
- DBL_T P_9_8 = FMA (P (9), z2, P (8));
- DBL_T P_11_10 = FMA (P (11), z2, P (10));
- DBL_T P_13_12 = FMA (P (13), z2, P (12));
- DBL_T P_15_14 = FMA (P (15), z2, P (14));
- DBL_T P_17_16 = FMA (P (17), z2, P (16));
- DBL_T P_19_18 = FMA (P (19), z2, P (18));
-
- /* Level 2. */
DBL_T x2 = z2 * z2;
- DBL_T P_3_0 = FMA (P_3_2, x2, P_1_0);
- DBL_T P_7_4 = FMA (P_7_6, x2, P_5_4);
- DBL_T P_11_8 = FMA (P_11_10, x2, P_9_8);
- DBL_T P_15_12 = FMA (P_15_14, x2, P_13_12);
- DBL_T P_19_16 = FMA (P_19_18, x2, P_17_16);
-
- /* Level 3. */
DBL_T x4 = x2 * x2;
- DBL_T P_7_0 = FMA (P_7_4, x4, P_3_0);
- DBL_T P_15_8 = FMA (P_15_12, x4, P_11_8);
-
- /* Level 4. */
DBL_T x8 = x4 * x4;
- DBL_T y = FMA (P_19_16, x8, P_15_8);
- y = FMA (y, x8, P_7_0);
+ DBL_T y
+ = FMA (ESTRIN_11_ (z2, x2, x4, x8, P, 8), x8, ESTRIN_7 (z2, x2, x4, P));
/* Finalize. y = shift + z + z^3 * P(z^2). */
y = FMA (y, z2 * az, az);
diff --git a/pl/math/atanf_common.h b/pl/math/atanf_common.h
index 436b88b..3038e54 100644
--- a/pl/math/atanf_common.h
+++ b/pl/math/atanf_common.h
@@ -10,19 +10,18 @@
#define PL_MATH_ATANF_COMMON_H
#include "math_config.h"
+#include "estrinf.h"
#if V_SUPPORTED
#include "v_math.h"
#define FLT_T v_f32_t
-#define FMA v_fma_f32
#define P(i) v_f32 (__atanf_poly_data.poly[i])
#else
#define FLT_T float
-#define FMA fmaf
#define P(i) __atanf_poly_data.poly[i]
#endif
@@ -42,10 +41,7 @@ eval_poly (FLT_T z, FLT_T az, FLT_T shift)
FLT_T z4 = z2 * z2;
/* Then assemble polynomial. */
- FLT_T y
- = FMA (z4,
- z4 * FMA (z4, (FMA (z2, P (7), P (6))), (FMA (z2, P (5), P (4)))),
- FMA (z4, (FMA (z2, P (3), P (2))), (FMA (z2, P (1), P (0)))));
+ FLT_T y = FMA (z4, z4 * ESTRIN_3_ (z2, z4, P, 4), ESTRIN_3 (z2, z4, P));
/* Finalize:
y = shift + z * P(z^2). */
diff --git a/pl/math/cbrtf_1u5.c b/pl/math/cbrtf_1u5.c
index 73b9049..d544a68 100644
--- a/pl/math/cbrtf_1u5.c
+++ b/pl/math/cbrtf_1u5.c
@@ -8,6 +8,7 @@
#include <math.h>
#include "math_config.h"
+#include "estrinf.h"
#define AbsMask 0x7fffffff
#define SignMask 0x80000000
@@ -40,10 +41,7 @@ cbrtf (float x)
/* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is,
the less accurate the next stage of the algorithm needs to be. An order-4
polynomial is enough for one Newton iteration. */
- float p_01 = fmaf (C (1), m, C (0));
- float p_23 = fmaf (C (3), m, C (2));
- float p = fmaf (m * m, p_23, p_01);
-
+ float p = ESTRIN_3 (m, m * m, C);
/* One iteration of Newton's method for iteratively approximating cbrt. */
float m_by_3 = m / 3;
float a = fmaf (TwoThirds, p, m_by_3 / (p * p));
diff --git a/pl/math/erfc_4u5.c b/pl/math/erfc_4u5.c
index 003a0cc..8088562 100644
--- a/pl/math/erfc_4u5.c
+++ b/pl/math/erfc_4u5.c
@@ -9,6 +9,7 @@
#include <math.h>
#include <errno.h>
#include "math_config.h"
+#include "pairwise_horner.h"
#define AbsMask (0x7fffffffffffffff)
@@ -19,28 +20,12 @@
double
__exp_dd (double x, double xtail);
-/* Evaluate order-12 polynomials using
- pairwise summation and Horner scheme
- in double precision. */
static inline double
eval_poly_horner (double z, int i)
{
- double r1, r2, r3, r4, r5, r6, z2;
- r1 = fma (z, PX[i][1], PX[i][0]);
- r2 = fma (z, PX[i][3], PX[i][2]);
- r3 = fma (z, PX[i][5], PX[i][4]);
- r4 = fma (z, PX[i][7], PX[i][6]);
- r5 = fma (z, PX[i][9], PX[i][8]);
- r6 = fma (z, PX[i][11], PX[i][10]);
- z2 = z * z;
- double r = PX[i][12];
- r = fma (z2, r, r6);
- r = fma (z2, r, r5);
- r = fma (z2, r, r4);
- r = fma (z2, r, r3);
- r = fma (z2, r, r2);
- r = fma (z2, r, r1);
- return r;
+ double z2 = z * z;
+#define C(j) PX[i][j]
+ return PAIRWISE_HORNER_12 (z, z2, C);
}
/* Accurate evaluation of exp(x^2)
diff --git a/pl/math/erfcf.h b/pl/math/erfcf.h
index 6adc6b4..98ead38 100644
--- a/pl/math/erfcf.h
+++ b/pl/math/erfcf.h
@@ -8,39 +8,24 @@
#ifndef PL_MATH_ERFCF_H
#define PL_MATH_ERFCF_H
-#include <math.h>
+#include "math_config.h"
+
+#define FMA fma
+#include "estrin_wrap.h"
/* Accurate exponential from optimized-routines. */
double
__exp_dd (double x, double xtail);
-/* Evaluate order-12 polynomials using pairwise summation and Horner scheme in
- double precision. */
static inline double
-eval_poly_horner_lvl2 (double z, const double *coeff)
+eval_poly (double z, const double *coeff)
{
- double r1, r2, r3, r4, r5, r6, r7, r8;
- double R1, R2, R3, R4;
- double Q1, Q2;
- double z2, z4, z8;
- z2 = z * z;
- r1 = fma (z, coeff[1], coeff[0]);
- r2 = fma (z, coeff[3], coeff[2]);
- z4 = z2 * z2;
- z8 = z4 * z4;
- R1 = fma (z2, r2, r1);
- r3 = fma (z, coeff[5], coeff[4]);
- r4 = fma (z, coeff[7], coeff[6]);
- R2 = fma (z2, r4, r3);
- Q1 = fma (z4, R2, R1);
- r5 = fma (z, coeff[9], coeff[8]);
- r6 = fma (z, coeff[11], coeff[10]);
- R3 = fma (z2, r6, r5);
- r7 = fma (z, coeff[13], coeff[12]);
- r8 = fma (z, coeff[15], coeff[14]);
- R4 = fma (z2, r8, r7);
- Q2 = fma (z4, R4, R3);
- return fma (z8, Q2, Q1);
+ double z2 = z * z;
+ double z4 = z2 * z2;
+ double z8 = z4 * z4;
+#define C(i) coeff[i]
+ return ESTRIN_15 (z, z2, z4, z8, C);
+#undef C
}
static inline double
@@ -49,4 +34,5 @@ eval_exp_mx2 (double x)
return __exp_dd (-(x * x), 0.0);
}
+#undef FMA
#endif // PL_MATH_ERFCF_H
diff --git a/pl/math/erfcf_2u.c b/pl/math/erfcf_2u.c
index 80dba83..8d4bba1 100644
--- a/pl/math/erfcf_2u.c
+++ b/pl/math/erfcf_2u.c
@@ -21,7 +21,7 @@ approx_erfcf_hi (float x, uint32_t sign, const double *coeff)
/* Polynomial contribution. */
double z = (double) fabs (x);
- float p = (float) eval_poly_horner_lvl2 (z, coeff);
+ float p = (float) eval_poly (z, coeff);
/* Gaussian contribution. */
float e_mx2 = (float) eval_exp_mx2 (z);
@@ -34,7 +34,7 @@ approx_erfcf_lo (float x, uint32_t sign, const double *coeff)
{
/* Polynomial contribution. */
double z = (double) fabs (x);
- float p = (float) eval_poly_horner_lvl2 (z, coeff);
+ float p = (float) eval_poly (z, coeff);
/* Gaussian contribution. */
float e_mx2 = (float) eval_exp_mx2 (z);
diff --git a/pl/math/erff_1u5.c b/pl/math/erff_1u5.c
index 1073603..bad68a6 100644
--- a/pl/math/erff_1u5.c
+++ b/pl/math/erff_1u5.c
@@ -7,7 +7,10 @@
#include <stdint.h>
#include <math.h>
+
#include "math_config.h"
+#include "hornerf.h"
+#include "estrinf.h"
#define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f
#define A __erff_data.erff_poly_A
@@ -26,7 +29,7 @@ top12 (float x)
float
erff (float x)
{
- float r, x2, u;
+ float r, x2;
/* Get top word. */
uint32_t ix = asuint (x);
@@ -54,23 +57,18 @@ erff (float x)
/* Normalized cases (|x| < 0.921875) - Use Horner scheme for x+x*P(x^2).
*/
- r = A[5];
- r = fmaf (r, x2, A[4]);
- r = fmaf (r, x2, A[3]);
- r = fmaf (r, x2, A[2]);
- r = fmaf (r, x2, A[1]);
- r = fmaf (r, x2, A[0]);
- r = fmaf (r, x, x);
+#define C(i) A[i]
+ r = fmaf (HORNER_5 (x2, C), x, x);
+#undef C
}
else if (ia12 < 0x408)
{ /* |x| < 4.0 - Use a custom Estrin scheme. */
float a = fabsf (x);
/* Use Estrin scheme on high order (small magnitude) coefficients. */
- r = fmaf (B[6], a, B[5]);
- u = fmaf (B[4], a, B[3]);
- x2 = x * x;
- r = fmaf (r, x2, u);
+#define C(i) B[i]
+ r = ESTRIN_3_ (a, x * x, C, 3);
+#undef C
/* Then switch to pure Horner scheme. */
r = fmaf (r, a, B[2]);
r = fmaf (r, a, B[1]);
diff --git a/pl/math/estrin.h b/pl/math/estrin.h
new file mode 100644
index 0000000..89df329
--- /dev/null
+++ b/pl/math/estrin.h
@@ -0,0 +1,16 @@
+/*
+ * Helper macros for double-precision Estrin polynomial evaluation.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+#if V_SUPPORTED
+#define FMA v_fma_f64
+#else
+#define FMA fma
+#endif
+
+#include "estrin_wrap.h"
diff --git a/pl/math/estrin_wrap.h b/pl/math/estrin_wrap.h
new file mode 100644
index 0000000..93af2ab
--- /dev/null
+++ b/pl/math/estrin_wrap.h
@@ -0,0 +1,48 @@
+/*
+ * Helper macros for Estrin polynomial evaluation.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+// clang-format off
+#define ESTRIN_1_(x, c, i) FMA(x, c(1 + i), c(i))
+#define ESTRIN_2_(x, x2, c, i) FMA(x2, c(2 + i), ESTRIN_1_(x, c, i))
+#define ESTRIN_3_(x, x2, c, i) FMA(x2, ESTRIN_1_(x, c, 2 + i), ESTRIN_1_(x, c, i))
+#define ESTRIN_4_(x, x2, x4, c, i) FMA(x4, c(4 + i), ESTRIN_3_(x, x2, c, i))
+#define ESTRIN_5_(x, x2, x4, c, i) FMA(x4, ESTRIN_1_(x, c, 4 + i), ESTRIN_3_(x, x2, c, i))
+#define ESTRIN_6_(x, x2, x4, c, i) FMA(x4, ESTRIN_2_(x, x2, c, 4 + i), ESTRIN_3_(x, x2, c, i))
+#define ESTRIN_7_(x, x2, x4, c, i) FMA(x4, ESTRIN_3_(x, x2, c, 4 + i), ESTRIN_3_(x, x2, c, i))
+#define ESTRIN_8_(x, x2, x4, x8, c, i) FMA(x8, c(8 + i), ESTRIN_7_(x, x2, x4, c, i))
+#define ESTRIN_9_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_1_(x, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i))
+#define ESTRIN_10_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_2_(x, x2, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i))
+#define ESTRIN_11_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_3_(x, x2, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i))
+#define ESTRIN_12_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_4_(x, x2, x4, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i))
+#define ESTRIN_13_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_5_(x, x2, x4, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i))
+#define ESTRIN_14_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_6_(x, x2, x4, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i))
+#define ESTRIN_15_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_7_(x, x2, x4, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i))
+#define ESTRIN_16_(x, x2, x4, x8, x16, c, i) FMA(x16, c(16 + i), ESTRIN_15_(x, x2, x4, x8, c, i))
+#define ESTRIN_17_(x, x2, x4, x8, x16, c, i) FMA(x16, ESTRIN_1_(x, c, 16 + i), ESTRIN_15_(x, x2, x4, x8, c, i))
+#define ESTRIN_18_(x, x2, x4, x8, x16, c, i) FMA(x16, ESTRIN_2_(x, x2, c, 16 + i), ESTRIN_15_(x, x2, x4, x8, c, i))
+#define ESTRIN_19_(x, x2, x4, x8, x16, c, i) FMA(x16, ESTRIN_3_(x, x2, c, 16 + i), ESTRIN_15_(x, x2, x4, x8, c, i))
+
+#define ESTRIN_1(x, c) ESTRIN_1_(x, c, 0)
+#define ESTRIN_2(x, x2, c) ESTRIN_2_(x, x2, c, 0)
+#define ESTRIN_3(x, x2, c) ESTRIN_3_(x, x2, c, 0)
+#define ESTRIN_4(x, x2, x4, c) ESTRIN_4_(x, x2, x4, c, 0)
+#define ESTRIN_5(x, x2, x4, c) ESTRIN_5_(x, x2, x4, c, 0)
+#define ESTRIN_6(x, x2, x4, c) ESTRIN_6_(x, x2, x4, c, 0)
+#define ESTRIN_7(x, x2, x4, c) ESTRIN_7_(x, x2, x4, c, 0)
+#define ESTRIN_8(x, x2, x4, x8, c) ESTRIN_8_(x, x2, x4, x8, c, 0)
+#define ESTRIN_9(x, x2, x4, x8, c) ESTRIN_9_(x, x2, x4, x8, c, 0)
+#define ESTRIN_10(x, x2, x4, x8, c) ESTRIN_10_(x, x2, x4, x8, c, 0)
+#define ESTRIN_11(x, x2, x4, x8, c) ESTRIN_11_(x, x2, x4, x8, c, 0)
+#define ESTRIN_12(x, x2, x4, x8, c) ESTRIN_12_(x, x2, x4, x8, c, 0)
+#define ESTRIN_13(x, x2, x4, x8, c) ESTRIN_13_(x, x2, x4, x8, c, 0)
+#define ESTRIN_14(x, x2, x4, x8, c) ESTRIN_14_(x, x2, x4, x8, c, 0)
+#define ESTRIN_15(x, x2, x4, x8, c) ESTRIN_15_(x, x2, x4, x8, c, 0)
+#define ESTRIN_16(x, x2, x4, x8, x16, c) ESTRIN_16_(x, x2, x4, x8, x16, c, 0)
+#define ESTRIN_17(x, x2, x4, x8, x16, c) ESTRIN_17_(x, x2, x4, x8, x16, c, 0)
+#define ESTRIN_18(x, x2, x4, x8, x16, c) ESTRIN_18_(x, x2, x4, x8, x16, c, 0)
+#define ESTRIN_19(x, x2, x4, x8, x16, c) ESTRIN_19_(x, x2, x4, x8, x16, c, 0)
+// clang-format on
diff --git a/pl/math/estrinf.h b/pl/math/estrinf.h
new file mode 100644
index 0000000..be52ab5
--- /dev/null
+++ b/pl/math/estrinf.h
@@ -0,0 +1,14 @@
+/*
+ * Helper macros for single-precision Estrin polynomial evaluation.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#if V_SUPPORTED
+#define FMA v_fma_f32
+#else
+#define FMA fmaf
+#endif
+
+#include "estrin_wrap.h"
diff --git a/pl/math/expm1_2u5.c b/pl/math/expm1_2u5.c
index c701d7e..98ef078 100644
--- a/pl/math/expm1_2u5.c
+++ b/pl/math/expm1_2u5.c
@@ -6,6 +6,7 @@
*/
#include "math_config.h"
+#include "estrin.h"
#define InvLn2 0x1.71547652b82fep0
#define Ln2hi 0x1.62e42fefa39efp-1
@@ -19,25 +20,6 @@
#define C(i) __expm1_poly[i]
-static inline double
-eval_poly (double f, double f2)
-{
- /* Evaluate custom polynomial using Estrin scheme. */
- double p_01 = fma (f, C (1), C (0));
- double p_23 = fma (f, C (3), C (2));
- double p_45 = fma (f, C (5), C (4));
- double p_67 = fma (f, C (7), C (6));
- double p_89 = fma (f, C (9), C (8));
-
- double p_03 = fma (f2, p_23, p_01);
- double p_47 = fma (f2, p_67, p_45);
- double p_8a = fma (f2, C (10), p_89);
-
- double f4 = f2 * f2;
- double p_07 = fma (f4, p_47, p_03);
- return fma (f4 * f4, p_8a, p_07);
-}
-
/* Approximation for exp(x) - 1 using polynomial on a reduced interval.
The maximum error observed error is 2.17 ULP:
expm1(0x1.63f90a866748dp-2) got 0x1.a9af56603878ap-2
@@ -80,7 +62,8 @@ expm1 (double x)
So we calculate the polynomial P(f) = a + bf + cf^2 + ...
and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
double f2 = f * f;
- double p = fma (f2, eval_poly (f, f2), f);
+ double f4 = f2 * f2;
+ double p = fma (f2, ESTRIN_10 (f, f2, f4, f4 * f4, C), f);
/* Assemble the result, using a slight rearrangement to achieve acceptable
accuracy.
diff --git a/pl/math/expm1f_1u6.c b/pl/math/expm1f_1u6.c
index 44981ca..0904652 100644
--- a/pl/math/expm1f_1u6.c
+++ b/pl/math/expm1f_1u6.c
@@ -6,6 +6,7 @@
*/
#include "math_config.h"
+#include "hornerf.h"
#define Shift (0x1.8p23f)
#define InvLn2 (0x1.715476p+0f)
@@ -59,12 +60,7 @@ expm1f (float x)
x + ax^2 + bx^3 + cx^4 ....
So we calculate the polynomial P(f) = a + bf + cf^2 + ...
and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
- float p = fmaf (C (4), f, C (3));
- p = fmaf (p, f, C (2));
- p = fmaf (p, f, C (1));
- p = fmaf (p, f, C (0));
- p = fmaf (f * f, p, f);
-
+ float p = fmaf (f * f, HORNER_4 (f, C), f);
/* Assemble the result, using a slight rearrangement to achieve acceptable
accuracy.
expm1(x) ~= 2^i * (p + 1) - 1
diff --git a/pl/math/horner.h b/pl/math/horner.h
new file mode 100644
index 0000000..4dbc122
--- /dev/null
+++ b/pl/math/horner.h
@@ -0,0 +1,14 @@
+/*
+ * Helper macros for double-precision Horner polynomial evaluation.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#if V_SUPPORTED
+#define FMA v_fma_f64
+#else
+#define FMA fma
+#endif
+
+#include "horner_wrap.h"
diff --git a/pl/math/horner_wrap.h b/pl/math/horner_wrap.h
new file mode 100644
index 0000000..892d63b
--- /dev/null
+++ b/pl/math/horner_wrap.h
@@ -0,0 +1,34 @@
+/*
+ * Helper macros for Horner polynomial evaluation.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+// clang-format off
+#define HORNER_1_(x, c, i) FMA(c(i + 1), x, c(i))
+#define HORNER_2_(x, c, i) FMA(HORNER_1_ (x, c, i + 1), x, c(i))
+#define HORNER_3_(x, c, i) FMA(HORNER_2_ (x, c, i + 1), x, c(i))
+#define HORNER_4_(x, c, i) FMA(HORNER_3_ (x, c, i + 1), x, c(i))
+#define HORNER_5_(x, c, i) FMA(HORNER_4_ (x, c, i + 1), x, c(i))
+#define HORNER_6_(x, c, i) FMA(HORNER_5_ (x, c, i + 1), x, c(i))
+#define HORNER_7_(x, c, i) FMA(HORNER_6_ (x, c, i + 1), x, c(i))
+#define HORNER_8_(x, c, i) FMA(HORNER_7_ (x, c, i + 1), x, c(i))
+#define HORNER_9_(x, c, i) FMA(HORNER_8_ (x, c, i + 1), x, c(i))
+#define HORNER_10_(x, c, i) FMA(HORNER_9_ (x, c, i + 1), x, c(i))
+#define HORNER_11_(x, c, i) FMA(HORNER_10_(x, c, i + 1), x, c(i))
+#define HORNER_12_(x, c, i) FMA(HORNER_11_(x, c, i + 1), x, c(i))
+
+#define HORNER_1(x, c) HORNER_1_ (x, c, 0)
+#define HORNER_2(x, c) HORNER_2_ (x, c, 0)
+#define HORNER_3(x, c) HORNER_3_ (x, c, 0)
+#define HORNER_4(x, c) HORNER_4_ (x, c, 0)
+#define HORNER_5(x, c) HORNER_5_ (x, c, 0)
+#define HORNER_6(x, c) HORNER_6_ (x, c, 0)
+#define HORNER_7(x, c) HORNER_7_ (x, c, 0)
+#define HORNER_8(x, c) HORNER_8_ (x, c, 0)
+#define HORNER_9(x, c) HORNER_9_ (x, c, 0)
+#define HORNER_10(x, c) HORNER_10_(x, c, 0)
+#define HORNER_11(x, c) HORNER_11_(x, c, 0)
+#define HORNER_12(x, c) HORNER_12_(x, c, 0)
+// clang-format on
diff --git a/pl/math/hornerf.h b/pl/math/hornerf.h
new file mode 100644
index 0000000..bec1593
--- /dev/null
+++ b/pl/math/hornerf.h
@@ -0,0 +1,14 @@
+/*
+ * Helper macros for single-precision Horner polynomial evaluation.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#if V_SUPPORTED
+#define FMA v_fma_f32
+#else
+#define FMA fmaf
+#endif
+
+#include "horner_wrap.h"
diff --git a/pl/math/log1p_2u.c b/pl/math/log1p_2u.c
index b9c7e9e..ade5d87 100644
--- a/pl/math/log1p_2u.c
+++ b/pl/math/log1p_2u.c
@@ -5,8 +5,7 @@
*/
#include "math_config.h"
-
-#include "log1p_common.h"
+#include "estrin.h"
#define Ln2Hi 0x1.62e42fefa3800p-1
#define Ln2Lo 0x1.ef35793c76730p-45
@@ -19,6 +18,16 @@
#define Rt2MOne 0x3fda827999fcef32
#define AbsMask 0x7fffffffffffffff
#define ExpM63 0x3c00
+#define C(i) __log1p_data.coeffs[i]
+
+static inline double
+eval_poly (double f)
+{
+ double f2 = f * f;
+ double f4 = f2 * f2;
+ double f8 = f4 * f4;
+ return ESTRIN_18 (f, f2, f4, f8, f8 * f8, C);
+}
/* log1p approximation using polynomial on reduced interval. Largest
observed errors are near the lower boundary of the region where k
diff --git a/pl/math/log1p_common.h b/pl/math/log1p_common.h
deleted file mode 100644
index 24e6f20..0000000
--- a/pl/math/log1p_common.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Double-precision polynomial evaluation function for scalar and vector
- * log1p(x)
- *
- * Copyright (c) 2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#ifndef PL_MATH_LOG1P_COMMON_H
-#define PL_MATH_LOG1P_COMMON_H
-
-#include "math_config.h"
-
-#if V_SUPPORTED
-
-#include "v_math.h"
-
-#define DBL_T v_f64_t
-#define FMA v_fma_f64
-#define C(i) v_f64 (__log1p_data.coeffs[i])
-
-#else
-
-#define DBL_T double
-#define FMA fma
-#define C(i) __log1p_data.coeffs[i]
-
-#endif
-
-static inline DBL_T
-eval_poly (DBL_T f)
-{
- /* Evaluate polynomial using Estrin's method. */
- DBL_T p_01 = FMA (f, C (1), C (0));
- DBL_T p_23 = FMA (f, C (3), C (2));
- DBL_T p_45 = FMA (f, C (5), C (4));
- DBL_T p_67 = FMA (f, C (7), C (6));
- DBL_T p_89 = FMA (f, C (9), C (8));
- DBL_T p_ab = FMA (f, C (11), C (10));
- DBL_T p_cd = FMA (f, C (13), C (12));
- DBL_T p_ef = FMA (f, C (15), C (14));
- DBL_T p_gh = FMA (f, C (17), C (16));
-
- DBL_T f2 = f * f;
- DBL_T p_03 = FMA (f2, p_23, p_01);
- DBL_T p_47 = FMA (f2, p_67, p_45);
- DBL_T p_8b = FMA (f2, p_ab, p_89);
- DBL_T p_cf = FMA (f2, p_ef, p_cd);
- DBL_T p_gi = FMA (f2, C (18), p_gh);
-
- DBL_T f4 = f2 * f2;
- DBL_T p_07 = FMA (f4, p_47, p_03);
- DBL_T p_8f = FMA (f4, p_cf, p_8b);
-
- DBL_T f8 = f4 * f4;
- DBL_T p_0f = FMA (f8, p_8f, p_07);
-
- return FMA (f8 * f8, p_gi, p_0f);
-}
-
-#endif // PL_MATH_LOG1P_COMMON_H
diff --git a/pl/math/log1pf_2u1.c b/pl/math/log1pf_2u1.c
index 5b0d542..9b7cb94 100644
--- a/pl/math/log1pf_2u1.c
+++ b/pl/math/log1pf_2u1.c
@@ -5,6 +5,7 @@
*/
#include "math_config.h"
+#include "hornerf.h"
#define Ln2 (0x1.62e43p-1f)
#define SignMask (0x80000000)
@@ -21,8 +22,8 @@ eval_poly (float m, uint32_t e)
{
#ifdef LOG1PF_2U5
- /* 2.5 ulp variant. Approximate log(1+m) on [-0.25, 0.5] using Estrin
- scheme. */
+ /* 2.5 ulp variant. Approximate log(1+m) on [-0.25, 0.5] using
+ slightly modified Estrin scheme (no x^0 term, and x term is just x). */
float p_12 = fmaf (m, C (1), C (0));
float p_34 = fmaf (m, C (3), C (2));
float p_56 = fmaf (m, C (5), C (4));
@@ -49,15 +50,7 @@ eval_poly (float m, uint32_t e)
x + C1 * x^2 + C2 * x^3 + C3 * x^4 + ...
Hence approximation has the form m + m^2 * P(m)
where P(x) = C1 + C2 * x + C3 * x^2 + ... . */
- float p = fmaf (C (8), m, C (7));
- p = fmaf (p, m, C (6));
- p = fmaf (p, m, C (5));
- p = fmaf (p, m, C (4));
- p = fmaf (p, m, C (3));
- p = fmaf (p, m, C (2));
- p = fmaf (p, m, C (1));
- p = fmaf (p, m, C (0));
- return fmaf (m, m * p, m);
+ return fmaf (m, m * HORNER_8 (m, C), m);
#else
#error No log1pf approximation exists with the requested precision. Options are 13 or 25.
diff --git a/pl/math/pairwise_horner.h b/pl/math/pairwise_horner.h
new file mode 100644
index 0000000..bee7592
--- /dev/null
+++ b/pl/math/pairwise_horner.h
@@ -0,0 +1,14 @@
+/*
+ * Helper macros for double-precision pairwise Horner polynomial evaluation.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#if V_SUPPORTED
+#define FMA v_fma_f64
+#else
+#define FMA fma
+#endif
+
+#include "pairwise_horner_wrap.h"
diff --git a/pl/math/pairwise_horner_wrap.h b/pl/math/pairwise_horner_wrap.h
new file mode 100644
index 0000000..5bc287b
--- /dev/null
+++ b/pl/math/pairwise_horner_wrap.h
@@ -0,0 +1,36 @@
+/*
+ * Helper macros for pairwise Horner polynomial evaluation.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+// clang-format off
+#define PW_HORNER_1_(x, c, i) FMA(x, c(i + 1), c(i))
+#define PW_HORNER_3_(x, x2, c, i) FMA(x2, PW_HORNER_1_(x, c, i + 2), PW_HORNER_1_(x, c, i))
+#define PW_HORNER_5_(x, x2, c, i) FMA(x2, PW_HORNER_3_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+#define PW_HORNER_7_(x, x2, c, i) FMA(x2, PW_HORNER_5_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+#define PW_HORNER_9_(x, x2, c, i) FMA(x2, PW_HORNER_7_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+#define PW_HORNER_11_(x, x2, c, i) FMA(x2, PW_HORNER_9_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+
+#define PAIRWISE_HORNER_1(x, c) PW_HORNER_1_ (x, c, 0)
+#define PAIRWISE_HORNER_3(x, x2, c) PW_HORNER_3_ (x, x2, c, 0)
+#define PAIRWISE_HORNER_5(x, x2, c) PW_HORNER_5_ (x, x2, c, 0)
+#define PAIRWISE_HORNER_7(x, x2, c) PW_HORNER_7_ (x, x2, c, 0)
+#define PAIRWISE_HORNER_9(x, x2, c) PW_HORNER_9_ (x, x2, c, 0)
+#define PAIRWISE_HORNER_11(x, x2, c) PW_HORNER_11_(x, x2, c, 0)
+
+#define PW_HORNER_2_(x, x2, c, i) FMA(x2, c(i + 2), PW_HORNER_1_(x, c, i))
+#define PW_HORNER_4_(x, x2, c, i) FMA(x2, PW_HORNER_2_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+#define PW_HORNER_6_(x, x2, c, i) FMA(x2, PW_HORNER_4_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+#define PW_HORNER_8_(x, x2, c, i) FMA(x2, PW_HORNER_6_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+#define PW_HORNER_10_(x, x2, c, i) FMA(x2, PW_HORNER_8_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+#define PW_HORNER_12_(x, x2, c, i) FMA(x2, PW_HORNER_10_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+
+#define PAIRWISE_HORNER_2(x, x2, c) PW_HORNER_2_ (x, x2, c, 0)
+#define PAIRWISE_HORNER_4(x, x2, c) PW_HORNER_4_ (x, x2, c, 0)
+#define PAIRWISE_HORNER_6(x, x2, c) PW_HORNER_6_ (x, x2, c, 0)
+#define PAIRWISE_HORNER_8(x, x2, c) PW_HORNER_8_(x, x2, c, 0)
+#define PAIRWISE_HORNER_10(x, x2, c) PW_HORNER_10_(x, x2, c, 0)
+#define PAIRWISE_HORNER_12(x, x2, c) PW_HORNER_12_(x, x2, c, 0)
+// clang-format on
diff --git a/pl/math/pairwise_hornerf.h b/pl/math/pairwise_hornerf.h
new file mode 100644
index 0000000..a8aa4d1
--- /dev/null
+++ b/pl/math/pairwise_hornerf.h
@@ -0,0 +1,14 @@
+/*
+ * Helper macros for single-precision pairwise Horner polynomial evaluation.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#if V_SUPPORTED
+#define FMA v_fma_f32
+#else
+#define FMA fmaf
+#endif
+
+#include "pairwise_horner_wrap.h"
diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh
index 14abfc6..bd4c4c2 100755
--- a/pl/math/test/runulp.sh
+++ b/pl/math/test/runulp.sh
@@ -789,7 +789,7 @@ cbrtf __vn_cbrtf $runvn fenv
cbrtf _ZGVnN4v_cbrtf $runvn fenv
asinh __s_asinh $runs fenv
# Test vector asinh 3 times, with control lane < 1, > 1 and special.
-# Ensures the v_sel is choosing the right option in all cases.
+# Ensures the v_sel is choosing the right option in all cases.
asinh __v_asinh $runv fenv -c 0.5
asinh __vn_asinh $runvn fenv -c 0.5
asinh _ZGVnN2v_asinh $runvn fenv -c 0.5
diff --git a/pl/math/v_asinh_2u5.c b/pl/math/v_asinh_2u5.c
index 508ced1..974e6df 100644
--- a/pl/math/v_asinh_2u5.c
+++ b/pl/math/v_asinh_2u5.c
@@ -5,6 +5,7 @@
*/
#include "v_math.h"
+#include "estrin.h"
#if V_SUPPORTED
@@ -74,38 +75,6 @@ log_inline (v_f64_t x)
return y;
}
-static inline v_f64_t
-eval_poly (v_f64_t z)
-{
- /* Custom polynomial, shared with scalar routine, for calculating asinh(x) in
- [2^-26, 1]. Evaluated with Estrin scheme. */
- v_f64_t p_01 = v_fma_f64 (z, C (1), C (0));
- v_f64_t p_23 = v_fma_f64 (z, C (3), C (2));
- v_f64_t p_45 = v_fma_f64 (z, C (5), C (4));
- v_f64_t p_67 = v_fma_f64 (z, C (7), C (6));
- v_f64_t p_89 = v_fma_f64 (z, C (9), C (8));
- v_f64_t p_ab = v_fma_f64 (z, C (11), C (10));
- v_f64_t p_cd = v_fma_f64 (z, C (13), C (12));
- v_f64_t p_ef = v_fma_f64 (z, C (15), C (14));
- v_f64_t p_gh = v_fma_f64 (z, C (17), C (16));
-
- v_f64_t z2 = z * z;
- v_f64_t p_03 = v_fma_f64 (z2, p_23, p_01);
- v_f64_t p_47 = v_fma_f64 (z2, p_67, p_45);
- v_f64_t p_8b = v_fma_f64 (z2, p_ab, p_89);
- v_f64_t p_cf = v_fma_f64 (z2, p_ef, p_cd);
-
- v_f64_t z4 = z2 * z2;
- v_f64_t p_07 = v_fma_f64 (z4, p_47, p_03);
- v_f64_t p_8f = v_fma_f64 (z4, p_cf, p_8b);
-
- v_f64_t z8 = z4 * z4;
- v_f64_t p_0f = v_fma_f64 (z8, p_8f, p_07);
-
- v_f64_t z16 = z8 * z8;
- return v_fma_f64 (z16, p_gh, p_0f);
-}
-
/* Double-precision implementation of vector asinh(x).
asinh is very sensitive around 1, so it is impractical to devise a single
low-cost algorithm which is sufficiently accurate on a wide range of input.
@@ -162,7 +131,10 @@ VPCS_ATTR v_f64_t V_NAME (asinh) (v_f64_t x)
ax = v_sel_f64 (tiny | gt1, v_f64 (0), ax);
#endif
v_f64_t x2 = ax * ax;
- v_f64_t p = eval_poly (x2);
+ v_f64_t z2 = x2 * x2;
+ v_f64_t z4 = z2 * z2;
+ v_f64_t z8 = z4 * z4;
+ v_f64_t p = ESTRIN_17 (x2, z2, z4, z8, z8 * z8, C);
option_2 = v_fma_f64 (p, x2 * ax, ax);
#if WANT_ERRNO
option_2 = v_sel_f64 (tiny, x, option_2);
diff --git a/pl/math/v_erfc_4u.c b/pl/math/v_erfc_4u.c
index 603d8f5..80e11e7 100644
--- a/pl/math/v_erfc_4u.c
+++ b/pl/math/v_erfc_4u.c
@@ -7,6 +7,7 @@
#include "math_config.h"
#include "v_math.h"
+#include "horner.h"
#if V_SUPPORTED
/* Accurate exponential (vector variant of exp_dd). */
@@ -56,28 +57,6 @@ lookup (v_u64_t i)
return e;
}
-/* Evaluate order-12 polynomials using pairwise summation and Horner
- scheme. */
-static inline v_f64_t
-v_eval_poly (v_f64_t z, struct entry e)
-{
- v_f64_t r = e.P[12];
- r = v_fma_f64 (z, r, e.P[11]);
- r = v_fma_f64 (z, r, e.P[10]);
- r = v_fma_f64 (z, r, e.P[9]);
- r = v_fma_f64 (z, r, e.P[8]);
- r = v_fma_f64 (z, r, e.P[7]);
- r = v_fma_f64 (z, r, e.P[6]);
- r = v_fma_f64 (z, r, e.P[5]);
- r = v_fma_f64 (z, r, e.P[4]);
- r = v_fma_f64 (z, r, e.P[3]);
- r = v_fma_f64 (z, r, e.P[2]);
- r = v_fma_f64 (z, r, e.P[1]);
- r = v_fma_f64 (z, r, e.P[0]);
-
- return r;
-}
-
/* Accurate evaluation of exp(x^2) using compensated product
(x^2 ~ x*x + e2) and custom exp(y+d) routine for small
corrections d<<y. */
@@ -154,7 +133,8 @@ v_f64_t V_NAME (erfc) (v_f64_t x)
/* Evaluate Polynomial: P(|x|-x_i). */
z = a - dat.xi;
- p = v_eval_poly (z, dat);
+#define C(i) dat.P[i]
+ p = HORNER_12 (z, C);
/* Evaluate Gaussian: exp(-x^2). */
v_f64_t e = v_eval_gauss (a);
diff --git a/pl/math/v_erfcf_1u.c b/pl/math/v_erfcf_1u.c
index fc9571d..d9c65a5 100644
--- a/pl/math/v_erfcf_1u.c
+++ b/pl/math/v_erfcf_1u.c
@@ -7,6 +7,7 @@
#include "v_math.h"
#include "erfcf.h"
+#include "estrin.h"
#if V_SUPPORTED
@@ -41,39 +42,12 @@ interval_index (uint32_t ia12)
#endif
static inline v_f64_t
-v_eval_poly_estrin (v_f64_t z, const double *coeff1, const double *coeff2)
-{
- v_f64_t z2 = z * z;
- v_f64_t z4 = z2 * z2;
- v_f64_t z8 = z4 * z4;
-
- v_f64_t c0_zc1 = v_fma_f64 (z, C (1), C (0));
- v_f64_t c2_zc3 = v_fma_f64 (z, C (3), C (2));
- v_f64_t c4_zc5 = v_fma_f64 (z, C (5), C (4));
- v_f64_t c6_zc7 = v_fma_f64 (z, C (7), C (6));
- v_f64_t c8_zc9 = v_fma_f64 (z, C (9), C (8));
- v_f64_t c10_zc11 = v_fma_f64 (z, C (11), C (10));
- v_f64_t c12_zc13 = v_fma_f64 (z, C (13), C (12));
- v_f64_t c14_zc15 = v_fma_f64 (z, C (15), C (14));
-
- v_f64_t c0_z2c3 = v_fma_f64 (z2, c2_zc3, c0_zc1);
- v_f64_t c4_z2c7 = v_fma_f64 (z2, c6_zc7, c4_zc5);
- v_f64_t c8_z2c11 = v_fma_f64 (z2, c10_zc11, c8_zc9);
- v_f64_t c12_z2c15 = v_fma_f64 (z2, c14_zc15, c12_zc13);
-
- v_f64_t c0_z4c7 = v_fma_f64 (z4, c4_z2c7, c0_z2c3);
- v_f64_t c8_z4c15 = v_fma_f64 (z4, c12_z2c15, c8_z2c11);
-
- return v_fma_f64 (z8, c8_z4c15, c0_z4c7);
-}
-
-#undef C
-
-static inline v_f64_t
v_approx_erfcf_poly_gauss (v_f64_t x, const double *coeff1,
const double *coeff2)
{
- v_f64_t poly = v_eval_poly_estrin (x, coeff1, coeff2);
+ v_f64_t x2 = x * x;
+ v_f64_t x4 = x2 * x2;
+ v_f64_t poly = ESTRIN_15 (x, x2, x4, x4 * x4, C);
v_f64_t gauss = V_NAME (exp_tail) (-(x * x), v_f64 (0.0));
return poly * gauss;
}
@@ -81,7 +55,7 @@ v_approx_erfcf_poly_gauss (v_f64_t x, const double *coeff1,
static inline float
approx_poly_gauss (float abs_x, const double *coeff)
{
- return (float) (eval_poly_horner_lvl2 (abs_x, coeff) * eval_exp_mx2 (abs_x));
+ return (float) (eval_poly (abs_x, coeff) * eval_exp_mx2 (abs_x));
}
static v_f32_t
diff --git a/pl/math/v_log1p_2u5.c b/pl/math/v_log1p_2u5.c
index 51d0d51..d97a622 100644
--- a/pl/math/v_log1p_2u5.c
+++ b/pl/math/v_log1p_2u5.c
@@ -7,7 +7,7 @@
#include "v_math.h"
#if V_SUPPORTED
-#include "log1p_common.h"
+#include "estrin.h"
#define Ln2Hi v_f64 (0x1.62e42fefa3800p-1)
#define Ln2Lo v_f64 (0x1.ef35793c76730p-45)
@@ -18,6 +18,16 @@
#define OneTop12 0x3ff
#define BottomMask 0xffffffff
#define AbsMask 0x7fffffffffffffff
+#define C(i) v_f64 (__log1p_data.coeffs[i])
+
+static inline v_f64_t
+eval_poly (v_f64_t f)
+{
+ v_f64_t f2 = f * f;
+ v_f64_t f4 = f2 * f2;
+ v_f64_t f8 = f4 * f4;
+ return ESTRIN_18 (f, f2, f4, f8, f8 * f8, C);
+}
VPCS_ATTR
NOINLINE static v_f64_t
diff --git a/pl/math/v_tanf_3u2.c b/pl/math/v_tanf_3u2.c
index a6d9dd1..01f7f65 100644
--- a/pl/math/v_tanf_3u2.c
+++ b/pl/math/v_tanf_3u2.c
@@ -6,6 +6,8 @@
*/
#include "v_math.h"
+#include "estrinf.h"
+
#if V_SUPPORTED
/* Constants. */
@@ -32,13 +34,7 @@ eval_poly (v_f32_t z)
{
v_f32_t z2 = z * z;
v_f32_t z4 = z2 * z2;
- v_f32_t y_10 = v_fma_f32 (z, poly (1), poly (0));
- v_f32_t y_32 = v_fma_f32 (z, poly (3), poly (2));
- v_f32_t y_54 = v_fma_f32 (z, poly (5), poly (4));
- v_f32_t y_6_54 = v_fma_f32 (z2, poly (6), y_54);
- v_f32_t y_32_10 = v_fma_f32 (z2, y_32, y_10);
- v_f32_t y = v_fma_f32 (z4, y_6_54, y_32_10);
- return y;
+ return ESTRIN_6 (z, z2, z4, poly);
}
/* Fast implementation of Neon tanf.
diff --git a/pl/math/v_tanhf_2u6.c b/pl/math/v_tanhf_2u6.c
index 571fd8b..67e4520 100644
--- a/pl/math/v_tanhf_2u6.c
+++ b/pl/math/v_tanhf_2u6.c
@@ -6,6 +6,7 @@
#include "v_math.h"
#include "mathlib.h"
+#include "estrinf.h"
#if V_SUPPORTED
@@ -39,10 +40,7 @@ expm1f_inline (v_f32_t x)
/* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).
Uses Estrin scheme, where the main __v_expm1f routine uses Horner. */
v_f32_t f2 = f * f;
- v_f32_t p_01 = v_fma_f32 (f, C (1), C (0));
- v_f32_t p_23 = v_fma_f32 (f, C (3), C (2));
- v_f32_t p = v_fma_f32 (f2, p_23, p_01);
- p = v_fma_f32 (f2 * f2, C (4), p);
+ v_f32_t p = ESTRIN_4 (f, f2, f2 * f2, C);
p = v_fma_f32 (f2, p, f);
/* t = 2^i. */