diff options
Diffstat (limited to 'pl/math')
-rw-r--r-- | pl/math/asinh_2u5.c | 39 | ||||
-rw-r--r-- | pl/math/asinhf_3u5.c | 13 | ||||
-rw-r--r-- | pl/math/atan_common.h | 34 | ||||
-rw-r--r-- | pl/math/atanf_common.h | 8 | ||||
-rw-r--r-- | pl/math/cbrtf_1u5.c | 6 | ||||
-rw-r--r-- | pl/math/erfc_4u5.c | 23 | ||||
-rw-r--r-- | pl/math/erfcf.h | 38 | ||||
-rw-r--r-- | pl/math/erfcf_2u.c | 4 | ||||
-rw-r--r-- | pl/math/erff_1u5.c | 22 | ||||
-rw-r--r-- | pl/math/estrin.h | 16 | ||||
-rw-r--r-- | pl/math/estrin_wrap.h | 48 | ||||
-rw-r--r-- | pl/math/estrinf.h | 14 | ||||
-rw-r--r-- | pl/math/expm1_2u5.c | 23 | ||||
-rw-r--r-- | pl/math/expm1f_1u6.c | 8 | ||||
-rw-r--r-- | pl/math/horner.h | 14 | ||||
-rw-r--r-- | pl/math/horner_wrap.h | 34 | ||||
-rw-r--r-- | pl/math/hornerf.h | 14 | ||||
-rw-r--r-- | pl/math/log1p_2u.c | 13 | ||||
-rw-r--r-- | pl/math/log1p_common.h | 61 | ||||
-rw-r--r-- | pl/math/log1pf_2u1.c | 15 | ||||
-rw-r--r-- | pl/math/pairwise_horner.h | 14 | ||||
-rw-r--r-- | pl/math/pairwise_horner_wrap.h | 36 | ||||
-rw-r--r-- | pl/math/pairwise_hornerf.h | 14 | ||||
-rwxr-xr-x | pl/math/test/runulp.sh | 2 | ||||
-rw-r--r-- | pl/math/v_asinh_2u5.c | 38 | ||||
-rw-r--r-- | pl/math/v_erfc_4u.c | 26 | ||||
-rw-r--r-- | pl/math/v_erfcf_1u.c | 36 | ||||
-rw-r--r-- | pl/math/v_log1p_2u5.c | 12 | ||||
-rw-r--r-- | pl/math/v_tanf_3u2.c | 10 | ||||
-rw-r--r-- | pl/math/v_tanhf_2u6.c | 6 |
30 files changed, 299 insertions, 342 deletions
diff --git a/pl/math/asinh_2u5.c b/pl/math/asinh_2u5.c index f22b342..9cbdd33 100644 --- a/pl/math/asinh_2u5.c +++ b/pl/math/asinh_2u5.c @@ -5,48 +5,17 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" +#include "estrin.h" #define AbsMask 0x7fffffffffffffff #define ExpM26 0x3e50000000000000 /* asuint64(0x1.0p-26). */ #define One 0x3ff0000000000000 /* asuint64(1.0). */ #define Exp511 0x5fe0000000000000 /* asuint64(0x1.0p511). */ #define Ln2 0x1.62e42fefa39efp-1 -#define C(i) __asinh_data.poly[i] double optr_aor_log_f64 (double); -static inline double -eval_poly (double z) -{ - /* Evaluate polynomial using Estrin scheme. */ - double p_01 = fma (z, C (1), C (0)); - double p_23 = fma (z, C (3), C (2)); - double p_45 = fma (z, C (5), C (4)); - double p_67 = fma (z, C (7), C (6)); - double p_89 = fma (z, C (9), C (8)); - double p_ab = fma (z, C (11), C (10)); - double p_cd = fma (z, C (13), C (12)); - double p_ef = fma (z, C (15), C (14)); - double p_gh = fma (z, C (17), C (16)); - - double z2 = z * z; - double p_03 = fma (z2, p_23, p_01); - double p_47 = fma (z2, p_67, p_45); - double p_8b = fma (z2, p_ab, p_89); - double p_cf = fma (z2, p_ef, p_cd); - - double z4 = z2 * z2; - double p_07 = fma (z4, p_47, p_03); - double p_8f = fma (z4, p_cf, p_8b); - - double z8 = z4 * z4; - double p_0f = fma (z8, p_8f, p_07); - - double z16 = z8 * z8; - return fma (z16, p_gh, p_0f); -} - /* Scalar double-precision asinh implementation. This routine uses different approaches on different intervals: @@ -86,7 +55,11 @@ asinh (double x) if (ia < One) { double x2 = x * x; - double p = eval_poly (x2); + double z2 = x2 * x2; + double z4 = z2 * z2; + double z8 = z4 * z4; +#define C(i) __asinh_data.poly[i] + double p = ESTRIN_17 (x2, z2, z4, z8, z8 * z8, C); double y = fma (p, x2 * ax, ax); return asdouble (asuint64 (y) | sign); } diff --git a/pl/math/asinhf_3u5.c b/pl/math/asinhf_3u5.c index 8aa62ad..48acdef 100644 --- a/pl/math/asinhf_3u5.c +++ b/pl/math/asinhf_3u5.c @@ -5,6 +5,7 @@ */ #include "math_config.h" +#include "estrinf.h" #define AbsMask (0x7fffffff) #define SqrtFltMax (0x1.749e96p+10f) @@ -53,17 +54,7 @@ asinhf (float x) if (ia12 < One) { float x2 = ax * ax; - float x4 = x2 * x2; - - float p_01 = fmaf (ax, C (1), C (0)); - float p_23 = fmaf (ax, C (3), C (2)); - float p_45 = fmaf (ax, C (5), C (4)); - float p_67 = fmaf (ax, C (7), C (6)); - - float p_03 = fmaf (x2, p_23, p_01); - float p_47 = fmaf (x2, p_67, p_45); - - float p = fmaf (x4, p_47, p_03); + float p = ESTRIN_7 (ax, x2, x2 * x2, C); float y = fmaf (x2, p, ax); return asfloat (asuint (y) | sign); } diff --git a/pl/math/atan_common.h b/pl/math/atan_common.h index 1690e7e..331c1bb 100644 --- a/pl/math/atan_common.h +++ b/pl/math/atan_common.h @@ -7,19 +7,18 @@ */ #include "math_config.h" +#include "estrin.h" #if V_SUPPORTED #include "v_math.h" #define DBL_T v_f64_t -#define FMA v_fma_f64 #define P(i) v_f64 (__atan_poly_data.poly[i]) #else #define DBL_T double -#define FMA fma #define P(i) __atan_poly_data.poly[i] #endif @@ -29,37 +28,14 @@ static inline DBL_T eval_poly (DBL_T z, DBL_T az, DBL_T shift) { - /* Use full Estrin scheme for P(z^2) with deg(P)=19. */ + /* Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of + full scheme to avoid underflow in x^16. */ DBL_T z2 = z * z; - /* Level 1. */ - DBL_T P_1_0 = FMA (P (1), z2, P (0)); - DBL_T P_3_2 = FMA (P (3), z2, P (2)); - DBL_T P_5_4 = FMA (P (5), z2, P (4)); - DBL_T P_7_6 = FMA (P (7), z2, P (6)); - DBL_T P_9_8 = FMA (P (9), z2, P (8)); - DBL_T P_11_10 = FMA (P (11), z2, P (10)); - DBL_T P_13_12 = FMA (P (13), z2, P (12)); - DBL_T P_15_14 = FMA (P (15), z2, P (14)); - DBL_T P_17_16 = FMA (P (17), z2, P (16)); - DBL_T P_19_18 = FMA (P (19), z2, P (18)); - - /* Level 2. */ DBL_T x2 = z2 * z2; - DBL_T P_3_0 = FMA (P_3_2, x2, P_1_0); - DBL_T P_7_4 = FMA (P_7_6, x2, P_5_4); - DBL_T P_11_8 = FMA (P_11_10, x2, P_9_8); - DBL_T P_15_12 = FMA (P_15_14, x2, P_13_12); - DBL_T P_19_16 = FMA (P_19_18, x2, P_17_16); - - /* Level 3. */ DBL_T x4 = x2 * x2; - DBL_T P_7_0 = FMA (P_7_4, x4, P_3_0); - DBL_T P_15_8 = FMA (P_15_12, x4, P_11_8); - - /* Level 4. */ DBL_T x8 = x4 * x4; - DBL_T y = FMA (P_19_16, x8, P_15_8); - y = FMA (y, x8, P_7_0); + DBL_T y + = FMA (ESTRIN_11_ (z2, x2, x4, x8, P, 8), x8, ESTRIN_7 (z2, x2, x4, P)); /* Finalize. y = shift + z + z^3 * P(z^2). */ y = FMA (y, z2 * az, az); diff --git a/pl/math/atanf_common.h b/pl/math/atanf_common.h index 436b88b..3038e54 100644 --- a/pl/math/atanf_common.h +++ b/pl/math/atanf_common.h @@ -10,19 +10,18 @@ #define PL_MATH_ATANF_COMMON_H #include "math_config.h" +#include "estrinf.h" #if V_SUPPORTED #include "v_math.h" #define FLT_T v_f32_t -#define FMA v_fma_f32 #define P(i) v_f32 (__atanf_poly_data.poly[i]) #else #define FLT_T float -#define FMA fmaf #define P(i) __atanf_poly_data.poly[i] #endif @@ -42,10 +41,7 @@ eval_poly (FLT_T z, FLT_T az, FLT_T shift) FLT_T z4 = z2 * z2; /* Then assemble polynomial. */ - FLT_T y - = FMA (z4, - z4 * FMA (z4, (FMA (z2, P (7), P (6))), (FMA (z2, P (5), P (4)))), - FMA (z4, (FMA (z2, P (3), P (2))), (FMA (z2, P (1), P (0))))); + FLT_T y = FMA (z4, z4 * ESTRIN_3_ (z2, z4, P, 4), ESTRIN_3 (z2, z4, P)); /* Finalize: y = shift + z * P(z^2). */ diff --git a/pl/math/cbrtf_1u5.c b/pl/math/cbrtf_1u5.c index 73b9049..d544a68 100644 --- a/pl/math/cbrtf_1u5.c +++ b/pl/math/cbrtf_1u5.c @@ -8,6 +8,7 @@ #include <math.h> #include "math_config.h" +#include "estrinf.h" #define AbsMask 0x7fffffff #define SignMask 0x80000000 @@ -40,10 +41,7 @@ cbrtf (float x) /* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is, the less accurate the next stage of the algorithm needs to be. An order-4 polynomial is enough for one Newton iteration. */ - float p_01 = fmaf (C (1), m, C (0)); - float p_23 = fmaf (C (3), m, C (2)); - float p = fmaf (m * m, p_23, p_01); - + float p = ESTRIN_3 (m, m * m, C); /* One iteration of Newton's method for iteratively approximating cbrt. */ float m_by_3 = m / 3; float a = fmaf (TwoThirds, p, m_by_3 / (p * p)); diff --git a/pl/math/erfc_4u5.c b/pl/math/erfc_4u5.c index 003a0cc..8088562 100644 --- a/pl/math/erfc_4u5.c +++ b/pl/math/erfc_4u5.c @@ -9,6 +9,7 @@ #include <math.h> #include <errno.h> #include "math_config.h" +#include "pairwise_horner.h" #define AbsMask (0x7fffffffffffffff) @@ -19,28 +20,12 @@ double __exp_dd (double x, double xtail); -/* Evaluate order-12 polynomials using - pairwise summation and Horner scheme - in double precision. */ static inline double eval_poly_horner (double z, int i) { - double r1, r2, r3, r4, r5, r6, z2; - r1 = fma (z, PX[i][1], PX[i][0]); - r2 = fma (z, PX[i][3], PX[i][2]); - r3 = fma (z, PX[i][5], PX[i][4]); - r4 = fma (z, PX[i][7], PX[i][6]); - r5 = fma (z, PX[i][9], PX[i][8]); - r6 = fma (z, PX[i][11], PX[i][10]); - z2 = z * z; - double r = PX[i][12]; - r = fma (z2, r, r6); - r = fma (z2, r, r5); - r = fma (z2, r, r4); - r = fma (z2, r, r3); - r = fma (z2, r, r2); - r = fma (z2, r, r1); - return r; + double z2 = z * z; +#define C(j) PX[i][j] + return PAIRWISE_HORNER_12 (z, z2, C); } /* Accurate evaluation of exp(x^2) diff --git a/pl/math/erfcf.h b/pl/math/erfcf.h index 6adc6b4..98ead38 100644 --- a/pl/math/erfcf.h +++ b/pl/math/erfcf.h @@ -8,39 +8,24 @@ #ifndef PL_MATH_ERFCF_H #define PL_MATH_ERFCF_H -#include <math.h> +#include "math_config.h" + +#define FMA fma +#include "estrin_wrap.h" /* Accurate exponential from optimized-routines. */ double __exp_dd (double x, double xtail); -/* Evaluate order-12 polynomials using pairwise summation and Horner scheme in - double precision. */ static inline double -eval_poly_horner_lvl2 (double z, const double *coeff) +eval_poly (double z, const double *coeff) { - double r1, r2, r3, r4, r5, r6, r7, r8; - double R1, R2, R3, R4; - double Q1, Q2; - double z2, z4, z8; - z2 = z * z; - r1 = fma (z, coeff[1], coeff[0]); - r2 = fma (z, coeff[3], coeff[2]); - z4 = z2 * z2; - z8 = z4 * z4; - R1 = fma (z2, r2, r1); - r3 = fma (z, coeff[5], coeff[4]); - r4 = fma (z, coeff[7], coeff[6]); - R2 = fma (z2, r4, r3); - Q1 = fma (z4, R2, R1); - r5 = fma (z, coeff[9], coeff[8]); - r6 = fma (z, coeff[11], coeff[10]); - R3 = fma (z2, r6, r5); - r7 = fma (z, coeff[13], coeff[12]); - r8 = fma (z, coeff[15], coeff[14]); - R4 = fma (z2, r8, r7); - Q2 = fma (z4, R4, R3); - return fma (z8, Q2, Q1); + double z2 = z * z; + double z4 = z2 * z2; + double z8 = z4 * z4; +#define C(i) coeff[i] + return ESTRIN_15 (z, z2, z4, z8, C); +#undef C } static inline double @@ -49,4 +34,5 @@ eval_exp_mx2 (double x) return __exp_dd (-(x * x), 0.0); } +#undef FMA #endif // PL_MATH_ERFCF_H diff --git a/pl/math/erfcf_2u.c b/pl/math/erfcf_2u.c index 80dba83..8d4bba1 100644 --- a/pl/math/erfcf_2u.c +++ b/pl/math/erfcf_2u.c @@ -21,7 +21,7 @@ approx_erfcf_hi (float x, uint32_t sign, const double *coeff) /* Polynomial contribution. */ double z = (double) fabs (x); - float p = (float) eval_poly_horner_lvl2 (z, coeff); + float p = (float) eval_poly (z, coeff); /* Gaussian contribution. */ float e_mx2 = (float) eval_exp_mx2 (z); @@ -34,7 +34,7 @@ approx_erfcf_lo (float x, uint32_t sign, const double *coeff) { /* Polynomial contribution. */ double z = (double) fabs (x); - float p = (float) eval_poly_horner_lvl2 (z, coeff); + float p = (float) eval_poly (z, coeff); /* Gaussian contribution. */ float e_mx2 = (float) eval_exp_mx2 (z); diff --git a/pl/math/erff_1u5.c b/pl/math/erff_1u5.c index 1073603..bad68a6 100644 --- a/pl/math/erff_1u5.c +++ b/pl/math/erff_1u5.c @@ -7,7 +7,10 @@ #include <stdint.h> #include <math.h> + #include "math_config.h" +#include "hornerf.h" +#include "estrinf.h" #define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f #define A __erff_data.erff_poly_A @@ -26,7 +29,7 @@ top12 (float x) float erff (float x) { - float r, x2, u; + float r, x2; /* Get top word. */ uint32_t ix = asuint (x); @@ -54,23 +57,18 @@ erff (float x) /* Normalized cases (|x| < 0.921875) - Use Horner scheme for x+x*P(x^2). */ - r = A[5]; - r = fmaf (r, x2, A[4]); - r = fmaf (r, x2, A[3]); - r = fmaf (r, x2, A[2]); - r = fmaf (r, x2, A[1]); - r = fmaf (r, x2, A[0]); - r = fmaf (r, x, x); +#define C(i) A[i] + r = fmaf (HORNER_5 (x2, C), x, x); +#undef C } else if (ia12 < 0x408) { /* |x| < 4.0 - Use a custom Estrin scheme. */ float a = fabsf (x); /* Use Estrin scheme on high order (small magnitude) coefficients. */ - r = fmaf (B[6], a, B[5]); - u = fmaf (B[4], a, B[3]); - x2 = x * x; - r = fmaf (r, x2, u); +#define C(i) B[i] + r = ESTRIN_3_ (a, x * x, C, 3); +#undef C /* Then switch to pure Horner scheme. */ r = fmaf (r, a, B[2]); r = fmaf (r, a, B[1]); diff --git a/pl/math/estrin.h b/pl/math/estrin.h new file mode 100644 index 0000000..89df329 --- /dev/null +++ b/pl/math/estrin.h @@ -0,0 +1,16 @@ +/* + * Helper macros for double-precision Estrin polynomial evaluation. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#if V_SUPPORTED +#define FMA v_fma_f64 +#else +#define FMA fma +#endif + +#include "estrin_wrap.h" diff --git a/pl/math/estrin_wrap.h b/pl/math/estrin_wrap.h new file mode 100644 index 0000000..93af2ab --- /dev/null +++ b/pl/math/estrin_wrap.h @@ -0,0 +1,48 @@ +/* + * Helper macros for double-precision Estrin polynomial evaluation. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +// clang-format off +#define ESTRIN_1_(x, c, i) FMA(x, c(1 + i), c(i)) +#define ESTRIN_2_(x, x2, c, i) FMA(x2, c(2 + i), ESTRIN_1_(x, c, i)) +#define ESTRIN_3_(x, x2, c, i) FMA(x2, ESTRIN_1_(x, c, 2 + i), ESTRIN_1_(x, c, i)) +#define ESTRIN_4_(x, x2, x4, c, i) FMA(x4, c(4 + i), ESTRIN_3_(x, x2, c, i)) +#define ESTRIN_5_(x, x2, x4, c, i) FMA(x4, ESTRIN_1_(x, c, 4 + i), ESTRIN_3_(x, x2, c, i)) +#define ESTRIN_6_(x, x2, x4, c, i) FMA(x4, ESTRIN_2_(x, x2, c, 4 + i), ESTRIN_3_(x, x2, c, i)) +#define ESTRIN_7_(x, x2, x4, c, i) FMA(x4, ESTRIN_3_(x, x2, c, 4 + i), ESTRIN_3_(x, x2, c, i)) +#define ESTRIN_8_(x, x2, x4, x8, c, i) FMA(x8, c(8 + i), ESTRIN_7_(x, x2, x4, c, i)) +#define ESTRIN_9_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_1_(x, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) +#define ESTRIN_10_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_2_(x, x2, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) +#define ESTRIN_11_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_3_(x, x2, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) +#define ESTRIN_12_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_4_(x, x2, x4, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) +#define ESTRIN_13_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_5_(x, x2, x4, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) +#define ESTRIN_14_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_6_(x, x2, x4, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) +#define ESTRIN_15_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_7_(x, x2, x4, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) +#define ESTRIN_16_(x, x2, x4, x8, x16, c, i) FMA(x16, c(16 + i), ESTRIN_15_(x, x2, x4, x8, c, i)) +#define ESTRIN_17_(x, x2, x4, x8, x16, c, i) FMA(x16, ESTRIN_1_(x, c, 16 + i), ESTRIN_15_(x, x2, x4, x8, c, i)) +#define ESTRIN_18_(x, x2, x4, x8, x16, c, i) FMA(x16, ESTRIN_2_(x, x2, c, 16 + i), ESTRIN_15_(x, x2, x4, x8, c, i)) +#define ESTRIN_19_(x, x2, x4, x8, x16, c, i) FMA(x16, ESTRIN_3_(x, x2, c, 16 + i), ESTRIN_15_(x, x2, x4, x8, c, i)) + +#define ESTRIN_1(x, c) ESTRIN_1_(x, c, 0) +#define ESTRIN_2(x, x2, c) ESTRIN_2_(x, x2, c, 0) +#define ESTRIN_3(x, x2, c) ESTRIN_3_(x, x2, c, 0) +#define ESTRIN_4(x, x2, x4, c) ESTRIN_4_(x, x2, x4, c, 0) +#define ESTRIN_5(x, x2, x4, c) ESTRIN_5_(x, x2, x4, c, 0) +#define ESTRIN_6(x, x2, x4, c) ESTRIN_6_(x, x2, x4, c, 0) +#define ESTRIN_7(x, x2, x4, c) ESTRIN_7_(x, x2, x4, c, 0) +#define ESTRIN_8(x, x2, x4, x8, c) ESTRIN_8_(x, x2, x4, x8, c, 0) +#define ESTRIN_9(x, x2, x4, x8, c) ESTRIN_9_(x, x2, x4, x8, c, 0) +#define ESTRIN_10(x, x2, x4, x8, c) ESTRIN_10_(x, x2, x4, x8, c, 0) +#define ESTRIN_11(x, x2, x4, x8, c) ESTRIN_11_(x, x2, x4, x8, c, 0) +#define ESTRIN_12(x, x2, x4, x8, c) ESTRIN_12_(x, x2, x4, x8, c, 0) +#define ESTRIN_13(x, x2, x4, x8, c) ESTRIN_13_(x, x2, x4, x8, c, 0) +#define ESTRIN_14(x, x2, x4, x8, c) ESTRIN_14_(x, x2, x4, x8, c, 0) +#define ESTRIN_15(x, x2, x4, x8, c) ESTRIN_15_(x, x2, x4, x8, c, 0) +#define ESTRIN_16(x, x2, x4, x8, x16, c) ESTRIN_16_(x, x2, x4, x8, x16, c, 0) +#define ESTRIN_17(x, x2, x4, x8, x16, c) ESTRIN_17_(x, x2, x4, x8, x16, c, 0) +#define ESTRIN_18(x, x2, x4, x8, x16, c) ESTRIN_18_(x, x2, x4, x8, x16, c, 0) +#define ESTRIN_19(x, x2, x4, x8, x16, c) ESTRIN_19_(x, x2, x4, x8, x16, c, 0) +// clang-format on diff --git a/pl/math/estrinf.h b/pl/math/estrinf.h new file mode 100644 index 0000000..be52ab5 --- /dev/null +++ b/pl/math/estrinf.h @@ -0,0 +1,14 @@ +/* + * Helper macros for single-precision Estrin polynomial evaluation. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#if V_SUPPORTED +#define FMA v_fma_f32 +#else +#define FMA fmaf +#endif + +#include "estrin_wrap.h" diff --git a/pl/math/expm1_2u5.c b/pl/math/expm1_2u5.c index c701d7e..98ef078 100644 --- a/pl/math/expm1_2u5.c +++ b/pl/math/expm1_2u5.c @@ -6,6 +6,7 @@ */ #include "math_config.h" +#include "estrin.h" #define InvLn2 0x1.71547652b82fep0 #define Ln2hi 0x1.62e42fefa39efp-1 @@ -19,25 +20,6 @@ #define C(i) __expm1_poly[i] -static inline double -eval_poly (double f, double f2) -{ - /* Evaluate custom polynomial using Estrin scheme. */ - double p_01 = fma (f, C (1), C (0)); - double p_23 = fma (f, C (3), C (2)); - double p_45 = fma (f, C (5), C (4)); - double p_67 = fma (f, C (7), C (6)); - double p_89 = fma (f, C (9), C (8)); - - double p_03 = fma (f2, p_23, p_01); - double p_47 = fma (f2, p_67, p_45); - double p_8a = fma (f2, C (10), p_89); - - double f4 = f2 * f2; - double p_07 = fma (f4, p_47, p_03); - return fma (f4 * f4, p_8a, p_07); -} - /* Approximation for exp(x) - 1 using polynomial on a reduced interval. The maximum error observed error is 2.17 ULP: expm1(0x1.63f90a866748dp-2) got 0x1.a9af56603878ap-2 @@ -80,7 +62,8 @@ expm1 (double x) So we calculate the polynomial P(f) = a + bf + cf^2 + ... and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ double f2 = f * f; - double p = fma (f2, eval_poly (f, f2), f); + double f4 = f2 * f2; + double p = fma (f2, ESTRIN_10 (f, f2, f4, f4 * f4, C), f); /* Assemble the result, using a slight rearrangement to achieve acceptable accuracy. diff --git a/pl/math/expm1f_1u6.c b/pl/math/expm1f_1u6.c index 44981ca..0904652 100644 --- a/pl/math/expm1f_1u6.c +++ b/pl/math/expm1f_1u6.c @@ -6,6 +6,7 @@ */ #include "math_config.h" +#include "hornerf.h" #define Shift (0x1.8p23f) #define InvLn2 (0x1.715476p+0f) @@ -59,12 +60,7 @@ expm1f (float x) x + ax^2 + bx^3 + cx^4 .... So we calculate the polynomial P(f) = a + bf + cf^2 + ... and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ - float p = fmaf (C (4), f, C (3)); - p = fmaf (p, f, C (2)); - p = fmaf (p, f, C (1)); - p = fmaf (p, f, C (0)); - p = fmaf (f * f, p, f); - + float p = fmaf (f * f, HORNER_4 (f, C), f); /* Assemble the result, using a slight rearrangement to achieve acceptable accuracy. expm1(x) ~= 2^i * (p + 1) - 1 diff --git a/pl/math/horner.h b/pl/math/horner.h new file mode 100644 index 0000000..4dbc122 --- /dev/null +++ b/pl/math/horner.h @@ -0,0 +1,14 @@ +/* + * Helper macros for single-precision Horner polynomial evaluation. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#if V_SUPPORTED +#define FMA v_fma_f64 +#else +#define FMA fma +#endif + +#include "horner_wrap.h" diff --git a/pl/math/horner_wrap.h b/pl/math/horner_wrap.h new file mode 100644 index 0000000..892d63b --- /dev/null +++ b/pl/math/horner_wrap.h @@ -0,0 +1,34 @@ +/* + * Helper macros for Horner polynomial evaluation. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +// clang-format off +#define HORNER_1_(x, c, i) FMA(C(i + 1), x, c(i)) +#define HORNER_2_(x, c, i) FMA(HORNER_1_ (x, c, i + 1), x, c(i)) +#define HORNER_3_(x, c, i) FMA(HORNER_2_ (x, c, i + 1), x, c(i)) +#define HORNER_4_(x, c, i) FMA(HORNER_3_ (x, c, i + 1), x, c(i)) +#define HORNER_5_(x, c, i) FMA(HORNER_4_ (x, c, i + 1), x, c(i)) +#define HORNER_6_(x, c, i) FMA(HORNER_5_ (x, c, i + 1), x, c(i)) +#define HORNER_7_(x, c, i) FMA(HORNER_6_ (x, c, i + 1), x, c(i)) +#define HORNER_8_(x, c, i) FMA(HORNER_7_ (x, c, i + 1), x, c(i)) +#define HORNER_9_(x, c, i) FMA(HORNER_8_ (x, c, i + 1), x, c(i)) +#define HORNER_10_(x, c, i) FMA(HORNER_9_ (x, c, i + 1), x, c(i)) +#define HORNER_11_(x, c, i) FMA(HORNER_10_(x, c, i + 1), x, c(i)) +#define HORNER_12_(x, c, i) FMA(HORNER_11_(x, c, i + 1), x, c(i)) + +#define HORNER_1(x, c) HORNER_1_ (x, c, 0) +#define HORNER_2(x, c) HORNER_2_ (x, c, 0) +#define HORNER_3(x, c) HORNER_3_ (x, c, 0) +#define HORNER_4(x, c) HORNER_4_ (x, c, 0) +#define HORNER_5(x, c) HORNER_5_ (x, c, 0) +#define HORNER_6(x, c) HORNER_6_ (x, c, 0) +#define HORNER_7(x, c) HORNER_7_ (x, c, 0) +#define HORNER_8(x, c) HORNER_8_ (x, c, 0) +#define HORNER_9(x, c) HORNER_9_ (x, c, 0) +#define HORNER_10(x, c) HORNER_10_(x, c, 0) +#define HORNER_11(x, c) HORNER_11_(x, c, 0) +#define HORNER_12(x, c) HORNER_12_(x, c, 0) +// clang-format on diff --git a/pl/math/hornerf.h b/pl/math/hornerf.h new file mode 100644 index 0000000..bec1593 --- /dev/null +++ b/pl/math/hornerf.h @@ -0,0 +1,14 @@ +/* + * Helper macros for double-precision Horner polynomial evaluation. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#if V_SUPPORTED +#define FMA v_fma_f32 +#else +#define FMA fmaf +#endif + +#include "horner_wrap.h" diff --git a/pl/math/log1p_2u.c b/pl/math/log1p_2u.c index b9c7e9e..ade5d87 100644 --- a/pl/math/log1p_2u.c +++ b/pl/math/log1p_2u.c @@ -5,8 +5,7 @@ */ #include "math_config.h" - -#include "log1p_common.h" +#include "estrin.h" #define Ln2Hi 0x1.62e42fefa3800p-1 #define Ln2Lo 0x1.ef35793c76730p-45 @@ -19,6 +18,16 @@ #define Rt2MOne 0x3fda827999fcef32 #define AbsMask 0x7fffffffffffffff #define ExpM63 0x3c00 +#define C(i) __log1p_data.coeffs[i] + +static inline double +eval_poly (double f) +{ + double f2 = f * f; + double f4 = f2 * f2; + double f8 = f4 * f4; + return ESTRIN_18 (f, f2, f4, f8, f8 * f8, C); +} /* log1p approximation using polynomial on reduced interval. Largest observed errors are near the lower boundary of the region where k diff --git a/pl/math/log1p_common.h b/pl/math/log1p_common.h deleted file mode 100644 index 24e6f20..0000000 --- a/pl/math/log1p_common.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Double-precision polynomial evaluation function for scalar and vector - * log1p(x) - * - * Copyright (c) 2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#ifndef PL_MATH_LOG1P_COMMON_H -#define PL_MATH_LOG1P_COMMON_H - -#include "math_config.h" - -#if V_SUPPORTED - -#include "v_math.h" - -#define DBL_T v_f64_t -#define FMA v_fma_f64 -#define C(i) v_f64 (__log1p_data.coeffs[i]) - -#else - -#define DBL_T double -#define FMA fma -#define C(i) __log1p_data.coeffs[i] - -#endif - -static inline DBL_T -eval_poly (DBL_T f) -{ - /* Evaluate polynomial using Estrin's method. */ - DBL_T p_01 = FMA (f, C (1), C (0)); - DBL_T p_23 = FMA (f, C (3), C (2)); - DBL_T p_45 = FMA (f, C (5), C (4)); - DBL_T p_67 = FMA (f, C (7), C (6)); - DBL_T p_89 = FMA (f, C (9), C (8)); - DBL_T p_ab = FMA (f, C (11), C (10)); - DBL_T p_cd = FMA (f, C (13), C (12)); - DBL_T p_ef = FMA (f, C (15), C (14)); - DBL_T p_gh = FMA (f, C (17), C (16)); - - DBL_T f2 = f * f; - DBL_T p_03 = FMA (f2, p_23, p_01); - DBL_T p_47 = FMA (f2, p_67, p_45); - DBL_T p_8b = FMA (f2, p_ab, p_89); - DBL_T p_cf = FMA (f2, p_ef, p_cd); - DBL_T p_gi = FMA (f2, C (18), p_gh); - - DBL_T f4 = f2 * f2; - DBL_T p_07 = FMA (f4, p_47, p_03); - DBL_T p_8f = FMA (f4, p_cf, p_8b); - - DBL_T f8 = f4 * f4; - DBL_T p_0f = FMA (f8, p_8f, p_07); - - return FMA (f8 * f8, p_gi, p_0f); -} - -#endif // PL_MATH_LOG1P_COMMON_H diff --git a/pl/math/log1pf_2u1.c b/pl/math/log1pf_2u1.c index 5b0d542..9b7cb94 100644 --- a/pl/math/log1pf_2u1.c +++ b/pl/math/log1pf_2u1.c @@ -5,6 +5,7 @@ */ #include "math_config.h" +#include "hornerf.h" #define Ln2 (0x1.62e43p-1f) #define SignMask (0x80000000) @@ -21,8 +22,8 @@ eval_poly (float m, uint32_t e) { #ifdef LOG1PF_2U5 - /* 2.5 ulp variant. Approximate log(1+m) on [-0.25, 0.5] using Estrin - scheme. */ + /* 2.5 ulp variant. Approximate log(1+m) on [-0.25, 0.5] using + slightly modified Estrin scheme (no x^0 term, and x term is just x). */ float p_12 = fmaf (m, C (1), C (0)); float p_34 = fmaf (m, C (3), C (2)); float p_56 = fmaf (m, C (5), C (4)); @@ -49,15 +50,7 @@ eval_poly (float m, uint32_t e) x + C1 * x^2 + C2 * x^3 + C3 * x^4 + ... Hence approximation has the form m + m^2 * P(m) where P(x) = C1 + C2 * x + C3 * x^2 + ... . */ - float p = fmaf (C (8), m, C (7)); - p = fmaf (p, m, C (6)); - p = fmaf (p, m, C (5)); - p = fmaf (p, m, C (4)); - p = fmaf (p, m, C (3)); - p = fmaf (p, m, C (2)); - p = fmaf (p, m, C (1)); - p = fmaf (p, m, C (0)); - return fmaf (m, m * p, m); + return fmaf (m, m * HORNER_8 (m, C), m); #else #error No log1pf approximation exists with the requested precision. Options are 13 or 25. diff --git a/pl/math/pairwise_horner.h b/pl/math/pairwise_horner.h new file mode 100644 index 0000000..bee7592 --- /dev/null +++ b/pl/math/pairwise_horner.h @@ -0,0 +1,14 @@ +/* + * Helper macros for double-precision pairwise Horner polynomial evaluation. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#if V_SUPPORTED +#define FMA v_fma_f64 +#else +#define FMA fma +#endif + +#include "pairwise_horner_wrap.h" diff --git a/pl/math/pairwise_horner_wrap.h b/pl/math/pairwise_horner_wrap.h new file mode 100644 index 0000000..5bc287b --- /dev/null +++ b/pl/math/pairwise_horner_wrap.h @@ -0,0 +1,36 @@ +/* + * Helper macros for pairwise Horner polynomial evaluation. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +// clang-format off +#define PW_HORNER_1_(x, c, i) FMA(x, C(i + 1), C(i)) +#define PW_HORNER_3_(x, x2, c, i) FMA(x2, PW_HORNER_1_(x, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_5_(x, x2, c, i) FMA(x2, PW_HORNER_3_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_7_(x, x2, c, i) FMA(x2, PW_HORNER_5_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_9_(x, x2, c, i) FMA(x2, PW_HORNER_7_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_11_(x, x2, c, i) FMA(x2, PW_HORNER_9_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) + +#define PAIRWISE_HORNER_1(x, c) PW_HORNER_1_ (x, c, 0) +#define PAIRWISE_HORNER_3(x, x2, c) PW_HORNER_3_ (x, x2, c, 0) +#define PAIRWISE_HORNER_5(x, x2, c) PW_HORNER_5_ (x, x2, c, 0) +#define PAIRWISE_HORNER_7(x, x2, c) PW_HORNER_7_ (x, x2, c, 0) +#define PAIRWISE_HORNER_9(x, x2, c) PW_HORNER_9_ (x, x2, c, 0) +#define PAIRWISE_HORNER_11(x, x2, c) PW_HORNER_11_(x, x2, c, 0) + +#define PW_HORNER_2_(x, x2, c, i) FMA(x2, c(i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_4_(x, x2, c, i) FMA(x2, PW_HORNER_2_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_6_(x, x2, c, i) FMA(x2, PW_HORNER_4_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_8_(x, x2, c, i) FMA(x2, PW_HORNER_6_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_10_(x, x2, c, i) FMA(x2, PW_HORNER_8_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_12_(x, x2, c, i) FMA(x2, PW_HORNER_10_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) + +#define PAIRWISE_HORNER_2(x, x2, c) PW_HORNER_2_ (x, x2, c, 0) +#define PAIRWISE_HORNER_4(x, x2, c) PW_HORNER_4_ (x, x2, c, 0) +#define PAIRWISE_HORNER_6(x, x2, c) PW_HORNER_6_ (x, x2, c, 0) +#define PAIRWISE_HORNER_8(x, x2, c) PW_HORNER_8_(x, x2, c, 0) +#define PAIRWISE_HORNER_10(x, x2, c) PW_HORNER_10_(x, x2, c, 0) +#define PAIRWISE_HORNER_12(x, x2, c) PW_HORNER_12_(x, x2, c, 0) +// clang-format on diff --git a/pl/math/pairwise_hornerf.h b/pl/math/pairwise_hornerf.h new file mode 100644 index 0000000..a8aa4d1 --- /dev/null +++ b/pl/math/pairwise_hornerf.h @@ -0,0 +1,14 @@ +/* + * Helper macros for single-precision pairwise Horner polynomial evaluation. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#if V_SUPPORTED +#define FMA v_fma_f32 +#else +#define FMA fmaf +#endif + +#include "pairwise_horner_wrap.h" diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 14abfc6..bd4c4c2 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -789,7 +789,7 @@ cbrtf __vn_cbrtf $runvn fenv cbrtf _ZGVnN4v_cbrtf $runvn fenv asinh __s_asinh $runs fenv # Test vector asinh 3 times, with control lane < 1, > 1 and special. -# Ensures the v_sel is choosing the right option in all cases. +# Ensures the v_sel is choosing the right option in all cases. asinh __v_asinh $runv fenv -c 0.5 asinh __vn_asinh $runvn fenv -c 0.5 asinh _ZGVnN2v_asinh $runvn fenv -c 0.5 diff --git a/pl/math/v_asinh_2u5.c b/pl/math/v_asinh_2u5.c index 508ced1..974e6df 100644 --- a/pl/math/v_asinh_2u5.c +++ b/pl/math/v_asinh_2u5.c @@ -5,6 +5,7 @@ */ #include "v_math.h" +#include "estrin.h" #if V_SUPPORTED @@ -74,38 +75,6 @@ log_inline (v_f64_t x) return y; } -static inline v_f64_t -eval_poly (v_f64_t z) -{ - /* Custom polynomial, shared with scalar routine, for calculating asinh(x) in - [2^-26, 1]. Evaluated with Estrin scheme. */ - v_f64_t p_01 = v_fma_f64 (z, C (1), C (0)); - v_f64_t p_23 = v_fma_f64 (z, C (3), C (2)); - v_f64_t p_45 = v_fma_f64 (z, C (5), C (4)); - v_f64_t p_67 = v_fma_f64 (z, C (7), C (6)); - v_f64_t p_89 = v_fma_f64 (z, C (9), C (8)); - v_f64_t p_ab = v_fma_f64 (z, C (11), C (10)); - v_f64_t p_cd = v_fma_f64 (z, C (13), C (12)); - v_f64_t p_ef = v_fma_f64 (z, C (15), C (14)); - v_f64_t p_gh = v_fma_f64 (z, C (17), C (16)); - - v_f64_t z2 = z * z; - v_f64_t p_03 = v_fma_f64 (z2, p_23, p_01); - v_f64_t p_47 = v_fma_f64 (z2, p_67, p_45); - v_f64_t p_8b = v_fma_f64 (z2, p_ab, p_89); - v_f64_t p_cf = v_fma_f64 (z2, p_ef, p_cd); - - v_f64_t z4 = z2 * z2; - v_f64_t p_07 = v_fma_f64 (z4, p_47, p_03); - v_f64_t p_8f = v_fma_f64 (z4, p_cf, p_8b); - - v_f64_t z8 = z4 * z4; - v_f64_t p_0f = v_fma_f64 (z8, p_8f, p_07); - - v_f64_t z16 = z8 * z8; - return v_fma_f64 (z16, p_gh, p_0f); -} - /* Double-precision implementation of vector asinh(x). asinh is very sensitive around 1, so it is impractical to devise a single low-cost algorithm which is sufficiently accurate on a wide range of input. @@ -162,7 +131,10 @@ VPCS_ATTR v_f64_t V_NAME (asinh) (v_f64_t x) ax = v_sel_f64 (tiny | gt1, v_f64 (0), ax); #endif v_f64_t x2 = ax * ax; - v_f64_t p = eval_poly (x2); + v_f64_t z2 = x2 * x2; + v_f64_t z4 = z2 * z2; + v_f64_t z8 = z4 * z4; + v_f64_t p = ESTRIN_17 (x2, z2, z4, z8, z8 * z8, C); option_2 = v_fma_f64 (p, x2 * ax, ax); #if WANT_ERRNO option_2 = v_sel_f64 (tiny, x, option_2); diff --git a/pl/math/v_erfc_4u.c b/pl/math/v_erfc_4u.c index 603d8f5..80e11e7 100644 --- a/pl/math/v_erfc_4u.c +++ b/pl/math/v_erfc_4u.c @@ -7,6 +7,7 @@ #include "math_config.h" #include "v_math.h" +#include "horner.h" #if V_SUPPORTED /* Accurate exponential (vector variant of exp_dd). */ @@ -56,28 +57,6 @@ lookup (v_u64_t i) return e; } -/* Evaluate order-12 polynomials using pairwise summation and Horner - scheme. */ -static inline v_f64_t -v_eval_poly (v_f64_t z, struct entry e) -{ - v_f64_t r = e.P[12]; - r = v_fma_f64 (z, r, e.P[11]); - r = v_fma_f64 (z, r, e.P[10]); - r = v_fma_f64 (z, r, e.P[9]); - r = v_fma_f64 (z, r, e.P[8]); - r = v_fma_f64 (z, r, e.P[7]); - r = v_fma_f64 (z, r, e.P[6]); - r = v_fma_f64 (z, r, e.P[5]); - r = v_fma_f64 (z, r, e.P[4]); - r = v_fma_f64 (z, r, e.P[3]); - r = v_fma_f64 (z, r, e.P[2]); - r = v_fma_f64 (z, r, e.P[1]); - r = v_fma_f64 (z, r, e.P[0]); - - return r; -} - /* Accurate evaluation of exp(x^2) using compensated product (x^2 ~ x*x + e2) and custom exp(y+d) routine for small corrections d<<y. */ @@ -154,7 +133,8 @@ v_f64_t V_NAME (erfc) (v_f64_t x) /* Evaluate Polynomial: P(|x|-x_i). */ z = a - dat.xi; - p = v_eval_poly (z, dat); +#define C(i) dat.P[i] + p = HORNER_12 (z, C); /* Evaluate Gaussian: exp(-x^2). */ v_f64_t e = v_eval_gauss (a); diff --git a/pl/math/v_erfcf_1u.c b/pl/math/v_erfcf_1u.c index fc9571d..d9c65a5 100644 --- a/pl/math/v_erfcf_1u.c +++ b/pl/math/v_erfcf_1u.c @@ -7,6 +7,7 @@ #include "v_math.h" #include "erfcf.h" +#include "estrin.h" #if V_SUPPORTED @@ -41,39 +42,12 @@ interval_index (uint32_t ia12) #endif static inline v_f64_t -v_eval_poly_estrin (v_f64_t z, const double *coeff1, const double *coeff2) -{ - v_f64_t z2 = z * z; - v_f64_t z4 = z2 * z2; - v_f64_t z8 = z4 * z4; - - v_f64_t c0_zc1 = v_fma_f64 (z, C (1), C (0)); - v_f64_t c2_zc3 = v_fma_f64 (z, C (3), C (2)); - v_f64_t c4_zc5 = v_fma_f64 (z, C (5), C (4)); - v_f64_t c6_zc7 = v_fma_f64 (z, C (7), C (6)); - v_f64_t c8_zc9 = v_fma_f64 (z, C (9), C (8)); - v_f64_t c10_zc11 = v_fma_f64 (z, C (11), C (10)); - v_f64_t c12_zc13 = v_fma_f64 (z, C (13), C (12)); - v_f64_t c14_zc15 = v_fma_f64 (z, C (15), C (14)); - - v_f64_t c0_z2c3 = v_fma_f64 (z2, c2_zc3, c0_zc1); - v_f64_t c4_z2c7 = v_fma_f64 (z2, c6_zc7, c4_zc5); - v_f64_t c8_z2c11 = v_fma_f64 (z2, c10_zc11, c8_zc9); - v_f64_t c12_z2c15 = v_fma_f64 (z2, c14_zc15, c12_zc13); - - v_f64_t c0_z4c7 = v_fma_f64 (z4, c4_z2c7, c0_z2c3); - v_f64_t c8_z4c15 = v_fma_f64 (z4, c12_z2c15, c8_z2c11); - - return v_fma_f64 (z8, c8_z4c15, c0_z4c7); -} - -#undef C - -static inline v_f64_t v_approx_erfcf_poly_gauss (v_f64_t x, const double *coeff1, const double *coeff2) { - v_f64_t poly = v_eval_poly_estrin (x, coeff1, coeff2); + v_f64_t x2 = x * x; + v_f64_t x4 = x2 * x2; + v_f64_t poly = ESTRIN_15 (x, x2, x4, x4 * x4, C); v_f64_t gauss = V_NAME (exp_tail) (-(x * x), v_f64 (0.0)); return poly * gauss; } @@ -81,7 +55,7 @@ v_approx_erfcf_poly_gauss (v_f64_t x, const double *coeff1, static inline float approx_poly_gauss (float abs_x, const double *coeff) { - return (float) (eval_poly_horner_lvl2 (abs_x, coeff) * eval_exp_mx2 (abs_x)); + return (float) (eval_poly (abs_x, coeff) * eval_exp_mx2 (abs_x)); } static v_f32_t diff --git a/pl/math/v_log1p_2u5.c b/pl/math/v_log1p_2u5.c index 51d0d51..d97a622 100644 --- a/pl/math/v_log1p_2u5.c +++ b/pl/math/v_log1p_2u5.c @@ -7,7 +7,7 @@ #include "v_math.h" #if V_SUPPORTED -#include "log1p_common.h" +#include "estrin.h" #define Ln2Hi v_f64 (0x1.62e42fefa3800p-1) #define Ln2Lo v_f64 (0x1.ef35793c76730p-45) @@ -18,6 +18,16 @@ #define OneTop12 0x3ff #define BottomMask 0xffffffff #define AbsMask 0x7fffffffffffffff +#define C(i) v_f64 (__log1p_data.coeffs[i]) + +static inline v_f64_t +eval_poly (v_f64_t f) +{ + v_f64_t f2 = f * f; + v_f64_t f4 = f2 * f2; + v_f64_t f8 = f4 * f4; + return ESTRIN_18 (f, f2, f4, f8, f8 * f8, C); +} VPCS_ATTR NOINLINE static v_f64_t diff --git a/pl/math/v_tanf_3u2.c b/pl/math/v_tanf_3u2.c index a6d9dd1..01f7f65 100644 --- a/pl/math/v_tanf_3u2.c +++ b/pl/math/v_tanf_3u2.c @@ -6,6 +6,8 @@ */ #include "v_math.h" +#include "estrinf.h" + #if V_SUPPORTED /* Constants. */ @@ -32,13 +34,7 @@ eval_poly (v_f32_t z) { v_f32_t z2 = z * z; v_f32_t z4 = z2 * z2; - v_f32_t y_10 = v_fma_f32 (z, poly (1), poly (0)); - v_f32_t y_32 = v_fma_f32 (z, poly (3), poly (2)); - v_f32_t y_54 = v_fma_f32 (z, poly (5), poly (4)); - v_f32_t y_6_54 = v_fma_f32 (z2, poly (6), y_54); - v_f32_t y_32_10 = v_fma_f32 (z2, y_32, y_10); - v_f32_t y = v_fma_f32 (z4, y_6_54, y_32_10); - return y; + return ESTRIN_6 (z, z2, z4, poly); } /* Fast implementation of Neon tanf. diff --git a/pl/math/v_tanhf_2u6.c b/pl/math/v_tanhf_2u6.c index 571fd8b..67e4520 100644 --- a/pl/math/v_tanhf_2u6.c +++ b/pl/math/v_tanhf_2u6.c @@ -6,6 +6,7 @@ #include "v_math.h" #include "mathlib.h" +#include "estrinf.h" #if V_SUPPORTED @@ -39,10 +40,7 @@ expm1f_inline (v_f32_t x) /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). Uses Estrin scheme, where the main __v_expm1f routine uses Horner. */ v_f32_t f2 = f * f; - v_f32_t p_01 = v_fma_f32 (f, C (1), C (0)); - v_f32_t p_23 = v_fma_f32 (f, C (3), C (2)); - v_f32_t p = v_fma_f32 (f2, p_23, p_01); - p = v_fma_f32 (f2 * f2, C (4), p); + v_f32_t p = ESTRIN_4 (f, f2, f2 * f2, C); p = v_fma_f32 (f2, p, f); /* t = 2^i. */ |