pl/math: Reduce order of single-precision tan polynomial

For both vector and scalar routines we reduce the order from 6 to 5. For vector routines, this requires reducing RangeVal as for large values the tan polynomial is not quite accurate enough. However the cotan polynomial is used in the inaccurate region in the scalar routine, so this does not need to change. Accuracy of scalar routine is unchanged. Accuracy in both vector routines is now 3.45 ULP, with the same worst-case.
author: Joe Ramsay <Joe.Ramsay@arm.com> 2023-01-23 10:01:03 +0000
committer: Joe Ramsay <joe.ramsay@arm.com> 2023-01-23 10:01:03 +0000
commit: a7b6022090c9b43af519dc328c1aeece7258e558 (patch)
tree: 613d944fb82fa053a68ee0122dabc2b72930b002
parent: 76c2badba375dd8f785e92079dbd4290038a2750 (diff)
download: arm-optimized-routines-a7b6022090c9b43af519dc328c1aeece7258e558.tar.gz
10 files changed, 36 insertions, 59 deletions
diff --git a/pl/math/horner_wrap.h b/pl/math/horner_wrap.h
index a254b2d..6478968 100644
--- a/pl/math/horner_wrap.h
+++ b/pl/math/horner_wrap.h
@@ -6,7 +6,7 @@
  */
 
 // clang-format off
-#define  HORNER_1_(x, c, i) FMA(C(i + 1), x, c(i))
+#define  HORNER_1_(x, c, i) FMA(c(i + 1), x, c(i))
 #define  HORNER_2_(x, c, i) FMA(HORNER_1_ (x, c, i + 1), x, c(i))
 #define  HORNER_3_(x, c, i) FMA(HORNER_2_ (x, c, i + 1), x, c(i))
 #define  HORNER_4_(x, c, i) FMA(HORNER_3_ (x, c, i + 1), x, c(i))
diff --git a/pl/math/math_config.h b/pl/math/math_config.h
index 9a7ce96..dccb3ce 100644
--- a/pl/math/math_config.h
+++ b/pl/math/math_config.h
@@ -472,7 +472,7 @@ extern const struct log1pf_data
   float coeffs[LOG1PF_NCOEFFS];
 } __log1pf_data HIDDEN;
 
-#define TANF_P_POLY_NCOEFFS 7
+#define TANF_P_POLY_NCOEFFS 6
 /* cotan approach needs order 3 on [0, pi/4] to reach <3.5ulps.  */
 #define TANF_Q_POLY_NCOEFFS 4
 extern const struct tanf_poly_data
diff --git a/pl/math/pairwise_horner_wrap.h b/pl/math/pairwise_horner_wrap.h
index b6efb6f..e56f059 100644
--- a/pl/math/pairwise_horner_wrap.h
+++ b/pl/math/pairwise_horner_wrap.h
@@ -6,7 +6,7 @@
  */
 
 // clang-format off
-#define  PW_HORNER_1_(x, c,     i) FMA(x,  C(i + 1),                      C(i))
+#define  PW_HORNER_1_(x, c,     i) FMA(x,  c(i + 1),                       c(i))
 #define  PW_HORNER_3_(x, x2, c, i) FMA(x2, PW_HORNER_1_ (x,     c, i + 2), PW_HORNER_1_(x, c, i))
 #define  PW_HORNER_5_(x, x2, c, i) FMA(x2, PW_HORNER_3_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
 #define  PW_HORNER_7_(x, x2, c, i) FMA(x2, PW_HORNER_5_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
diff --git a/pl/math/s_tanf_3u2.c b/pl/math/s_tanf_3u5.c
index b5ddf94..fa64c8a 100644
--- a/pl/math/s_tanf_3u2.c
+++ b/pl/math/s_tanf_3u5.c
@@ -3,4 +3,4 @@
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 #define SCALAR 1
-#include "v_tanf_3u2.c"
+#include "v_tanf_3u5.c"
diff --git a/pl/math/sv_tanf_3u2.c b/pl/math/sv_tanf_3u5.c
index 78ff480..cca43bd 100644
--- a/pl/math/sv_tanf_3u2.c
+++ b/pl/math/sv_tanf_3u5.c
@@ -16,7 +16,7 @@
 #define NegPio2_2 (sv_f32 (0x1.777a5cp-25f))
 #define NegPio2_3 (sv_f32 (0x1.ee59dap-50f))
 #define InvPio2 (sv_f32 (0x1.45f306p-1f))
-#define RangeVal (sv_f32 (0x1p17f))
+#define RangeVal (sv_f32 (0x1p15f))
 #define Shift (sv_f32 (0x1.8p+23f))
 
 #define poly(i) sv_f32 (__tanf_poly_data.poly_tan[i])
@@ -30,9 +30,8 @@ eval_poly (svbool_t pg, sv_f32_t z)
   sv_f32_t y_10 = sv_fma_f32_x (pg, z, poly (1), poly (0));
   sv_f32_t y_32 = sv_fma_f32_x (pg, z, poly (3), poly (2));
   sv_f32_t y_54 = sv_fma_f32_x (pg, z, poly (5), poly (4));
-  sv_f32_t y_6_54 = sv_fma_f32_x (pg, z2, poly (6), y_54);
   sv_f32_t y_32_10 = sv_fma_f32_x (pg, z2, y_32, y_10);
-  sv_f32_t y = sv_fma_f32_x (pg, z4, y_6_54, y_32_10);
+  sv_f32_t y = sv_fma_f32_x (pg, z4, y_54, y_32_10);
   return y;
 }
 
@@ -43,10 +42,9 @@ __sv_tanf_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp)
 }
 
 /* Fast implementation of SVE tanf.
-   The maximum measured errors were located near RangeVal.
-   Maximum error: 3.121ulps.
-   svtan_f32(0x1.ff3df8p+16) got -0x1.fbb7b8p-1
-			    want -0x1.fbb7b2p-1.  */
+   Maximum error is 3.45 ULP:
+   __sv_tanf(-0x1.e5f0cap+13) got 0x1.ff9856p-1
+			     want 0x1.ff9850p-1.  */
 sv_f32_t
 __sv_tanf_x (sv_f32_t x, const svbool_t pg)
 {
@@ -102,7 +100,7 @@ __sv_tanf_x (sv_f32_t x, const svbool_t pg)
 PL_ALIAS (__sv_tanf_x, _ZGVsMxv_tanf)
 
 PL_SIG (SV, F, 1, tan, -3.1, 3.1)
-PL_TEST_ULP (__sv_tanf, 2.7)
+PL_TEST_ULP (__sv_tanf, 2.96)
 PL_TEST_INTERVAL (__sv_tanf, -0.0, -0x1p126, 100)
 PL_TEST_INTERVAL (__sv_tanf, 0x1p-149, 0x1p-126, 4000)
 PL_TEST_INTERVAL (__sv_tanf, 0x1p-126, 0x1p-23, 50000)
diff --git a/pl/math/tanf_3u3.c b/pl/math/tanf_3u3.c
index 0b1617c..ec006dc 100644
--- a/pl/math/tanf_3u3.c
+++ b/pl/math/tanf_3u3.c
@@ -7,6 +7,7 @@
 #include "math_config.h"
 #include "pl_sig.h"
 #include "pl_test.h"
+#include "pairwise_hornerf.h"
 
 /* Useful constants.  */
 #define NegPio2_1 (-0x1.921fb6p+0f)
@@ -21,28 +22,19 @@
 /* 2PI * 2^-64.  */
 #define Pio2p63 (0x1.921FB54442D18p-62)
 
-#define P __tanf_poly_data.poly_tan
-#define Q __tanf_poly_data.poly_cotan
+#define P(i) __tanf_poly_data.poly_tan[i]
+#define Q(i) __tanf_poly_data.poly_cotan[i]
 
 static inline float
 eval_P (float z)
 {
-  float z2 = z * z;
-  float y_10 = fmaf (z, P[1], P[0]);
-  float y_32 = fmaf (z, P[3], P[2]);
-  float y_54 = fmaf (z, P[5], P[4]);
-  float y_6_54 = fmaf (z2, P[6], y_54);
-  float y_32_10 = fmaf (z2, y_32, y_10);
-  float y = fmaf (z2, z2 * y_6_54, y_32_10);
-  return y;
+  return PAIRWISE_HORNER_5 (z, z * z, P);
 }
 
 static inline float
 eval_Q (float z)
 {
-  float z2 = z * z;
-  float y = fmaf (z2, fmaf (z, Q[3], Q[2]), fmaf (z, Q[1], Q[0]));
-  return y;
+  return PAIRWISE_HORNER_3 (z, z * z, Q);
 }
 
 /* Reduction of the input argument x using Cody-Waite approach, such that x = r
diff --git a/pl/math/tanf_data.c b/pl/math/tanf_data.c
index 242cfaa..a6b9d51 100644
--- a/pl/math/tanf_data.c
+++ b/pl/math/tanf_data.c
@@ -10,23 +10,20 @@
 const struct tanf_poly_data __tanf_poly_data = {
 .poly_tan = {
 /* Coefficients generated using:
-   remez(f(x) = (tan(sqrt(x)) - sqrt(x)) / (x * sqrt(x)), deg, [a;b], 1, 1e-16, [|dtype ...|])
-   optimize each coefficient
+   poly = fpminimax((tan(sqrt(x))-sqrt(x))/x^(3/2), deg, [|single ...|], [a*a;b*b]);
    optimize relative error
    final prec : 23 bits
-   working prec : 128 bits
-   deg : 6
-   a : 0x1p-126
-   b : (pi) / 0x1p2
-   dirty rel error : 0x1.df324p-26
-   dirty abs error : 0x1.df3244p-26.  */
-0x1.555558p-2, /* 0.3333334.  */
-0x1.110e1cp-3, /* 0.1333277.  */
-0x1.bb0e7p-5, /* 5.408403e-2.  */
-0x1.5826c8p-6, /* 2.100534e-2.  */
-0x1.8426a6p-7, /* 1.1845428e-2.  */
--0x1.7a5adcp-10, /* -1.4433095e-3.  */
-0x1.5574dap-8, /* 5.210212e-3.  */
+   deg : 5
+   a : 0x1p-126 ^ 2
+   b : ((pi) / 0x1p2) ^ 2
+   dirty rel error: 0x1.f7c2e4p-25
+   dirty abs error: 0x1.f7c2ecp-25.  */
+0x1.55555p-2,
+0x1.11166p-3,
+0x1.b88a78p-5,
+0x1.7b5756p-6,
+0x1.4ef4cep-8,
+0x1.0e1e74p-7
 },
 .poly_cotan = {
 /* Coefficients generated using:
diff --git a/pl/math/tools/tanf.sollya b/pl/math/tools/tanf.sollya
index 8b2306b..f4b49b4 100644
--- a/pl/math/tools/tanf.sollya
+++ b/pl/math/tools/tanf.sollya
@@ -6,7 +6,7 @@
 dtype = single;
 
 mthd = 0; // approximate tan
-deg = 6; // poly degree
+deg = 5; // poly degree
 
 // // Uncomment for cotan
 // mthd = 1; // approximate cotan
@@ -38,19 +38,9 @@ if(mthd==0) then {
   F = proc(P) { return x + x^3 * P(x^2); };
   f = (g(sqrt(x))-sqrt(x))/(x*sqrt(x));
   init_poly = 0;
-  deg_init_poly = -1; // a value such that we actually start by building constant coefficient
   // Display info
   print("Approximate g(x) =", g, "as F(x)=", s, ".");
-  // Remez applied to minimise relative error
-  approx_remez = proc(func, poly, d) {
-    return remez(1 - poly / func, deg - d, [a;b], x^d/func(x), 1e-10);
-  };
-  // Iteratively find optimal coeffs
-  poly = init_poly;
-  for i from deg_init_poly+1 to deg do {
-    p = roundcoefficients(approx_remez(f, poly, i), [|dtype ...|]);
-    poly = poly + x^i * coeff(p,0);
-  };
+  poly = fpminimax(f, deg, [|dtype ...|], [a*a;b*b]);
 }
 else if (mthd==1) then {
   s = "1/x + x * P(x^2)";
diff --git a/pl/math/v_tanf_3u2.c b/pl/math/v_tanf_3u5.c
index a2b1fab..828466b 100644
--- a/pl/math/v_tanf_3u2.c
+++ b/pl/math/v_tanf_3u5.c
@@ -17,7 +17,7 @@
 #define NegPio2_2 (v_f32 (0x1.777a5cp-25f))
 #define NegPio2_3 (v_f32 (0x1.ee59dap-50f))
 #define InvPio2 (v_f32 (0x1.45f306p-1f))
-#define RangeVal (0x48000000)  /* asuint32(0x1p17f).  */
+#define RangeVal (0x47000000)  /* asuint32(0x1p15f).  */
 #define TinyBound (0x30000000) /* asuint32 (0x1p-31).  */
 #define Shift (v_f32 (0x1.8p+23f))
 #define AbsMask (v_u32 (0x7fffffff))
@@ -45,13 +45,13 @@ eval_poly (v_f32_t z)
     z2 = v_sel_f32 (will_uflow, v_f32 (0), z2);
 #endif
   v_f32_t z4 = z2 * z2;
-  return ESTRIN_6 (z, z2, z4, poly);
+  return ESTRIN_5 (z, z2, z4, poly);
 }
 
 /* Fast implementation of Neon tanf.
-   Maximum measured error: 3.121ulps.
-   vtanq_f32(0x1.ff3df8p+16) got -0x1.fbb7b8p-1
-			    want -0x1.fbb7b2p-1.  */
+   Maximum error is 3.45 ULP:
+   __v_tanf(-0x1.e5f0cap+13) got 0x1.ff9856p-1
+			    want 0x1.ff9850p-1.  */
 VPCS_ATTR
 v_f32_t V_NAME (tanf) (v_f32_t x)
 {
@@ -118,7 +118,7 @@ v_f32_t V_NAME (tanf) (v_f32_t x)
 VPCS_ALIAS
 
 PL_SIG (V, F, 1, tan, -3.1, 3.1)
-PL_TEST_ULP (V_NAME (tanf), 2.7)
+PL_TEST_ULP (V_NAME (tanf), 2.96)
 PL_TEST_EXPECT_FENV (V_NAME (tanf), WANT_SIMD_EXCEPT)
 PL_TEST_INTERVAL (V_NAME (tanf), -0.0, -0x1p126, 100)
 PL_TEST_INTERVAL (V_NAME (tanf), 0x1p-149, 0x1p-126, 4000)
diff --git a/pl/math/vn_tanf_3u2.c b/pl/math/vn_tanf_3u5.c
index ccdcab6..a88cb40 100644
--- a/pl/math/vn_tanf_3u2.c
+++ b/pl/math/vn_tanf_3u5.c
@@ -8,5 +8,5 @@
 #ifdef __vpcs
 #define VPCS 1
 #define VPCS_ALIAS PL_ALIAS (__vn_tanf, _ZGVnN4v_tanf)
-#include "v_tanf_3u2.c"
+#include "v_tanf_3u5.c"
 #endif
author	Joe Ramsay <Joe.Ramsay@arm.com>	2023-01-23 10:01:03 +0000
committer	Joe Ramsay <joe.ramsay@arm.com>	2023-01-23 10:01:03 +0000
commit	a7b6022090c9b43af519dc328c1aeece7258e558 (patch)
tree	613d944fb82fa053a68ee0122dabc2b72930b002
parent	76c2badba375dd8f785e92079dbd4290038a2750 (diff)
download	arm-optimized-routines-a7b6022090c9b43af519dc328c1aeece7258e558.tar.gz