Merge remote-tracking branch 'aosp/upstream-master' into HEAD

am: f1a22e7d49 Change-Id: I935353d51d2affd9d83920f6a46e16be6f735c7f
author: Elliott Hughes <enh@google.com> 2018-10-29 15:58:24 -0700
committer: android-build-merger <android-build-merger@google.com> 2018-10-29 15:58:24 -0700
commit: e5f6e157cee4e12b2ef08ffd178e84fa955b1220 (patch)
tree: a1998cb9f24af3c314114d4daf50b0e625f09386
parent: 745c75bfff7c74033ba5948ad91b3efbe069cbc0 (diff)
parent: f1a22e7d49fb48bf61e2416e8ba4fece1db6c0e6 (diff)
download: arm-optimized-routines-e5f6e157cee4e12b2ef08ffd178e84fa955b1220.tar.gz
13 files changed, 186 insertions, 77 deletions
diff --git a/METADATA b/METADATA
index 05f7942..eb07e87 100644
--- a/METADATA
+++ b/METADATA
@@ -1,8 +1,5 @@
 name: "ARM-software/optimized-routines"
-description:
-    "Optimized implementations of various library functions for ARM "
-    "architecture processors "
-
+description: "Optimized implementations of various library functions for ARM architecture processors "
 third_party {
   url {
     type: HOMEPAGE
@@ -12,7 +9,11 @@ third_party {
     type: GIT
     value: "https://github.com/ARM-software/optimized-routines.git"
   }
-  version: "41ed0e600fb5a37ba96742fdbadd3048e62d5045"
-  last_upgrade_date { year: 2018 month: 7 day: 27 }
+  version: "e875f40f0b2ad71c5381a431e6d71829770c7ab7"
   license_type: NOTICE
+  last_upgrade_date {
+    year: 2018
+    month: 10
+    day: 29
+  }
 }
diff --git a/auxiliary/remez.jl b/auxiliary/remez.jl
index 31266e9..01e62f9 100755
--- a/auxiliary/remez.jl
+++ b/auxiliary/remez.jl
@@ -18,6 +18,30 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import Base.\
+
+# ----------------------------------------------------------------------
+# Helper functions to cope with different Julia versions.
+if VERSION >= v"0.7.0"
+    array1d(T, d) = Array{T, 1}(undef, d)
+    array2d(T, d1, d2) = Array{T, 2}(undef, d1, d2)
+else
+    array1d(T, d) = Array(T, d)
+    array2d(T, d1, d2) = Array(T, d1, d2)
+end
+if VERSION < v"0.5.0"
+    String = ASCIIString
+end
+if VERSION >= v"0.6.0"
+    # Use Base.invokelatest to run functions made using eval(), to
+    # avoid "world age" error
+    run(f, x...) = Base.invokelatest(f, x...)
+else
+    # Prior to 0.6.0, invokelatest doesn't exist (but fortunately the
+    # world age problem also doesn't seem to exist)
+    run(f, x...) = f(x...)
+end
+
 # ----------------------------------------------------------------------
 # Global variables configured by command-line options.
 floatsuffix = "" # adjusted by --floatsuffix
@@ -26,7 +50,7 @@ epsbits = 256 # adjusted by --bits
 debug_facilities = Set() # adjusted by --debug
 full_output = false # adjusted by --full
 array_format = false # adjusted by --array
-preliminary_commands = String[] # adjusted by --pre
+preliminary_commands = array1d(String, 0) # adjusted by --pre
 
 # ----------------------------------------------------------------------
 # Diagnostic and utility functions.
@@ -56,7 +80,7 @@ macro debug(facility, printargs...)
     for arg in printargs
         printit = quote
             $printit
-            print($arg)
+            print($(esc(arg)))
         end
     end
     return quote
@@ -156,7 +180,7 @@ end
 #    xys      Array of (x,y) pairs of BigFloats.
 #
 # Return value is a string.
-function format_xylist(xys::Array{(BigFloat,BigFloat)})
+function format_xylist(xys::Array{Tuple{BigFloat,BigFloat}})
     return ("[\n" *
             join(["  "*string(x)*" -> "*string(y) for (x,y) in xys], "\n") *
             "\n]")
@@ -354,8 +378,8 @@ function ratfn_leastsquares(f::Function, xvals::Array{BigFloat}, n, d,
     # Build the matrix. We're solving n+d+1 equations in n+d+1
     # unknowns. (We actually have to return n+d+2 coefficients, but
     # one of them is hardwired to 1.)
-    matrix = Array(BigFloat, n+d+1, n+d+1)
-    vector = Array(BigFloat, n+d+1)
+    matrix = array2d(BigFloat, n+d+1, n+d+1)
+    vector = array1d(BigFloat, n+d+1)
     for i = 0:1:n
         # Equation obtained by differentiating with respect to p_i,
         # i.e. the numerator coefficient of x^i.
@@ -484,7 +508,7 @@ end
 # the extremum location and its value (i.e. y=f(x)).
 function find_extrema(f::Function, grid::Array{BigFloat})
     len = length(grid)
-    extrema = Array((BigFloat, BigFloat), 0)
+    extrema = array1d(Tuple{BigFloat, BigFloat}, 0)
     for i = 1:1:len
         # We have to provide goldensection() with three points
         # bracketing the extremum. If the extremum is at one end of
@@ -529,11 +553,11 @@ end
 # Returns a new array of (x,y) pairs which is a subsequence of the
 # original sequence. (So, in particular, if the input was sorted by x
 # then so will the output be.)
-function winnow_extrema(extrema::Array{(BigFloat,BigFloat)}, n)
+function winnow_extrema(extrema::Array{Tuple{BigFloat,BigFloat}}, n)
     # best[i,j] gives the best sequence so far of length i and with
     # sign j (where signs are coded as 1=positive, 2=negative), in the
     # form of a tuple (cost, actual array of x,y pairs).
-    best = fill((BigFloat(0), Array((BigFloat,BigFloat),0)), n, 2)
+    best = fill((BigFloat(0), array1d(Tuple{BigFloat,BigFloat}, 0)), n, 2)
 
     for (x,y) = extrema
         if y > 0
@@ -624,8 +648,8 @@ function ratfn_equal_deviation(f::Function, coords::Array{BigFloat},
         # in which the p_i and e are the variables, and the powers of
         # x and calls to w and f are the coefficients.
 
-        matrix = Array(BigFloat, n+2, n+2)
-        vector = Array(BigFloat, n+2)
+        matrix = array2d(BigFloat, n+2, n+2)
+        vector = array1d(BigFloat, n+2)
         currsign = +1
         for i = 1:1:n+2
             x = coords[i]
@@ -678,8 +702,8 @@ function ratfn_equal_deviation(f::Function, coords::Array{BigFloat},
             # based on the first n+d+1 of the n+d+2 coordinates, and
             # see what the error turns out to be at the final
             # coordinate.
-            matrix = Array(BigFloat, n+d+1, n+d+1)
-            vector = Array(BigFloat, n+d+1)
+            matrix = array2d(BigFloat, n+d+1, n+d+1)
+            vector = array1d(BigFloat, n+d+1)
             currsign = +1
             for i = 1:1:n+d+1
                 x = coords[i]
@@ -787,7 +811,7 @@ end
 #              means that the Chebyshev alternation theorem guarantees
 #              that any other function of the same degree must exceed
 #              the error of this one at at least one of those points.
-function ratfn_minimax(f::Function, interval::(BigFloat,BigFloat), n, d,
+function ratfn_minimax(f::Function, interval::Tuple{BigFloat,BigFloat}, n, d,
                        w = (x,y)->BigFloat(1))
     # We start off by finding a least-squares approximation. This
     # doesn't need to be perfect, but if we can get it reasonably good
@@ -934,7 +958,7 @@ function test()
     # Test leastsquares rational functions.
     println("Leastsquares test 1:")
     n = 10000
-    a = Array(BigFloat, n+1)
+    a = array1d(BigFloat, n+1)
     for i = 0:1:n
         a[1+i] = i/BigFloat(n)
     end
@@ -947,7 +971,7 @@ function test()
     # Test golden section search.
     println("Golden section test 1:")
     x, y = goldensection(x->sin(x),
-                              BigFloat(0), BigFloat("0.1"), BigFloat(4))
+                              BigFloat(0), BigFloat(1)/10, BigFloat(4))
     println("  ", x, " -> ", y)
     test(approx_eq(x, asin(BigFloat(1))))
     test(approx_eq(y, 1))
@@ -955,7 +979,7 @@ function test()
     # Test extrema-winnowing algorithm.
     println("Winnow test 1:")
     extrema = [(x, sin(20*x)*sin(197*x))
-               for x in BigFloat(0):BigFloat("0.001"):BigFloat(1)]
+               for x in BigFloat(0):BigFloat(1)/1000:BigFloat(1)]
     winnowed = winnow_extrema(extrema, 12)
     println("  ret = ", format_xylist(winnowed))
     prevx, prevy = -1, 0
@@ -973,7 +997,7 @@ function test()
     println("  ",e)
     println("  ",ratfn_to_string(nc, dc))
     test(0 < e < 1e-3)
-    for x = 0:BigFloat(0.001):1
+    for x = 0:BigFloat(1)/1000:1
         test(abs(ratfn_eval(nc, dc, x) - exp(x)) <= e * 1.0000001)
     end
 
@@ -982,7 +1006,7 @@ function test()
     println("  ",e)
     println("  ",ratfn_to_string(nc, dc))
     test(0 < e < 1e-3)
-    for x = 0:BigFloat(0.001):1
+    for x = 0:BigFloat(1)/1000:1
         test(abs(ratfn_eval(nc, dc, x) - exp(x)) <= e * 1.0000001)
     end
 
@@ -992,7 +1016,7 @@ function test()
     println("  ",e)
     println("  ",ratfn_to_string(nc, dc))
     test(0 < e < 1e-3)
-    for x = 0:BigFloat(0.001):1
+    for x = 0:BigFloat(1)/1000:1
         test(abs(ratfn_eval(nc, dc, x) - exp(x))/exp(x) <= e * 1.0000001)
     end
 
@@ -1002,7 +1026,7 @@ function test()
     println("  ",e)
     println("  ",ratfn_to_string(nc, dc))
     test(0 < e < 1e-3)
-    for x = 0:BigFloat(0.001):1
+    for x = 0:BigFloat(1)/1000:1
         test(abs(ratfn_eval(nc, dc, x) - exp(x))/exp(x) <= e * 1.0000001)
     end
 
@@ -1012,12 +1036,12 @@ function test()
     println("  ",e)
     println("  ",ratfn_to_string(nc, dc))
     test(0 < e < 1e-3)
-    for x = 0:BigFloat(0.001):1
+    for x = 0:BigFloat(1)/1000:1
         test(abs(ratfn_eval(nc, dc, x) - exp(x))/exp(x) <= e * 1.0000001)
     end
 
     total = passes + fails
-    @printf "%d passes %d fails %d total\n" passes fails total
+    println(passes, " passes ", fails, " fails ", total, " total")
 end
 
 # ----------------------------------------------------------------------
@@ -1206,14 +1230,14 @@ function main()
     end
 
     for preliminary_command in preliminary_commands
-        eval(parse(preliminary_command))
+        eval(Meta.parse(preliminary_command))
     end
 
-    lo = BigFloat(eval(parse(argwords[1])))
-    hi = BigFloat(eval(parse(argwords[2])))
-    n = int(argwords[3])
-    d = int(argwords[4])
-    f = eval(parse("x -> " * argwords[5]))
+    lo = BigFloat(eval(Meta.parse(argwords[1])))
+    hi = BigFloat(eval(Meta.parse(argwords[2])))
+    n = parse(Int,argwords[3])
+    d = parse(Int,argwords[4])
+    f = eval(Meta.parse("x -> " * argwords[5]))
 
     # Wrap the user-provided function with a function of our own. This
     # arranges to detect silly FP values (inf,nan) early and diagnose
@@ -1221,7 +1245,7 @@ function main()
     # function in case you suspect it's doing the wrong thing at some
     # special-case point.
     function func(x)
-        y = f(x)
+        y = run(f,x)
         @debug("f", x, " -> ", y)
         if !isfinite(y)
             error("f(" * string(x) * ") returned non-finite value " * string(y))
@@ -1231,9 +1255,9 @@ function main()
 
     if nargs == 6
         # Wrap the user-provided weight function similarly.
-        w = eval(parse("(x,y) -> " * argwords[6]))
+        w = eval(Meta.parse("(x,y) -> " * argwords[6]))
         function wrapped_weight(x,y)
-            ww = w(x,y)
+            ww = run(w,x,y)
             if !isfinite(ww)
                 error("w(" * string(x) * "," * string(y) *
                       ") returned non-finite value " * string(ww))
@@ -1283,9 +1307,11 @@ end
 what_to_do = main
 
 doing_opts = true
-argwords = Array(ASCIIString, 0)
+argwords = array1d(String, 0)
 for arg = ARGS
-    if doing_opts && beginswith(arg, "-")
+    global doing_opts, what_to_do, argwords
+    global full_output, array_format, xvarname, floatsuffix, epsbits
+    if doing_opts && startswith(arg, "-")
         if arg == "--"
             doing_opts = false
         elseif arg == "--help"
@@ -1296,18 +1322,19 @@ for arg = ARGS
             full_output = true
         elseif arg == "--array"
             array_format = true
-        elseif beginswith(arg, "--debug=")
-            enable_debug(arg[length("--debug=")+1:])
-        elseif beginswith(arg, "--variable=")
-            xvarname = arg[length("--variable=")+1:]
-        elseif beginswith(arg, "--suffix=")
-            floatsuffix = arg[length("--suffix=")+1:]
-        elseif beginswith(arg, "--bits=")
-            epsbits = int(arg[length("--bits=")+1:])
-        elseif beginswith(arg, "--bigfloatbits=")
-            set_bigfloat_precision(int(arg[length("--bigfloatbits=")+1:]))
-        elseif beginswith(arg, "--pre=")
-            push!(preliminary_commands, arg[length("--pre=")+1:])
+        elseif startswith(arg, "--debug=")
+            enable_debug(arg[length("--debug=")+1:end])
+        elseif startswith(arg, "--variable=")
+            xvarname = arg[length("--variable=")+1:end]
+        elseif startswith(arg, "--suffix=")
+            floatsuffix = arg[length("--suffix=")+1:end]
+        elseif startswith(arg, "--bits=")
+            epsbits = parse(Int,arg[length("--bits=")+1:end])
+        elseif startswith(arg, "--bigfloatbits=")
+            set_bigfloat_precision(
+                parse(Int,arg[length("--bigfloatbits=")+1:end]))
+        elseif startswith(arg, "--pre=")
+            push!(preliminary_commands, arg[length("--pre=")+1:end])
         else
             error("unrecognised option: ", arg)
         end
diff --git a/math/cosf.c b/math/cosf.c
index 7d57d06..5a635d1 100644
--- a/math/cosf.c
+++ b/math/cosf.c
@@ -26,11 +26,10 @@
 #include "math_config.h"
 #include "sincosf.h"
 
-/* Fast cosf implementation.  Worst-case ULP is 0.56072, maximum relative
-   error is 0.5303p-23.  A single-step signed range reduction is used for
+/* Fast cosf implementation.  Worst-case ULP is 0.5607, maximum relative
+   error is 0.5303 * 2^-23.  A single-step range reduction is used for
    small values.  Large inputs have their range reduced using fast integer
-   arithmetic.
-*/
+   arithmetic.  */
 float
 cosf (float y)
 {
diff --git a/math/log2_data.c b/math/log2_data.c
index 356662f..d2eba89 100644
--- a/math/log2_data.c
+++ b/math/log2_data.c
@@ -54,6 +54,32 @@ const struct log2_data __log2_data = {
 0x1.a6225e117f92ep-3,
 #endif
 },
+/* Algorithm:
+
+	x = 2^k z
+	log2(x) = k + log2(c) + log2(z/c)
+	log2(z/c) = poly(z/c - 1)
+
+where z is in [1.6p-1; 1.6p0] which is split into N subintervals and z falls
+into the ith one, then table entries are computed as
+
+	tab[i].invc = 1/c
+	tab[i].logc = (double)log2(c)
+	tab2[i].chi = (double)c
+	tab2[i].clo = (double)(c - (double)c)
+
+where c is near the center of the subinterval and is chosen by trying +-2^29
+floating point invc candidates around 1/center and selecting one for which
+
+	1) the rounding error in 0x1.8p10 + logc is 0,
+	2) the rounding error in z - chi - clo is < 0x1p-64 and
+	3) the rounding error in (double)log2(c) is minimized (< 0x1p-68).
+
+Note: 1) ensures that k + logc can be computed without rounding error, 2)
+ensures that z/c - 1 can be computed as (z - chi - clo)*invc with close to a
+single rounding error when there is no fast fma for z*invc - 1, 3) ensures
+that logc + poly(z/c - 1) has small error, however near x == 1 when
+|log2(x)| < 0x1p-4, this is not enough so that is special cased.  */
 .tab = {
 #if N == 64
 {0x1.724286bb1acf8p+0, -0x1.1095feecdb000p-1},
diff --git a/math/log_data.c b/math/log_data.c
index 88c8a2e..2e2e478 100644
--- a/math/log_data.c
+++ b/math/log_data.c
@@ -98,6 +98,32 @@ const struct log_data __log_data = {
 0x1.2493c29331a5cp-3,
 #endif
 },
+/* Algorithm:
+
+	x = 2^k z
+	log(x) = k ln2 + log(c) + log(z/c)
+	log(z/c) = poly(z/c - 1)
+
+where z is in [1.6p-1; 1.6p0] which is split into N subintervals and z falls
+into the ith one, then table entries are computed as
+
+	tab[i].invc = 1/c
+	tab[i].logc = (double)log(c)
+	tab2[i].chi = (double)c
+	tab2[i].clo = (double)(c - (double)c)
+
+where c is near the center of the subinterval and is chosen by trying +-2^29
+floating point invc candidates around 1/center and selecting one for which
+
+	1) the rounding error in 0x1.8p9 + logc is 0,
+	2) the rounding error in z - chi - clo is < 0x1p-66 and
+	3) the rounding error in (double)log(c) is minimized (< 0x1p-66).
+
+Note: 1) ensures that k*ln2hi + logc can be computed without rounding error,
+2) ensures that z/c - 1 can be computed as (z - chi - clo)*invc with close to
+a single rounding error when there is no fast fma for z*invc - 1, 3) ensures
+that logc + poly(z/c - 1) has small error, however near x == 1 when
+|log(x)| < 0x1p-4, this is not enough so that is special cased.  */
 .tab = {
 #if N == 64
 {0x1.7242886495cd8p+0, -0x1.79e267bdfe000p-2},
diff --git a/math/math_config.h b/math/math_config.h
index ce6fac8..ddac41c 100644
--- a/math/math_config.h
+++ b/math/math_config.h
@@ -56,7 +56,7 @@
 
 /* Compiler can inline fma as a single instruction.  */
 #ifndef HAVE_FAST_FMA
-# ifdef FP_FAST_FMA
+# if defined FP_FAST_FMA || __aarch64__
 #   define HAVE_FAST_FMA 1
 # else
 #   define HAVE_FAST_FMA 0
diff --git a/math/pow.c b/math/pow.c
index 9f953e9..2b77dc6 100644
--- a/math/pow.c
+++ b/math/pow.c
@@ -256,7 +256,8 @@ exp_inline (double x, double xtail, uint32_t sign_bias)
   return scale + scale * tmp;
 }
 
-/* Returns 0 if not int, 1 if odd int, 2 if even int.  */
+/* Returns 0 if not int, 1 if odd int, 2 if even int.  The argument is
+   the bit representation of a non-zero finite floating-point value.  */
 static inline int
 checkint (uint64_t iy)
 {
diff --git a/math/pow_log_data.c b/math/pow_log_data.c
index 1b73b7f..6d3bf5e 100644
--- a/math/pow_log_data.c
+++ b/math/pow_log_data.c
@@ -38,6 +38,28 @@ const struct pow_log_data __pow_log_data = {
 -0x1.0002b8b263fc3p-3 * -8,
 #endif
 },
+/* Algorithm:
+
+	x = 2^k z
+	log(x) = k ln2 + log(c) + log(z/c)
+	log(z/c) = poly(z/c - 1)
+
+where z is in [0x1.69555p-1; 0x1.69555p0] which is split into N subintervals
+and z falls into the ith one, then table entries are computed as
+
+	tab[i].invc = 1/c
+	tab[i].logc = round(0x1p43*log(c))/0x1p43
+	tab[i].logctail = (double)(log(c) - logc)
+
+where c is chosen near the center of the subinterval such that 1/c has only a
+few precision bits so z/c - 1 is exactly representible as double:
+
+	1/c = center < 1 ? round(N/center)/N : round(2*N/center)/N/2
+
+Note: |z/c - 1| < 1/N for the chosen c, |log(c) - logc - logctail| < 0x1p-97,
+the last few bits of logc are rounded away so k*ln2hi + logc has no rounding
+error and the interval for z is selected such that near x == 1, where log(x)
+is tiny, large cancellation error is avoided in logc + poly(z/c - 1).  */
 .tab = {
 #if N == 128
 #define A(a, b, c) {a, 0, b, c},
diff --git a/math/powf.c b/math/powf.c
index 5ea1e71..c4ea178 100644
--- a/math/powf.c
+++ b/math/powf.c
@@ -121,7 +121,8 @@ exp2_inline (double_t xd, uint32_t sign_bias)
   return y;
 }
 
-/* Returns 0 if not int, 1 if odd int, 2 if even int.  */
+/* Returns 0 if not int, 1 if odd int, 2 if even int.  The argument is
+   the bit representation of a non-zero finite floating-point value.  */
 static inline int
 checkint (uint32_t iy)
 {
diff --git a/math/sincosf.c b/math/sincosf.c
index 5b13d72..1af41f3 100644
--- a/math/sincosf.c
+++ b/math/sincosf.c
@@ -26,11 +26,10 @@
 #include "math_config.h"
 #include "sincosf.h"
 
-/* Fast sincosf implementation.  Worst-case ULP is 0.56072, maximum relative
-   error is 0.5303p-23.  A single-step signed range reduction is used for
+/* Fast sincosf implementation.  Worst-case ULP is 0.5607, maximum relative
+   error is 0.5303 * 2^-23.  A single-step range reduction is used for
    small values.  Large inputs have their range reduced using fast integer
-   arithmetic.
-*/
+   arithmetic.  */
 void
 sincosf (float y, float *sinp, float *cosp)
 {
diff --git a/math/sincosf.h b/math/sincosf.h
index 4aced58..395a30e 100644
--- a/math/sincosf.h
+++ b/math/sincosf.h
@@ -21,19 +21,25 @@
 #include <math.h>
 #include "math_config.h"
 
-/* PI * 2^-64.  */
-static const double pi64 = 0x1.921FB54442D18p-62;
+/* 2PI * 2^-64.  */
+static const double pi63 = 0x1.921FB54442D18p-62;
 /* PI / 4.  */
 static const double pio4 = 0x1.921FB54442D18p-1;
 
+/* The constants and polynomials for sine and cosine.  */
 typedef struct
 {
-  double sign[4];
-  double hpi_inv, hpi, c0, c1, c2, c3, c4, s1, s2, s3;
+  double sign[4];		/* Sign of sine in quadrants 0..3.  */
+  double hpi_inv;		/* 2 / PI ( * 2^24 if !TOINT_INTRINSICS).  */
+  double hpi;			/* PI / 2.  */
+  double c0, c1, c2, c3, c4;	/* Cosine polynomial.  */
+  double s1, s2, s3;		/* Sine polynomial.  */
 } sincos_t;
 
+/* Polynomial data (the cosine polynomial is negated in the 2nd entry).  */
 extern const sincos_t __sincosf_table[2] HIDDEN;
 
+/* Table with 4/PI to 192 bit precision.  */
 extern const uint32_t __inv_pio4[] HIDDEN;
 
 /* Top 12 bits of the float representation with the sign bit cleared.  */
@@ -107,18 +113,20 @@ sinf_poly (double x, double x2, const sincos_t *p, int n)
    X as a value between -PI/4 and PI/4 and store the quadrant in NP.
    The values for PI/2 and 2/PI are accessed via P.  Since PI/2 as a double
    is accurate to 55 bits and the worst-case cancellation happens at 6 * PI/4,
-   only 2 multiplies are required and the result is accurate for |X| <= 120.0.
-   Use round/lround if inlined, otherwise convert to int.  To avoid inaccuracies
-   introduced by truncating negative values, compute the quadrant * 2^24.  */
+   the result is accurate for |X| <= 120.0.  */
 static inline double
 reduce_fast (double x, const sincos_t *p, int *np)
 {
   double r;
 #if TOINT_INTRINSICS
+  /* Use fast round and lround instructions when available.  */
   r = x * p->hpi_inv;
   *np = converttoint (r);
   return x - roundtoint (r) * p->hpi;
 #else
+  /* Use scaled float to int conversion with explicit rounding.
+     hpi_inv is prescaled by 2^24 so the quadrant ends up in bits 24..31.
+     This avoids inaccuracies introduced by truncating negative values.  */
   r = x * p->hpi_inv;
   int n = ((int32_t)r + 0x800000) >> 24;
   *np = n;
@@ -126,7 +134,7 @@ reduce_fast (double x, const sincos_t *p, int *np)
 #endif
 }
 
-/* Reduce the range of XI to a multiple of PI/4 using fast integer arithmetic.
+/* Reduce the range of XI to a multiple of PI/2 using fast integer arithmetic.
    XI is a reinterpreted float and must be >= 2.0f (the sign bit is ignored).
    Return the modulo between -PI/4 and PI/4 and store the quadrant in NP.
    Reduction uses a table of 4/PI with 192 bits of precision.  A 32x96->128 bit
@@ -153,5 +161,5 @@ reduce_large (uint32_t xi, int *np)
   res0 -= n << 62;
   double x = (int64_t)res0;
   *np = n;
-  return x * pi64;
+  return x * pi63;
 }
diff --git a/math/sincosf_data.c b/math/sincosf_data.c
index deaed1b..3e7e5cd 100644
--- a/math/sincosf_data.c
+++ b/math/sincosf_data.c
@@ -30,7 +30,7 @@ const sincos_t __sincosf_table[2] =
 {
   {
     { 1.0, -1.0, -1.0, 1.0 },
-#if HAVE_FAST_ROUND
+#if TOINT_INTRINSICS
     0x1.45F306DC9C883p-1,
 #else
     0x1.45F306DC9C883p+23,
@@ -47,7 +47,7 @@ const sincos_t __sincosf_table[2] =
   },
   {
     { 1.0, -1.0, -1.0, 1.0 },
-#if HAVE_FAST_ROUND
+#if TOINT_INTRINSICS
     0x1.45F306DC9C883p-1,
 #else
     0x1.45F306DC9C883p+23,
diff --git a/math/sinf.c b/math/sinf.c
index 5369d50..3f35923 100644
--- a/math/sinf.c
+++ b/math/sinf.c
@@ -25,11 +25,10 @@
 #include "math_config.h"
 #include "sincosf.h"
 
-/* Fast sinf implementation.  Worst-case ULP is 0.56072, maximum relative
-   error is 0.5303p-23.  A single-step signed range reduction is used for
+/* Fast sinf implementation.  Worst-case ULP is 0.5607, maximum relative
+   error is 0.5303 * 2^-23.  A single-step range reduction is used for
    small values.  Large inputs have their range reduced using fast integer
-   arithmetic.
-*/
+   arithmetic.  */
 float
 sinf (float y)
 {
author	Elliott Hughes <enh@google.com>	2018-10-29 15:58:24 -0700
committer	android-build-merger <android-build-merger@google.com>	2018-10-29 15:58:24 -0700
commit	e5f6e157cee4e12b2ef08ffd178e84fa955b1220 (patch)
tree	a1998cb9f24af3c314114d4daf50b0e625f09386
parent	745c75bfff7c74033ba5948ad91b3efbe069cbc0 (diff)
parent	f1a22e7d49fb48bf61e2416e8ba4fece1db6c0e6 (diff)
download	arm-optimized-routines-e5f6e157cee4e12b2ef08ffd178e84fa955b1220.tar.gz