aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorElliott Hughes <enh@google.com>2018-10-29 15:58:24 -0700
committerandroid-build-merger <android-build-merger@google.com>2018-10-29 15:58:24 -0700
commite5f6e157cee4e12b2ef08ffd178e84fa955b1220 (patch)
treea1998cb9f24af3c314114d4daf50b0e625f09386
parent745c75bfff7c74033ba5948ad91b3efbe069cbc0 (diff)
parentf1a22e7d49fb48bf61e2416e8ba4fece1db6c0e6 (diff)
downloadarm-optimized-routines-e5f6e157cee4e12b2ef08ffd178e84fa955b1220.tar.gz
Merge remote-tracking branch 'aosp/upstream-master' into HEAD
am: f1a22e7d49 Change-Id: I935353d51d2affd9d83920f6a46e16be6f735c7f
-rw-r--r--METADATA13
-rwxr-xr-xauxiliary/remez.jl117
-rw-r--r--math/cosf.c7
-rw-r--r--math/log2_data.c26
-rw-r--r--math/log_data.c26
-rw-r--r--math/math_config.h2
-rw-r--r--math/pow.c3
-rw-r--r--math/pow_log_data.c22
-rw-r--r--math/powf.c3
-rw-r--r--math/sincosf.c7
-rw-r--r--math/sincosf.h26
-rw-r--r--math/sincosf_data.c4
-rw-r--r--math/sinf.c7
13 files changed, 186 insertions, 77 deletions
diff --git a/METADATA b/METADATA
index 05f7942..eb07e87 100644
--- a/METADATA
+++ b/METADATA
@@ -1,8 +1,5 @@
name: "ARM-software/optimized-routines"
-description:
- "Optimized implementations of various library functions for ARM "
- "architecture processors "
-
+description: "Optimized implementations of various library functions for ARM architecture processors "
third_party {
url {
type: HOMEPAGE
@@ -12,7 +9,11 @@ third_party {
type: GIT
value: "https://github.com/ARM-software/optimized-routines.git"
}
- version: "41ed0e600fb5a37ba96742fdbadd3048e62d5045"
- last_upgrade_date { year: 2018 month: 7 day: 27 }
+ version: "e875f40f0b2ad71c5381a431e6d71829770c7ab7"
license_type: NOTICE
+ last_upgrade_date {
+ year: 2018
+ month: 10
+ day: 29
+ }
}
diff --git a/auxiliary/remez.jl b/auxiliary/remez.jl
index 31266e9..01e62f9 100755
--- a/auxiliary/remez.jl
+++ b/auxiliary/remez.jl
@@ -18,6 +18,30 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+import Base.\
+
+# ----------------------------------------------------------------------
+# Helper functions to cope with different Julia versions.
+if VERSION >= v"0.7.0"
+ array1d(T, d) = Array{T, 1}(undef, d)
+ array2d(T, d1, d2) = Array{T, 2}(undef, d1, d2)
+else
+ array1d(T, d) = Array(T, d)
+ array2d(T, d1, d2) = Array(T, d1, d2)
+end
+if VERSION < v"0.5.0"
+ String = ASCIIString
+end
+if VERSION >= v"0.6.0"
+ # Use Base.invokelatest to run functions made using eval(), to
+ # avoid "world age" error
+ run(f, x...) = Base.invokelatest(f, x...)
+else
+ # Prior to 0.6.0, invokelatest doesn't exist (but fortunately the
+ # world age problem also doesn't seem to exist)
+ run(f, x...) = f(x...)
+end
+
# ----------------------------------------------------------------------
# Global variables configured by command-line options.
floatsuffix = "" # adjusted by --floatsuffix
@@ -26,7 +50,7 @@ epsbits = 256 # adjusted by --bits
debug_facilities = Set() # adjusted by --debug
full_output = false # adjusted by --full
array_format = false # adjusted by --array
-preliminary_commands = String[] # adjusted by --pre
+preliminary_commands = array1d(String, 0) # adjusted by --pre
# ----------------------------------------------------------------------
# Diagnostic and utility functions.
@@ -56,7 +80,7 @@ macro debug(facility, printargs...)
for arg in printargs
printit = quote
$printit
- print($arg)
+ print($(esc(arg)))
end
end
return quote
@@ -156,7 +180,7 @@ end
# xys Array of (x,y) pairs of BigFloats.
#
# Return value is a string.
-function format_xylist(xys::Array{(BigFloat,BigFloat)})
+function format_xylist(xys::Array{Tuple{BigFloat,BigFloat}})
return ("[\n" *
join([" "*string(x)*" -> "*string(y) for (x,y) in xys], "\n") *
"\n]")
@@ -354,8 +378,8 @@ function ratfn_leastsquares(f::Function, xvals::Array{BigFloat}, n, d,
# Build the matrix. We're solving n+d+1 equations in n+d+1
# unknowns. (We actually have to return n+d+2 coefficients, but
# one of them is hardwired to 1.)
- matrix = Array(BigFloat, n+d+1, n+d+1)
- vector = Array(BigFloat, n+d+1)
+ matrix = array2d(BigFloat, n+d+1, n+d+1)
+ vector = array1d(BigFloat, n+d+1)
for i = 0:1:n
# Equation obtained by differentiating with respect to p_i,
# i.e. the numerator coefficient of x^i.
@@ -484,7 +508,7 @@ end
# the extremum location and its value (i.e. y=f(x)).
function find_extrema(f::Function, grid::Array{BigFloat})
len = length(grid)
- extrema = Array((BigFloat, BigFloat), 0)
+ extrema = array1d(Tuple{BigFloat, BigFloat}, 0)
for i = 1:1:len
# We have to provide goldensection() with three points
# bracketing the extremum. If the extremum is at one end of
@@ -529,11 +553,11 @@ end
# Returns a new array of (x,y) pairs which is a subsequence of the
# original sequence. (So, in particular, if the input was sorted by x
# then so will the output be.)
-function winnow_extrema(extrema::Array{(BigFloat,BigFloat)}, n)
+function winnow_extrema(extrema::Array{Tuple{BigFloat,BigFloat}}, n)
# best[i,j] gives the best sequence so far of length i and with
# sign j (where signs are coded as 1=positive, 2=negative), in the
# form of a tuple (cost, actual array of x,y pairs).
- best = fill((BigFloat(0), Array((BigFloat,BigFloat),0)), n, 2)
+ best = fill((BigFloat(0), array1d(Tuple{BigFloat,BigFloat}, 0)), n, 2)
for (x,y) = extrema
if y > 0
@@ -624,8 +648,8 @@ function ratfn_equal_deviation(f::Function, coords::Array{BigFloat},
# in which the p_i and e are the variables, and the powers of
# x and calls to w and f are the coefficients.
- matrix = Array(BigFloat, n+2, n+2)
- vector = Array(BigFloat, n+2)
+ matrix = array2d(BigFloat, n+2, n+2)
+ vector = array1d(BigFloat, n+2)
currsign = +1
for i = 1:1:n+2
x = coords[i]
@@ -678,8 +702,8 @@ function ratfn_equal_deviation(f::Function, coords::Array{BigFloat},
# based on the first n+d+1 of the n+d+2 coordinates, and
# see what the error turns out to be at the final
# coordinate.
- matrix = Array(BigFloat, n+d+1, n+d+1)
- vector = Array(BigFloat, n+d+1)
+ matrix = array2d(BigFloat, n+d+1, n+d+1)
+ vector = array1d(BigFloat, n+d+1)
currsign = +1
for i = 1:1:n+d+1
x = coords[i]
@@ -787,7 +811,7 @@ end
# means that the Chebyshev alternation theorem guarantees
# that any other function of the same degree must exceed
# the error of this one at at least one of those points.
-function ratfn_minimax(f::Function, interval::(BigFloat,BigFloat), n, d,
+function ratfn_minimax(f::Function, interval::Tuple{BigFloat,BigFloat}, n, d,
w = (x,y)->BigFloat(1))
# We start off by finding a least-squares approximation. This
# doesn't need to be perfect, but if we can get it reasonably good
@@ -934,7 +958,7 @@ function test()
# Test leastsquares rational functions.
println("Leastsquares test 1:")
n = 10000
- a = Array(BigFloat, n+1)
+ a = array1d(BigFloat, n+1)
for i = 0:1:n
a[1+i] = i/BigFloat(n)
end
@@ -947,7 +971,7 @@ function test()
# Test golden section search.
println("Golden section test 1:")
x, y = goldensection(x->sin(x),
- BigFloat(0), BigFloat("0.1"), BigFloat(4))
+ BigFloat(0), BigFloat(1)/10, BigFloat(4))
println(" ", x, " -> ", y)
test(approx_eq(x, asin(BigFloat(1))))
test(approx_eq(y, 1))
@@ -955,7 +979,7 @@ function test()
# Test extrema-winnowing algorithm.
println("Winnow test 1:")
extrema = [(x, sin(20*x)*sin(197*x))
- for x in BigFloat(0):BigFloat("0.001"):BigFloat(1)]
+ for x in BigFloat(0):BigFloat(1)/1000:BigFloat(1)]
winnowed = winnow_extrema(extrema, 12)
println(" ret = ", format_xylist(winnowed))
prevx, prevy = -1, 0
@@ -973,7 +997,7 @@ function test()
println(" ",e)
println(" ",ratfn_to_string(nc, dc))
test(0 < e < 1e-3)
- for x = 0:BigFloat(0.001):1
+ for x = 0:BigFloat(1)/1000:1
test(abs(ratfn_eval(nc, dc, x) - exp(x)) <= e * 1.0000001)
end
@@ -982,7 +1006,7 @@ function test()
println(" ",e)
println(" ",ratfn_to_string(nc, dc))
test(0 < e < 1e-3)
- for x = 0:BigFloat(0.001):1
+ for x = 0:BigFloat(1)/1000:1
test(abs(ratfn_eval(nc, dc, x) - exp(x)) <= e * 1.0000001)
end
@@ -992,7 +1016,7 @@ function test()
println(" ",e)
println(" ",ratfn_to_string(nc, dc))
test(0 < e < 1e-3)
- for x = 0:BigFloat(0.001):1
+ for x = 0:BigFloat(1)/1000:1
test(abs(ratfn_eval(nc, dc, x) - exp(x))/exp(x) <= e * 1.0000001)
end
@@ -1002,7 +1026,7 @@ function test()
println(" ",e)
println(" ",ratfn_to_string(nc, dc))
test(0 < e < 1e-3)
- for x = 0:BigFloat(0.001):1
+ for x = 0:BigFloat(1)/1000:1
test(abs(ratfn_eval(nc, dc, x) - exp(x))/exp(x) <= e * 1.0000001)
end
@@ -1012,12 +1036,12 @@ function test()
println(" ",e)
println(" ",ratfn_to_string(nc, dc))
test(0 < e < 1e-3)
- for x = 0:BigFloat(0.001):1
+ for x = 0:BigFloat(1)/1000:1
test(abs(ratfn_eval(nc, dc, x) - exp(x))/exp(x) <= e * 1.0000001)
end
total = passes + fails
- @printf "%d passes %d fails %d total\n" passes fails total
+ println(passes, " passes ", fails, " fails ", total, " total")
end
# ----------------------------------------------------------------------
@@ -1206,14 +1230,14 @@ function main()
end
for preliminary_command in preliminary_commands
- eval(parse(preliminary_command))
+ eval(Meta.parse(preliminary_command))
end
- lo = BigFloat(eval(parse(argwords[1])))
- hi = BigFloat(eval(parse(argwords[2])))
- n = int(argwords[3])
- d = int(argwords[4])
- f = eval(parse("x -> " * argwords[5]))
+ lo = BigFloat(eval(Meta.parse(argwords[1])))
+ hi = BigFloat(eval(Meta.parse(argwords[2])))
+ n = parse(Int,argwords[3])
+ d = parse(Int,argwords[4])
+ f = eval(Meta.parse("x -> " * argwords[5]))
# Wrap the user-provided function with a function of our own. This
# arranges to detect silly FP values (inf,nan) early and diagnose
@@ -1221,7 +1245,7 @@ function main()
# function in case you suspect it's doing the wrong thing at some
# special-case point.
function func(x)
- y = f(x)
+ y = run(f,x)
@debug("f", x, " -> ", y)
if !isfinite(y)
error("f(" * string(x) * ") returned non-finite value " * string(y))
@@ -1231,9 +1255,9 @@ function main()
if nargs == 6
# Wrap the user-provided weight function similarly.
- w = eval(parse("(x,y) -> " * argwords[6]))
+ w = eval(Meta.parse("(x,y) -> " * argwords[6]))
function wrapped_weight(x,y)
- ww = w(x,y)
+ ww = run(w,x,y)
if !isfinite(ww)
error("w(" * string(x) * "," * string(y) *
") returned non-finite value " * string(ww))
@@ -1283,9 +1307,11 @@ end
what_to_do = main
doing_opts = true
-argwords = Array(ASCIIString, 0)
+argwords = array1d(String, 0)
for arg = ARGS
- if doing_opts && beginswith(arg, "-")
+ global doing_opts, what_to_do, argwords
+ global full_output, array_format, xvarname, floatsuffix, epsbits
+ if doing_opts && startswith(arg, "-")
if arg == "--"
doing_opts = false
elseif arg == "--help"
@@ -1296,18 +1322,19 @@ for arg = ARGS
full_output = true
elseif arg == "--array"
array_format = true
- elseif beginswith(arg, "--debug=")
- enable_debug(arg[length("--debug=")+1:])
- elseif beginswith(arg, "--variable=")
- xvarname = arg[length("--variable=")+1:]
- elseif beginswith(arg, "--suffix=")
- floatsuffix = arg[length("--suffix=")+1:]
- elseif beginswith(arg, "--bits=")
- epsbits = int(arg[length("--bits=")+1:])
- elseif beginswith(arg, "--bigfloatbits=")
- set_bigfloat_precision(int(arg[length("--bigfloatbits=")+1:]))
- elseif beginswith(arg, "--pre=")
- push!(preliminary_commands, arg[length("--pre=")+1:])
+ elseif startswith(arg, "--debug=")
+ enable_debug(arg[length("--debug=")+1:end])
+ elseif startswith(arg, "--variable=")
+ xvarname = arg[length("--variable=")+1:end]
+ elseif startswith(arg, "--suffix=")
+ floatsuffix = arg[length("--suffix=")+1:end]
+ elseif startswith(arg, "--bits=")
+ epsbits = parse(Int,arg[length("--bits=")+1:end])
+ elseif startswith(arg, "--bigfloatbits=")
+ set_bigfloat_precision(
+ parse(Int,arg[length("--bigfloatbits=")+1:end]))
+ elseif startswith(arg, "--pre=")
+ push!(preliminary_commands, arg[length("--pre=")+1:end])
else
error("unrecognised option: ", arg)
end
diff --git a/math/cosf.c b/math/cosf.c
index 7d57d06..5a635d1 100644
--- a/math/cosf.c
+++ b/math/cosf.c
@@ -26,11 +26,10 @@
#include "math_config.h"
#include "sincosf.h"
-/* Fast cosf implementation. Worst-case ULP is 0.56072, maximum relative
- error is 0.5303p-23. A single-step signed range reduction is used for
+/* Fast cosf implementation. Worst-case ULP is 0.5607, maximum relative
+ error is 0.5303 * 2^-23. A single-step range reduction is used for
small values. Large inputs have their range reduced using fast integer
- arithmetic.
-*/
+ arithmetic. */
float
cosf (float y)
{
diff --git a/math/log2_data.c b/math/log2_data.c
index 356662f..d2eba89 100644
--- a/math/log2_data.c
+++ b/math/log2_data.c
@@ -54,6 +54,32 @@ const struct log2_data __log2_data = {
0x1.a6225e117f92ep-3,
#endif
},
+/* Algorithm:
+
+ x = 2^k z
+ log2(x) = k + log2(c) + log2(z/c)
+ log2(z/c) = poly(z/c - 1)
+
+where z is in [1.6p-1; 1.6p0] which is split into N subintervals and z falls
+into the ith one, then table entries are computed as
+
+ tab[i].invc = 1/c
+ tab[i].logc = (double)log2(c)
+ tab2[i].chi = (double)c
+ tab2[i].clo = (double)(c - (double)c)
+
+where c is near the center of the subinterval and is chosen by trying +-2^29
+floating point invc candidates around 1/center and selecting one for which
+
+ 1) the rounding error in 0x1.8p10 + logc is 0,
+ 2) the rounding error in z - chi - clo is < 0x1p-64 and
+ 3) the rounding error in (double)log2(c) is minimized (< 0x1p-68).
+
+Note: 1) ensures that k + logc can be computed without rounding error, 2)
+ensures that z/c - 1 can be computed as (z - chi - clo)*invc with close to a
+single rounding error when there is no fast fma for z*invc - 1, 3) ensures
+that logc + poly(z/c - 1) has small error, however near x == 1 when
+|log2(x)| < 0x1p-4, this is not enough so that is special cased. */
.tab = {
#if N == 64
{0x1.724286bb1acf8p+0, -0x1.1095feecdb000p-1},
diff --git a/math/log_data.c b/math/log_data.c
index 88c8a2e..2e2e478 100644
--- a/math/log_data.c
+++ b/math/log_data.c
@@ -98,6 +98,32 @@ const struct log_data __log_data = {
0x1.2493c29331a5cp-3,
#endif
},
+/* Algorithm:
+
+ x = 2^k z
+ log(x) = k ln2 + log(c) + log(z/c)
+ log(z/c) = poly(z/c - 1)
+
+where z is in [1.6p-1; 1.6p0] which is split into N subintervals and z falls
+into the ith one, then table entries are computed as
+
+ tab[i].invc = 1/c
+ tab[i].logc = (double)log(c)
+ tab2[i].chi = (double)c
+ tab2[i].clo = (double)(c - (double)c)
+
+where c is near the center of the subinterval and is chosen by trying +-2^29
+floating point invc candidates around 1/center and selecting one for which
+
+ 1) the rounding error in 0x1.8p9 + logc is 0,
+ 2) the rounding error in z - chi - clo is < 0x1p-66 and
+ 3) the rounding error in (double)log(c) is minimized (< 0x1p-66).
+
+Note: 1) ensures that k*ln2hi + logc can be computed without rounding error,
+2) ensures that z/c - 1 can be computed as (z - chi - clo)*invc with close to
+a single rounding error when there is no fast fma for z*invc - 1, 3) ensures
+that logc + poly(z/c - 1) has small error, however near x == 1 when
+|log(x)| < 0x1p-4, this is not enough so that is special cased. */
.tab = {
#if N == 64
{0x1.7242886495cd8p+0, -0x1.79e267bdfe000p-2},
diff --git a/math/math_config.h b/math/math_config.h
index ce6fac8..ddac41c 100644
--- a/math/math_config.h
+++ b/math/math_config.h
@@ -56,7 +56,7 @@
/* Compiler can inline fma as a single instruction. */
#ifndef HAVE_FAST_FMA
-# ifdef FP_FAST_FMA
+# if defined FP_FAST_FMA || __aarch64__
# define HAVE_FAST_FMA 1
# else
# define HAVE_FAST_FMA 0
diff --git a/math/pow.c b/math/pow.c
index 9f953e9..2b77dc6 100644
--- a/math/pow.c
+++ b/math/pow.c
@@ -256,7 +256,8 @@ exp_inline (double x, double xtail, uint32_t sign_bias)
return scale + scale * tmp;
}
-/* Returns 0 if not int, 1 if odd int, 2 if even int. */
+/* Returns 0 if not int, 1 if odd int, 2 if even int. The argument is
+ the bit representation of a non-zero finite floating-point value. */
static inline int
checkint (uint64_t iy)
{
diff --git a/math/pow_log_data.c b/math/pow_log_data.c
index 1b73b7f..6d3bf5e 100644
--- a/math/pow_log_data.c
+++ b/math/pow_log_data.c
@@ -38,6 +38,28 @@ const struct pow_log_data __pow_log_data = {
-0x1.0002b8b263fc3p-3 * -8,
#endif
},
+/* Algorithm:
+
+ x = 2^k z
+ log(x) = k ln2 + log(c) + log(z/c)
+ log(z/c) = poly(z/c - 1)
+
+where z is in [0x1.69555p-1; 0x1.69555p0] which is split into N subintervals
+and z falls into the ith one, then table entries are computed as
+
+ tab[i].invc = 1/c
+ tab[i].logc = round(0x1p43*log(c))/0x1p43
+ tab[i].logctail = (double)(log(c) - logc)
+
+where c is chosen near the center of the subinterval such that 1/c has only a
+few precision bits so z/c - 1 is exactly representible as double:
+
+ 1/c = center < 1 ? round(N/center)/N : round(2*N/center)/N/2
+
+Note: |z/c - 1| < 1/N for the chosen c, |log(c) - logc - logctail| < 0x1p-97,
+the last few bits of logc are rounded away so k*ln2hi + logc has no rounding
+error and the interval for z is selected such that near x == 1, where log(x)
+is tiny, large cancellation error is avoided in logc + poly(z/c - 1). */
.tab = {
#if N == 128
#define A(a, b, c) {a, 0, b, c},
diff --git a/math/powf.c b/math/powf.c
index 5ea1e71..c4ea178 100644
--- a/math/powf.c
+++ b/math/powf.c
@@ -121,7 +121,8 @@ exp2_inline (double_t xd, uint32_t sign_bias)
return y;
}
-/* Returns 0 if not int, 1 if odd int, 2 if even int. */
+/* Returns 0 if not int, 1 if odd int, 2 if even int. The argument is
+ the bit representation of a non-zero finite floating-point value. */
static inline int
checkint (uint32_t iy)
{
diff --git a/math/sincosf.c b/math/sincosf.c
index 5b13d72..1af41f3 100644
--- a/math/sincosf.c
+++ b/math/sincosf.c
@@ -26,11 +26,10 @@
#include "math_config.h"
#include "sincosf.h"
-/* Fast sincosf implementation. Worst-case ULP is 0.56072, maximum relative
- error is 0.5303p-23. A single-step signed range reduction is used for
+/* Fast sincosf implementation. Worst-case ULP is 0.5607, maximum relative
+ error is 0.5303 * 2^-23. A single-step range reduction is used for
small values. Large inputs have their range reduced using fast integer
- arithmetic.
-*/
+ arithmetic. */
void
sincosf (float y, float *sinp, float *cosp)
{
diff --git a/math/sincosf.h b/math/sincosf.h
index 4aced58..395a30e 100644
--- a/math/sincosf.h
+++ b/math/sincosf.h
@@ -21,19 +21,25 @@
#include <math.h>
#include "math_config.h"
-/* PI * 2^-64. */
-static const double pi64 = 0x1.921FB54442D18p-62;
+/* 2PI * 2^-64. */
+static const double pi63 = 0x1.921FB54442D18p-62;
/* PI / 4. */
static const double pio4 = 0x1.921FB54442D18p-1;
+/* The constants and polynomials for sine and cosine. */
typedef struct
{
- double sign[4];
- double hpi_inv, hpi, c0, c1, c2, c3, c4, s1, s2, s3;
+ double sign[4]; /* Sign of sine in quadrants 0..3. */
+ double hpi_inv; /* 2 / PI ( * 2^24 if !TOINT_INTRINSICS). */
+ double hpi; /* PI / 2. */
+ double c0, c1, c2, c3, c4; /* Cosine polynomial. */
+ double s1, s2, s3; /* Sine polynomial. */
} sincos_t;
+/* Polynomial data (the cosine polynomial is negated in the 2nd entry). */
extern const sincos_t __sincosf_table[2] HIDDEN;
+/* Table with 4/PI to 192 bit precision. */
extern const uint32_t __inv_pio4[] HIDDEN;
/* Top 12 bits of the float representation with the sign bit cleared. */
@@ -107,18 +113,20 @@ sinf_poly (double x, double x2, const sincos_t *p, int n)
X as a value between -PI/4 and PI/4 and store the quadrant in NP.
The values for PI/2 and 2/PI are accessed via P. Since PI/2 as a double
is accurate to 55 bits and the worst-case cancellation happens at 6 * PI/4,
- only 2 multiplies are required and the result is accurate for |X| <= 120.0.
- Use round/lround if inlined, otherwise convert to int. To avoid inaccuracies
- introduced by truncating negative values, compute the quadrant * 2^24. */
+ the result is accurate for |X| <= 120.0. */
static inline double
reduce_fast (double x, const sincos_t *p, int *np)
{
double r;
#if TOINT_INTRINSICS
+ /* Use fast round and lround instructions when available. */
r = x * p->hpi_inv;
*np = converttoint (r);
return x - roundtoint (r) * p->hpi;
#else
+ /* Use scaled float to int conversion with explicit rounding.
+ hpi_inv is prescaled by 2^24 so the quadrant ends up in bits 24..31.
+ This avoids inaccuracies introduced by truncating negative values. */
r = x * p->hpi_inv;
int n = ((int32_t)r + 0x800000) >> 24;
*np = n;
@@ -126,7 +134,7 @@ reduce_fast (double x, const sincos_t *p, int *np)
#endif
}
-/* Reduce the range of XI to a multiple of PI/4 using fast integer arithmetic.
+/* Reduce the range of XI to a multiple of PI/2 using fast integer arithmetic.
XI is a reinterpreted float and must be >= 2.0f (the sign bit is ignored).
Return the modulo between -PI/4 and PI/4 and store the quadrant in NP.
Reduction uses a table of 4/PI with 192 bits of precision. A 32x96->128 bit
@@ -153,5 +161,5 @@ reduce_large (uint32_t xi, int *np)
res0 -= n << 62;
double x = (int64_t)res0;
*np = n;
- return x * pi64;
+ return x * pi63;
}
diff --git a/math/sincosf_data.c b/math/sincosf_data.c
index deaed1b..3e7e5cd 100644
--- a/math/sincosf_data.c
+++ b/math/sincosf_data.c
@@ -30,7 +30,7 @@ const sincos_t __sincosf_table[2] =
{
{
{ 1.0, -1.0, -1.0, 1.0 },
-#if HAVE_FAST_ROUND
+#if TOINT_INTRINSICS
0x1.45F306DC9C883p-1,
#else
0x1.45F306DC9C883p+23,
@@ -47,7 +47,7 @@ const sincos_t __sincosf_table[2] =
},
{
{ 1.0, -1.0, -1.0, 1.0 },
-#if HAVE_FAST_ROUND
+#if TOINT_INTRINSICS
0x1.45F306DC9C883p-1,
#else
0x1.45F306DC9C883p+23,
diff --git a/math/sinf.c b/math/sinf.c
index 5369d50..3f35923 100644
--- a/math/sinf.c
+++ b/math/sinf.c
@@ -25,11 +25,10 @@
#include "math_config.h"
#include "sincosf.h"
-/* Fast sinf implementation. Worst-case ULP is 0.56072, maximum relative
- error is 0.5303p-23. A single-step signed range reduction is used for
+/* Fast sinf implementation. Worst-case ULP is 0.5607, maximum relative
+ error is 0.5303 * 2^-23. A single-step range reduction is used for
small values. Large inputs have their range reduced using fast integer
- arithmetic.
-*/
+ arithmetic. */
float
sinf (float y)
{