diff options
author | Szabolcs Nagy <szabolcs.nagy@arm.com> | 2019-07-18 10:21:31 +0100 |
---|---|---|
committer | Szabolcs Nagy <szabolcs.nagy@arm.com> | 2019-07-18 10:21:31 +0100 |
commit | 4f408e745648aeae2f769e5675ddaed41e10b936 (patch) | |
tree | bb5c5430148a1d963c403baa6c99d2614fa85ecf /math | |
parent | 3a1d8e6f16d21cfda6f33e43a1d1df98a05c4dde (diff) | |
download | arm-optimized-routines-4f408e745648aeae2f769e5675ddaed41e10b936.tar.gz |
Remove math/single and rem_pio2
math/single contained code for systems without double precision fpu
and rem_pio2 is not used currently and likely will be designed
differently when double precision trigonometric functions are added.
Diffstat (limited to 'math')
-rw-r--r-- | math/cosf.c | 6 | ||||
-rw-r--r-- | math/expf.c | 5 | ||||
-rw-r--r-- | math/funder.c | 3 | ||||
-rw-r--r-- | math/logf.c | 5 | ||||
-rw-r--r-- | math/logf_data.c | 3 | ||||
-rw-r--r-- | math/powf.c | 5 | ||||
-rw-r--r-- | math/powf_log2_data.c | 3 | ||||
-rw-r--r-- | math/rem_pio2.c | 1 | ||||
-rw-r--r-- | math/rredf.c | 3 | ||||
-rw-r--r-- | math/sincosf.c | 6 | ||||
-rw-r--r-- | math/sincosf_data.c | 4 | ||||
-rw-r--r-- | math/sinf.c | 6 | ||||
-rw-r--r-- | math/single/dunder.c | 52 | ||||
-rw-r--r-- | math/single/e_expf.c | 117 | ||||
-rw-r--r-- | math/single/e_logf.c | 153 | ||||
-rw-r--r-- | math/single/e_powf.c | 658 | ||||
-rw-r--r-- | math/single/e_rem_pio2.c | 268 | ||||
-rw-r--r-- | math/single/funder.c | 51 | ||||
-rw-r--r-- | math/single/ieee_status.c | 38 | ||||
-rw-r--r-- | math/single/math_private.h | 138 | ||||
-rw-r--r-- | math/single/poly.c | 25 | ||||
-rw-r--r-- | math/single/rredf.c | 239 | ||||
-rw-r--r-- | math/single/rredf.h | 146 | ||||
-rw-r--r-- | math/single/s_cosf.c | 11 | ||||
-rw-r--r-- | math/single/s_sincosf.c | 109 | ||||
-rw-r--r-- | math/single/s_sinf.c | 11 | ||||
-rw-r--r-- | math/single/s_tanf.c | 77 | ||||
-rw-r--r-- | math/tanf.c | 3 |
28 files changed, 0 insertions, 2146 deletions
diff --git a/math/cosf.c b/math/cosf.c index 84a138f..831b39e 100644 --- a/math/cosf.c +++ b/math/cosf.c @@ -5,10 +5,6 @@ * SPDX-License-Identifier: MIT */ -#if WANT_SINGLEPREC -#include "single/s_cosf.c" -#else - #include <stdint.h> #include <math.h> #include "math_config.h" @@ -65,5 +61,3 @@ cosf (float y) else return __math_invalidf (y); } - -#endif diff --git a/math/expf.c b/math/expf.c index f8238a7..0fe1f7d 100644 --- a/math/expf.c +++ b/math/expf.c @@ -5,10 +5,6 @@ * SPDX-License-Identifier: MIT */ -#if WANT_SINGLEPREC -#include "single/e_expf.c" -#else - #include <math.h> #include <stdint.h> #include "math_config.h" @@ -93,4 +89,3 @@ expf (float x) strong_alias (expf, __expf_finite) hidden_alias (expf, __ieee754_expf) #endif -#endif diff --git a/math/funder.c b/math/funder.c deleted file mode 100644 index 14fdd2e..0000000 --- a/math/funder.c +++ /dev/null @@ -1,3 +0,0 @@ -#if WANT_SINGLEPREC -#include "single/funder.c" -#endif diff --git a/math/logf.c b/math/logf.c index 9c3657b..ee3120a 100644 --- a/math/logf.c +++ b/math/logf.c @@ -5,10 +5,6 @@ * SPDX-License-Identifier: MIT */ -#if WANT_SINGLEPREC -#include "single/e_logf.c" -#else - #include <math.h> #include <stdint.h> #include "math_config.h" @@ -81,4 +77,3 @@ logf (float x) strong_alias (logf, __logf_finite) hidden_alias (logf, __ieee754_logf) #endif -#endif diff --git a/math/logf_data.c b/math/logf_data.c index cee9271..53c5f62 100644 --- a/math/logf_data.c +++ b/math/logf_data.c @@ -5,8 +5,6 @@ * SPDX-License-Identifier: MIT */ -#if !WANT_SINGLEPREC - #include "math_config.h" const struct logf_data __logf_data = { @@ -33,4 +31,3 @@ const struct logf_data __logf_data = { -0x1.00ea348b88334p-2, 0x1.5575b0be00b6ap-2, -0x1.ffffef20a4123p-2, } }; -#endif diff --git a/math/powf.c b/math/powf.c index 1149842..1534a09 100644 --- a/math/powf.c +++ b/math/powf.c @@ -5,10 +5,6 @@ * SPDX-License-Identifier: MIT */ -#if WANT_SINGLEPREC -#include "single/e_powf.c" -#else - #include <math.h> #include <stdint.h> #include "math_config.h" @@ -223,4 +219,3 @@ powf (float x, float y) strong_alias (powf, __powf_finite) hidden_alias (powf, __ieee754_powf) #endif -#endif diff --git a/math/powf_log2_data.c b/math/powf_log2_data.c index 285d1bd..b9fbdc4 100644 --- a/math/powf_log2_data.c +++ b/math/powf_log2_data.c @@ -5,8 +5,6 @@ * SPDX-License-Identifier: MIT */ -#if !WANT_SINGLEPREC - #include "math_config.h" const struct powf_log2_data __powf_log2_data = { @@ -34,4 +32,3 @@ const struct powf_log2_data __powf_log2_data = { 0x1.71547652ab82bp0 * POWF_SCALE, } }; -#endif diff --git a/math/rem_pio2.c b/math/rem_pio2.c deleted file mode 100644 index 16edd74..0000000 --- a/math/rem_pio2.c +++ /dev/null @@ -1 +0,0 @@ -#include "single/e_rem_pio2.c" diff --git a/math/rredf.c b/math/rredf.c deleted file mode 100644 index fde20f0..0000000 --- a/math/rredf.c +++ /dev/null @@ -1,3 +0,0 @@ -#if WANT_SINGLEPREC -#include "single/rredf.c" -#endif diff --git a/math/sincosf.c b/math/sincosf.c index fb6a29d..e6cd41e 100644 --- a/math/sincosf.c +++ b/math/sincosf.c @@ -5,10 +5,6 @@ * SPDX-License-Identifier: MIT */ -#if WANT_SINGLEPREC -#include "single/s_sincosf.c" -#else - #include <stdint.h> #include <math.h> #include "math_config.h" @@ -81,5 +77,3 @@ sincosf (float y, float *sinp, float *cosp) #endif } } - -#endif diff --git a/math/sincosf_data.c b/math/sincosf_data.c index 2512f37..5d0b58e 100644 --- a/math/sincosf_data.c +++ b/math/sincosf_data.c @@ -5,8 +5,6 @@ * SPDX-License-Identifier: MIT */ -#if !WANT_SINGLEPREC - #include <stdint.h> #include <math.h> #include "math_config.h" @@ -63,5 +61,3 @@ const uint32_t __inv_pio4[24] = 0x34ddc0db, 0xddc0db62, 0xc0db6295, 0xdb629599, 0x6295993c, 0x95993c43, 0x993c4390, 0x3c439041 }; - -#endif diff --git a/math/sinf.c b/math/sinf.c index f624cde..770b294 100644 --- a/math/sinf.c +++ b/math/sinf.c @@ -5,10 +5,6 @@ * SPDX-License-Identifier: MIT */ -#if WANT_SINGLEPREC -#include "single/s_sinf.c" -#else - #include <math.h> #include "math_config.h" #include "sincosf.h" @@ -69,5 +65,3 @@ sinf (float y) else return __math_invalidf (y); } - -#endif diff --git a/math/single/dunder.c b/math/single/dunder.c deleted file mode 100644 index faec985..0000000 --- a/math/single/dunder.c +++ /dev/null @@ -1,52 +0,0 @@ -/* - * dunder.c - manually provoke FP exceptions for mathlib - * - * Copyright (c) 2009-2018, Arm Limited. - * SPDX-License-Identifier: MIT - */ - - -#include "math_private.h" -#include <fenv.h> - -__inline double __mathlib_dbl_infnan(double x) -{ - return x+x; -} - -__inline double __mathlib_dbl_infnan2(double x, double y) -{ - return x+y; -} - -double __mathlib_dbl_underflow(void) -{ -#ifdef CLANG_EXCEPTIONS - feraiseexcept(FE_UNDERFLOW); -#endif - return 0x1p-767 * 0x1p-767; -} - -double __mathlib_dbl_overflow(void) -{ -#ifdef CLANG_EXCEPTIONS - feraiseexcept(FE_OVERFLOW); -#endif - return 0x1p+769 * 0x1p+769; -} - -double __mathlib_dbl_invalid(void) -{ -#ifdef CLANG_EXCEPTIONS - feraiseexcept(FE_INVALID); -#endif - return 0.0 / 0.0; -} - -double __mathlib_dbl_divzero(void) -{ -#ifdef CLANG_EXCEPTIONS - feraiseexcept(FE_DIVBYZERO); -#endif - return 1.0 / 0.0; -} diff --git a/math/single/e_expf.c b/math/single/e_expf.c deleted file mode 100644 index 739fe1d..0000000 --- a/math/single/e_expf.c +++ /dev/null @@ -1,117 +0,0 @@ -/* - * e_expf.c - single-precision exp function - * - * Copyright (c) 2009-2018, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -/* - * Algorithm was once taken from Cody & Waite, but has been munged - * out of all recognition by SGT. - */ - -#include <math.h> -#include <errno.h> -#include "math_private.h" - -float -expf(float X) -{ - int N; float XN, g, Rg, Result; - unsigned ix = fai(X), edgecaseflag = 0; - - /* - * Handle infinities, NaNs and big numbers. - */ - if (__builtin_expect((ix << 1) - 0x67000000 > 0x85500000 - 0x67000000, 0)) { - if (!(0x7f800000 & ~ix)) { - if (ix == 0xff800000) - return 0.0f; - else - return FLOAT_INFNAN(X);/* do the right thing with both kinds of NaN and with +inf */ - } else if ((ix << 1) < 0x67000000) { - return 1.0f; /* magnitude so small the answer can't be distinguished from 1 */ - } else if ((ix << 1) > 0x85a00000) { - __set_errno(ERANGE); - if (ix & 0x80000000) { - return FLOAT_UNDERFLOW; - } else { - return FLOAT_OVERFLOW; - } - } else { - edgecaseflag = 1; - } - } - - /* - * Split the input into an integer multiple of log(2)/4, and a - * fractional part. - */ - XN = X * (4.0f*1.4426950408889634074f); -#ifdef __TARGET_FPU_SOFTVFP - XN = _frnd(XN); - N = (int)XN; -#else - N = (int)(XN + (ix & 0x80000000 ? -0.5f : 0.5f)); - XN = N; -#endif - g = (X - XN * 0x1.62ep-3F) - XN * 0x1.0bfbe8p-17F; /* prec-and-a-half representation of log(2)/4 */ - - /* - * Now we compute exp(X) in, conceptually, three parts: - * - a pure power of two which we get from N>>2 - * - exp(g) for g in [-log(2)/8,+log(2)/8], which we compute - * using a Remez-generated polynomial approximation - * - exp(k*log(2)/4) (aka 2^(k/4)) for k in [0..3], which we - * get from a lookup table in precision-and-a-half and - * multiply by g. - * - * We gain a bit of extra precision by the fact that actually - * our polynomial approximation gives us exp(g)-1, and we add - * the 1 back on by tweaking the prec-and-a-half multiplication - * step. - * - * Coefficients generated by the command - -./auxiliary/remez.jl --variable=g --suffix=f -- '-log(BigFloat(2))/8' '+log(BigFloat(2))/8' 3 0 '(expm1(x))/x' - - */ - Rg = g * ( - 9.999999412829185331953781321128516523408059996430919985217971370689774264850229e-01f+g*(4.999999608551332693833317084753864837160947932961832943901913087652889900683833e-01f+g*(1.667292360203016574303631953046104769969440903672618034272397630620346717392378e-01f+g*(4.168230895653321517750133783431970715648192153539929404872173693978116154823859e-02f))) - ); - - /* - * Do the table lookup and combine it with Rg, to get our final - * answer apart from the exponent. - */ - { - static const float twotokover4top[4] = { 0x1p+0F, 0x1.306p+0F, 0x1.6ap+0F, 0x1.ae8p+0F }; - static const float twotokover4bot[4] = { 0x0p+0F, 0x1.fc1464p-13F, 0x1.3cccfep-13F, 0x1.3f32b6p-13F }; - static const float twotokover4all[4] = { 0x1p+0F, 0x1.306fep+0F, 0x1.6a09e6p+0F, 0x1.ae89fap+0F }; - int index = (N & 3); - Rg = twotokover4top[index] + (twotokover4bot[index] + twotokover4all[index]*Rg); - N >>= 2; - } - - /* - * Combine the output exponent and mantissa, and return. - */ - if (__builtin_expect(edgecaseflag, 0)) { - Result = fhex(((N/2) << 23) + 0x3f800000); - Result *= Rg; - Result *= fhex(((N-N/2) << 23) + 0x3f800000); - /* - * Step not mentioned in C&W: set errno reliably. - */ - if (fai(Result) == 0) - return MATHERR_EXPF_UFL(Result); - if (fai(Result) == 0x7f800000) - return MATHERR_EXPF_OFL(Result); - return FLOAT_CHECKDENORM(Result); - } else { - Result = fhex(N * 8388608.0f + (float)0x3f800000); - Result *= Rg; - } - - return Result; -} diff --git a/math/single/e_logf.c b/math/single/e_logf.c deleted file mode 100644 index aa29489..0000000 --- a/math/single/e_logf.c +++ /dev/null @@ -1,153 +0,0 @@ -/* - * e_logf.c - single precision log function - * - * Copyright (c) 2009-2018, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -/* - * Algorithm was once taken from Cody & Waite, but has been munged - * out of all recognition by SGT. - */ - -#include <math.h> -#include <errno.h> -#include "math_private.h" - -float -logf(float X) -{ - int N = 0; - int aindex; - float a, x, s; - unsigned ix = fai(X); - - if (__builtin_expect((ix - 0x00800000) >= 0x7f800000 - 0x00800000, 0)) { - if ((ix << 1) > 0xff000000) /* NaN */ - return FLOAT_INFNAN(X); - if (ix == 0x7f800000) /* +inf */ - return X; - if (X < 0) { /* anything negative */ - return MATHERR_LOGF_NEG(X); - } - if (X == 0) { - return MATHERR_LOGF_0(X); - } - /* That leaves denormals. */ - N = -23; - X *= 0x1p+23F; - ix = fai(X); - } - - /* - * Separate X into three parts: - * - 2^N for some integer N - * - a number a of the form (1+k/8) for k=0,...,7 - * - a residual which we compute as s = (x-a)/(x+a), for - * x=X/2^N. - * - * We pick the _nearest_ (N,a) pair, so that (x-a) has magnitude - * at most 1/16. Hence, we must round things that are just - * _below_ a power of two up to the next power of two, so this - * isn't as simple as extracting the raw exponent of the FP - * number. Instead we must grab the exponent together with the - * top few bits of the mantissa, and round (in integers) there. - */ - { - int rounded = ix + 0x00080000; - int Nnew = (rounded >> 23) - 127; - aindex = (rounded >> 20) & 7; - a = fhex(0x3f800000 + (aindex << 20)); - N += Nnew; - x = fhex(ix - (Nnew << 23)); - } - - if (!N && !aindex) { - /* - * Use an alternative strategy for very small |x|, which - * avoids the 1ULP of relative error introduced in the - * computation of s. If our nearest (N,a) pair is N=0,a=1, - * that means we have -1/32 < x-a < 1/16, on which interval - * the ordinary series for log(1+z) (setting z-x-a) will - * converge adequately fast; so we can simply find an - * approximation to log(1+z)/z good on that interval and - * scale it by z on the way out. - * - * Coefficients generated by the command - -./auxiliary/remez.jl --variable=z --suffix=f -- '-1/BigFloat(32)' '+1/BigFloat(16)' 3 0 '(log1p(x)-x)/x^2' - - */ - float z = x - 1.0f; - float p = z*z * ( - -4.999999767382730053173434595877399055021398381370452534949864039404089549132551e-01f+z*(3.333416379155995401749506866323446447523793085809161350343357014272193712456391e-01f+z*(-2.501299948811686421962724839011563450757435183422532362736159418564644404218257e-01f+z*(1.903576945606738444146078468935429697455230136403008172485495359631510244557255e-01f))) - ); - - return z + p; - } - - /* - * Now we have N, a and x correct, so that |x-a| <= 1/16. - * Compute s. - * - * (Since |x+a| >= 2, this means that |s| will be at most 1/32.) - */ - s = (x - a) / (x + a); - - /* - * The point of computing s = (x-a)/(x+a) was that this makes x - * equal to a * (1+s)/(1-s). So we can now compute log(x) by - * means of computing log((1+s)/(1-s)) (which has a more - * efficiently converging series), and adding log(a) which we - * obtain from a lookup table. - * - * So our full answer to log(X) is now formed by adding together - * N*log(2) + log(a) + log((1+s)/(1-s)). - * - * Now log((1+s)/(1-s)) has the exact Taylor series - * - * 2s + 2s^3/3 + 2s^5/5 + ... - * - * and what we do is to compute all but the first term of that - * as a polynomial approximation in s^2, then add on the first - * term - and all the other bits and pieces above - in - * precision-and-a-half so as to keep the total error down. - */ - { - float s2 = s*s; - - /* - * We want a polynomial L(s^2) such that - * - * 2s + s^3*L(s^2) = log((1+s)/(1-s)) - * - * => L(s^2) = (log((1+s)/(1-s)) - 2s) / s^3 - * - * => L(z) = (log((1+sqrt(z))/(1-sqrt(z))) - 2*sqrt(z)) / sqrt(z)^3 - * - * The required range of the polynomial is only [0,1/32^2]. - * - * Our accuracy requirement for the polynomial approximation - * is that we don't want to introduce any error more than - * about 2^-23 times the _top_ bit of s. But the value of - * the polynomial has magnitude about s^3; and since |s| < - * 2^-5, this gives us |s^3/s| < 2^-10. In other words, - * our approximation only needs to be accurate to 13 bits or - * so before its error is lost in the noise when we add it - * to everything else. - * - * Coefficients generated by the command - -./auxiliary/remez.jl --variable=s2 --suffix=f -- '0' '1/BigFloat(32^2)' 1 0 '(abs(x) < 1e-20 ? BigFloat(2)/3 + 2*x/5 + 2*x^2/7 + 2*x^3/9 : (log((1+sqrt(x))/(1-sqrt(x)))-2*sqrt(x))/sqrt(x^3))' - - */ - float p = s * s2 * ( - 6.666666325680271091157649745099739739798210281016897722498744752867165138320995e-01f+s2*(4.002792299542401431889592846825025487338520940900492146195427243856292349188402e-01f) - ); - - static const float log2hi = 0x1.62ep-1F, log2lo = 0x1.0bfbe8p-15F; - static const float logahi[8] = { 0x0p+0F, 0x1.e26p-4F, 0x1.c8ep-3F, 0x1.46p-2F, 0x1.9f2p-2F, 0x1.f12p-2F, 0x1.1e8p-1F, 0x1.41cp-1F }; - static const float logalo[8] = { 0x0p+0F, 0x1.076e2ap-16F, 0x1.f7c79ap-15F, 0x1.8bc21cp-14F, 0x1.23eccp-14F, 0x1.1ebf5ep-15F, 0x1.7d79c2p-15F, 0x1.8fe846p-13F }; - return (N*log2hi + logahi[aindex]) + (2.0f*s + (N*log2lo + logalo[aindex] + p)); - } -} diff --git a/math/single/e_powf.c b/math/single/e_powf.c deleted file mode 100644 index fa000f6..0000000 --- a/math/single/e_powf.c +++ /dev/null @@ -1,658 +0,0 @@ -/* - * e_powf.c - single-precision power function - * - * Copyright (c) 2009-2018, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#include <math.h> -#include <errno.h> -#include "math_private.h" - -float -powf(float x, float y) -{ - float logh, logl; - float rlogh, rlogl; - float sign = 1.0f; - int expadjust = 0; - unsigned ix, iy; - - ix = fai(x); - iy = fai(y); - - if (__builtin_expect((ix - 0x00800000) >= (0x7f800000 - 0x00800000) || - ((iy << 1) + 0x02000000) < 0x40000000, 0)) { - /* - * The above test rules out, as quickly as I can see how to, - * all possible inputs except for a normalised positive x - * being raised to the power of a normalised (and not - * excessively small) y. That's the fast-path case: if - * that's what the user wants, we can skip all of the - * difficult special-case handling. - * - * Now we must identify, as efficiently as we can, cases - * which will return to the fast path with a little tidying - * up. These are, in order of likelihood and hence of - * processing: - * - * - a normalised _negative_ x raised to the power of a - * non-zero finite y. Having identified this case, we - * must categorise y into one of the three categories - * 'odd integer', 'even integer' and 'non-integer'; for - * the last of these we return an error, while for the - * other two we rejoin the main code path having rendered - * x positive and stored an appropriate sign to append to - * the eventual result. - * - * - a _denormal_ x raised to the power of a non-zero - * finite y, in which case we multiply it up by a power - * of two to renormalise it, store an appropriate - * adjustment for its base-2 logarithm, and depending on - * the sign of y either return straight to the main code - * path or go via the categorisation of y above. - * - * - any of the above kinds of x raised to the power of a - * zero, denormal, nearly-denormal or nearly-infinite y, - * in which case we must do the checks on x as above but - * otherwise the algorithm goes through basically - * unchanged. Denormal and very tiny y values get scaled - * up to something not in range of accidental underflow - * when split into prec-and-a-half format; very large y - * values get scaled down by a factor of two to prevent - * CLEARBOTTOMHALF's round-up from overflowing them to - * infinity. (Of course the _output_ will overflow either - * way - the largest y value that can possibly yield a - * finite result is well below this range anyway - so - * this is a safe change.) - */ - if (__builtin_expect(((iy << 1) + 0x02000000) >= 0x40000000, 1)) { /* normalised and sensible y */ - y_ok_check_x: - - if (__builtin_expect((ix - 0x80800000) < (0xff800000 - 0x80800000), 1)) { /* normal but negative x */ - y_ok_x_negative: - - x = fabsf(x); - ix = fai(x); - - /* - * Determine the parity of y, if it's an integer at - * all. - */ - { - int yexp, yunitsbit; - - /* - * Find the exponent of y. - */ - yexp = (iy >> 23) & 0xFF; - /* - * Numbers with an exponent smaller than 0x7F - * are strictly smaller than 1, and hence must - * be fractional. - */ - if (yexp < 0x7F) - return MATHERR_POWF_NEGFRAC(x,y); - /* - * Numbers with an exponent at least 0x97 are by - * definition even integers. - */ - if (yexp >= 0x97) - goto mainpath; /* rejoin main code, giving positive result */ - /* - * In between, we must check the mantissa. - * - * Note that this case includes yexp==0x7f, - * which means 1 point something. In this case, - * the 'units bit' we're testing is semantically - * the lowest bit of the exponent field, not the - * leading 1 on the mantissa - but fortunately, - * that bit position will just happen to contain - * the 1 that we would wish it to, because the - * exponent describing that particular case just - * happens to be odd. - */ - yunitsbit = 0x96 - yexp; - if (iy & ((1 << yunitsbit)-1)) - return MATHERR_POWF_NEGFRAC(x,y); - else if (iy & (1 << yunitsbit)) - sign = -1.0f; /* y is odd; result should be negative */ - goto mainpath; /* now we can rejoin the main code */ - } - } else if (__builtin_expect((ix << 1) != 0 && (ix << 1) < 0x01000000, 0)) { /* denormal x */ - /* - * Renormalise x. - */ - x *= 0x1p+27F; - ix = fai(x); - /* - * Set expadjust to compensate for that. - */ - expadjust = -27; - - /* Now we need to handle negative x as above. */ - if (ix & 0x80000000) - goto y_ok_x_negative; - else - goto mainpath; - } else if ((ix - 0x00800000) < (0x7f800000 - 0x00800000)) { - /* normal positive x, back here from denormal-y case below */ - goto mainpath; - } - } else if (((iy << 1) + 0x02000000) >= 0x02000000) { /* denormal, nearly-denormal or zero y */ - if (y == 0.0F) { - /* - * y == 0. Any finite x returns 1 here. (Quiet NaNs - * do too, but we handle that below since we don't - * mind doing them more slowly.) - */ - if ((ix << 1) != 0 && (ix << 1) < 0xFF000000) - return 1.0f; - } else { - /* - * Denormal or very very small y. In this situation - * we have to be a bit careful, because when we - * break up y into precision-and-a-half later on we - * risk working with denormals and triggering - * underflow exceptions within this function that - * aren't related to the smallness of the output. So - * here we convert all such y values into a standard - * small-but-not-too-small value which will give the - * same output. - * - * What value should that be? Well, we work in - * 16*log2(x) below (equivalently, log to the base - * 2^{1/16}). So the maximum magnitude of that for - * any finite x is about 2416 (= 16 * (128+23), for - * log of the smallest denormal x), i.e. certainly - * less than 2^12. If multiplying that by y gives - * anything of magnitude less than 2^-32 (and even - * that's being generous), the final output will be - * indistinguishable from 1. So any value of y with - * magnitude less than 2^-(32+12) = 2^-44 is - * completely indistinguishable from any other such - * value. Hence we got here in the first place by - * checking the exponent of y against 64 (i.e. -63, - * counting the exponent bias), so we might as well - * normalise all tiny y values to the same threshold - * of 2^-64. - */ - iy = 0x1f800000 | (iy & 0x80000000); /* keep the sign; that's important */ - y = fhex(iy); - } - goto y_ok_check_x; - } else if (((iy << 1) + 0x02000000) < 0x01000000) { /* y in top finite exponent bracket */ - y = fhex(fai(y) - 0x00800000); /* scale down by a factor of 2 */ - goto y_ok_check_x; - } - - /* - * Having dealt with the above cases, we now know that - * either x is zero, infinite or NaN, or y is infinite or - * NaN, or both. We can deal with all of those cases without - * ever rejoining the main code path. - */ - if ((unsigned)(((ix & 0x7FFFFFFF) - 0x7f800001) < 0x7fc00000 - 0x7f800001) || - (unsigned)(((iy & 0x7FFFFFFF) - 0x7f800001) < 0x7fc00000 - 0x7f800001)) { - /* - * At least one signalling NaN. Do a token arithmetic - * operation on the two operands to provoke an exception - * and return the appropriate QNaN. - */ - return FLOAT_INFNAN2(x,y); - } else if (ix==0x3f800000 || (iy << 1)==0) { - /* - * C99 says that 1^anything and anything^0 should both - * return 1, _even for a NaN_. I modify that slightly to - * apply only to QNaNs (which doesn't violate C99, since - * C99 doesn't specify anything about SNaNs at all), - * because I can't bring myself not to throw an - * exception on an SNaN since its _entire purpose_ is to - * throw an exception whenever touched. - */ - return 1.0f; - } else - if (((ix & 0x7FFFFFFF) > 0x7f800000) || - ((iy & 0x7FFFFFFF) > 0x7f800000)) { - /* - * At least one QNaN. Do a token arithmetic operation on - * the two operands to get the right one to propagate to - * the output. - */ - return FLOAT_INFNAN2(x,y); - } else if (ix == 0x7f800000) { - /* - * x = +infinity. Return +infinity for positive y, +0 - * for negative y, and 1 for zero y. - */ - if (!(iy << 1)) - return MATHERR_POWF_INF0(x,y); - else if (iy & 0x80000000) - return 0.0f; - else - return INFINITY; - } else { - /* - * Repeat the parity analysis of y above, returning 1 - * (odd), 2 (even) or 0 (fraction). - */ - int ypar, yexp, yunitsbit; - yexp = (iy >> 23) & 0xFF; - if (yexp < 0x7F) - ypar = 0; - else if (yexp >= 0x97) - ypar = 2; - else { - yunitsbit = 0x96 - yexp; - if (iy & ((1 << yunitsbit)-1)) - ypar = 0; - else if (iy & (1 << yunitsbit)) - ypar = 1; - else - ypar = 2; - } - - if (ix == 0xff800000) { - /* - * x = -infinity. We return infinity or zero - * depending on whether y is positive or negative, - * and the sign is negative iff y is an odd integer. - * (SGT: I don't like this, but it's what C99 - * mandates.) - */ - if (!(iy & 0x80000000)) { - if (ypar == 1) - return -INFINITY; - else - return INFINITY; - } else { - if (ypar == 1) - return -0.0f; - else - return +0.0f; - } - } else if (ix == 0) { - /* - * x = +0. We return +0 for all positive y including - * infinity; a divide-by-zero-like error for all - * negative y including infinity; and an 0^0 error - * for zero y. - */ - if ((iy << 1) == 0) - return MATHERR_POWF_00(x,y); - else if (iy & 0x80000000) - return MATHERR_POWF_0NEGEVEN(x,y); - else - return +0.0f; - } else if (ix == 0x80000000) { - /* - * x = -0. We return errors in almost all cases (the - * exception being positive integer y, in which case - * we return a zero of the appropriate sign), but - * the errors are almost all different. Gah. - */ - if ((iy << 1) == 0) - return MATHERR_POWF_00(x,y); - else if (iy == 0x7f800000) - return MATHERR_POWF_NEG0FRAC(x,y); - else if (iy == 0xff800000) - return MATHERR_POWF_0NEG(x,y); - else if (iy & 0x80000000) - return (ypar == 0 ? MATHERR_POWF_0NEG(x,y) : - ypar == 1 ? MATHERR_POWF_0NEGODD(x,y) : - /* ypar == 2 ? */ MATHERR_POWF_0NEGEVEN(x,y)); - else - return (ypar == 0 ? MATHERR_POWF_NEG0FRAC(x,y) : - ypar == 1 ? -0.0f : - /* ypar == 2 ? */ +0.0f); - } else { - /* - * Now we know y is an infinity of one sign or the - * other and x is finite and nonzero. If x == -1 (+1 - * is already ruled out), we return +1; otherwise - * C99 mandates that we return either +0 or +inf, - * the former iff exactly one of |x| < 1 and y<0 is - * true. - */ - if (ix == 0xbf800000) { - return +1.0f; - } else if (!((ix << 1) < 0x7f000000) ^ !(iy & 0x80000000)) { - return +0.0f; - } - else { - return INFINITY; - } - } - } - } - - mainpath: - -#define PHMULTIPLY(rh,rl, xh,xl, yh,yl) do { \ - float tmph, tmpl; \ - tmph = (xh) * (yh); \ - tmpl = (xh) * (yl) + (xl) * ((yh)+(yl)); \ -/* printf("PHMULTIPLY: tmp=%08x+%08x\n", fai(tmph), fai(tmpl)); */ \ - (rh) = CLEARBOTTOMHALF(tmph + tmpl); \ - (rl) = tmpl + (tmph - (rh)); \ -} while (0) - -/* - * Same as the PHMULTIPLY macro above, but bounds the absolute value - * of rh+rl. In multiplications uncontrolled enough that rh can go - * infinite, we can get an IVO exception from the subtraction tmph - - * rh, so we should spot that case in advance and avoid it. - */ -#define PHMULTIPLY_SATURATE(rh,rl, xh,xl, yh,yl, bound) do { \ - float tmph, tmpl; \ - tmph = (xh) * (yh); \ - if (fabsf(tmph) > (bound)) { \ - (rh) = copysignf((bound),(tmph)); \ - (rl) = 0.0f; \ - } else { \ - tmpl = (xh) * (yl) + (xl) * ((yh)+(yl)); \ - (rh) = CLEARBOTTOMHALF(tmph + tmpl); \ - (rl) = tmpl + (tmph - (rh)); \ - } \ - } while (0) - - /* - * Determine log2 of x to relative prec-and-a-half, as logh + - * logl. - * - * Well, we're actually computing 16*log2(x), so that it's the - * right size for the subsequently fiddly messing with powers of - * 2^(1/16) in the exp step at the end. - */ - if (__builtin_expect((ix - 0x3f7ff000) <= (0x3f801000 - 0x3f7ff000), 0)) { - /* - * For x this close to 1, we write x = 1 + t and then - * compute t - t^2/2 + t^3/3 - t^4/4; and the neat bit is - * that t itself, being the bottom half of an input - * mantissa, is in half-precision already, so the output is - * naturally in canonical prec-and-a-half form. - */ - float t = x - 1.0; - float lnh, lnl; - /* - * Compute natural log of x in prec-and-a-half. - */ - lnh = t; - lnl = - (t * t) * ((1.0f/2.0f) - t * ((1.0f/3.0f) - t * (1.0f/4.0f))); - - /* - * Now we must scale by 16/log(2), still in prec-and-a-half, - * to turn this from natural log(x) into 16*log2(x). - */ - PHMULTIPLY(logh, logl, lnh, lnl, 0x1.716p+4F, -0x1.7135a8p-9F); - } else { - /* - * For all other x, we start by normalising to [1,2), and - * then dividing that further into subintervals. For each - * subinterval we pick a number a in that interval, compute - * s = (x-a)/(x+a) in precision-and-a-half, and then find - * the log base 2 of (1+s)/(1-s), still in precision-and-a- - * half. - * - * Why would we do anything so silly? For two main reasons. - * - * Firstly, if s = (x-a)/(x+a), then a bit of algebra tells - * us that x = a * (1+s)/(1-s); so once we've got - * log2((1+s)/(1-s)), we need only add on log2(a) and then - * we've got log2(x). So this lets us treat all our - * subintervals in essentially the same way, rather than - * requiring a separate approximation for each one; the only - * correction factor we need is to store a table of the - * base-2 logs of all our values of a. - * - * Secondly, log2((1+s)/(1-s)) is a nice thing to compute, - * once we've got s. Switching to natural logarithms for the - * moment (it's only a scaling factor to sort that out at - * the end), we write it as the difference of two logs: - * - * log((1+s)/(1-s)) = log(1+s) - log(1-s) - * - * Now recall that Taylor series expansion gives us - * - * log(1+s) = s - s^2/2 + s^3/3 - ... - * - * and therefore we also have - * - * log(1-s) = -s - s^2/2 - s^3/3 - ... - * - * These series are exactly the same except that the odd - * terms (s, s^3 etc) have flipped signs; so subtracting the - * latter from the former gives us - * - * log(1+s) - log(1-s) = 2s + 2s^3/3 + 2s^5/5 + ... - * - * which requires only half as many terms to be computed - * before the powers of s get too small to see. Then, of - * course, we have to scale the result by 1/log(2) to - * convert natural logs into logs base 2. - * - * To compute the above series in precision-and-a-half, we - * first extract a factor of 2s (which we can multiply back - * in later) so that we're computing 1 + s^2/3 + s^4/5 + ... - * and then observe that if s starts off small enough to - * make s^2/3 at most 2^-12, we need only compute the first - * couple of terms in laborious prec-and-a-half, and can - * delegate everything after that to a simple polynomial - * approximation whose error will end up at the bottom of - * the low word of the result. - * - * How many subintervals does that mean we need? - * - * To go back to s = (x-a)/(x+a). Let x = a + e, for some - * positive e. Then |s| = |e| / |2a+e| <= |e/2a|. So suppose - * we have n subintervals of equal width covering the space - * from 1 to 2. If a is at the centre of each interval, then - * we have e at most 1/2n and a can equal any of 1, 1+1/n, - * 1+2/n, ... 1+(n-1)/n. In that case, clearly the largest - * value of |e/2a| is given by the largest e (i.e. 1/2n) and - * the smallest a (i.e. 1); so |s| <= 1/4n. Hence, when we - * know how big we're prepared to let s be, we simply make - * sure 1/4n is at most that. - * - * And if we want s^2/3 to be at most 2^-12, then that means - * s^2 is at most 3*2^-12, so that s is at most sqrt(3)*2^-6 - * = 0.02706. To get 1/4n smaller than that, we need to have - * n>=9.23; so we'll set n=16 (for ease of bit-twiddling), - * and then s is at most 1/64. - */ - int n, i; - float a, ax, sh, sl, lsh, lsl; - - /* - * Let ax be x normalised to a single exponent range. - * However, the exponent range in question is not a simple - * one like [1,2). What we do is to round up the top four - * bits of the mantissa, so that the top 1/32 of each - * natural exponent range rounds up to the next one and is - * treated as a displacement from the lowest a in that - * range. - * - * So this piece of bit-twiddling gets us our input exponent - * and our subinterval index. - */ - n = (ix + 0x00040000) >> 19; - i = n & 15; - n = ((n >> 4) & 0xFF) - 0x7F; - ax = fhex(ix - (n << 23)); - n += expadjust; - - /* - * Compute the subinterval centre a. - */ - a = 1.0f + i * (1.0f/16.0f); - - /* - * Compute s = (ax-a)/(ax+a), in precision-and-a-half. - */ - { - float u, vh, vl, vapprox, rvapprox; - - u = ax - a; /* exact numerator */ - vapprox = ax + a; /* approximate denominator */ - vh = CLEARBOTTOMHALF(vapprox); - vl = (a - vh) + ax; /* vh+vl is exact denominator */ - rvapprox = 1.0f/vapprox; /* approximate reciprocal of denominator */ - - sh = CLEARBOTTOMHALF(u * rvapprox); - sl = ((u - sh*vh) - sh*vl) * rvapprox; - } - - /* - * Now compute log2(1+s) - log2(1-s). We do this in several - * steps. - * - * By polynomial approximation, we compute - * - * log(1+s) - log(1-s) - * p = ------------------- - 1 - * 2s - * - * in single precision only, using a single-precision - * approximation to s. This polynomial has s^2 as its - * lowest-order term, so we expect the result to be in - * [0,2^-12). - * - * Then we form a prec-and-a-half number out of 1 and p, - * which is therefore equal to (log(1+s) - log(1-s))/(2s). - * - * Finally, we do two prec-and-a-half multiplications: one - * by s itself, and one by the constant 32/log(2). - */ - { - float s = sh + sl; - float s2 = s*s; - /* - * p is actually a polynomial in s^2, with the first - * term constrained to zero. In other words, treated on - * its own terms, we're computing p(s^2) such that p(x) - * is an approximation to the sum of the series 1/3 + - * x/5 + x^2/7 + ..., valid on the range [0, 1/40^2]. - */ - float p = s2 * (0.33333332920177422f + s2 * 0.20008275183621479f); - float th, tl; - - PHMULTIPLY(th,tl, 1.0f,p, sh,sl); - PHMULTIPLY(lsh,lsl, th,tl, 0x1.716p+5F,-0x1.7135a8p-8F); - } - - /* - * And our final answer for 16*log2(x) is equal to 16n (from - * the exponent), plus lsh+lsl (the result of the above - * computation), plus 16*log2(a) which we must look up in a - * table. - */ - { - struct f2 { float h, l; }; - static const struct f2 table[16] = { - /* - * When constructing this table, we have to be sure - * that we produce the same values of a which will - * be produced by the computation above. Ideally, I - * would tell Perl to actually do its _arithmetic_ - * in single precision here; but I don't know a way - * to do that, so instead I just scrupulously - * convert every intermediate value to and from SP. - */ - // perl -e 'for ($i=0; $i<16; $i++) { $v = unpack "f", pack "f", 1/16.0; $a = unpack "f", pack "f", $i * $v; $a = unpack "f", pack "f", $a+1.0; $l = 16*log($a)/log(2); $top = unpack "f", pack "f", int($l*256.0+0.5)/256.0; $bot = unpack "f", pack "f", $l - $top; printf "{0f_%08X,0f_%08X}, ", unpack "VV", pack "ff", $top, $bot; } print "\n"' | fold -s -w56 | sed 's/^/ /' - {0x0p+0F,0x0p+0F}, {0x1.66p+0F,0x1.fb7d64p-11F}, - {0x1.5cp+1F,0x1.a39fbep-15F}, {0x1.fcp+1F,-0x1.f4a37ep-10F}, - {0x1.49cp+2F,-0x1.87b432p-10F}, {0x1.91cp+2F,-0x1.15db84p-12F}, - {0x1.d68p+2F,-0x1.583f9ap-11F}, {0x1.0c2p+3F,-0x1.f5fe54p-10F}, - {0x1.2b8p+3F,0x1.a39fbep-16F}, {0x1.49ap+3F,0x1.e12f34p-11F}, - {0x1.66ap+3F,0x1.1c8f12p-18F}, {0x1.828p+3F,0x1.3ab7cep-14F}, - {0x1.9d6p+3F,-0x1.30158p-12F}, {0x1.b74p+3F,0x1.291eaap-10F}, - {0x1.d06p+3F,-0x1.8125b4p-10F}, {0x1.e88p+3F,0x1.8d66c4p-10F}, - }; - float lah = table[i].h, lal = table[i].l; - float fn = 16*n; - logh = CLEARBOTTOMHALF(lsl + lal + lsh + lah + fn); - logl = lsl - ((((logh - fn) - lah) - lsh) - lal); - } - } - - /* - * Now we have 16*log2(x), multiply it by y in prec-and-a-half. - */ - { - float yh, yl; - int savedexcepts; - - yh = CLEARBOTTOMHALF(y); - yl = y - yh; - - /* This multiplication could become infinite, so to avoid IVO - * in PHMULTIPLY we bound the output at 4096, which is big - * enough to allow any non-overflowing case through - * unmodified. Also, we must mask out the OVF exception, which - * we won't want left in the FP status word in the case where - * rlogh becomes huge and _negative_ (since that will be an - * underflow from the perspective of powf's return value, not - * an overflow). */ - savedexcepts = __ieee_status(0,0) & (FE_IEEE_OVERFLOW | FE_IEEE_UNDERFLOW); - PHMULTIPLY_SATURATE(rlogh, rlogl, logh, logl, yh, yl, 4096.0f); - __ieee_status(FE_IEEE_OVERFLOW | FE_IEEE_UNDERFLOW, savedexcepts); - } - - /* - * And raise 2 to the power of whatever that gave. Again, this - * is done in three parts: the fractional part of our input is - * fed through a polynomial approximation, all but the bottom - * four bits of the integer part go straight into the exponent, - * and the bottom four bits of the integer part index into a - * lookup table of powers of 2^(1/16) in prec-and-a-half. - */ - { - float rlog = rlogh + rlogl; - int i16 = (rlog + (rlog < 0 ? -0.5f : +0.5f)); - float rlogi = i16 >> 4; - - float x = rlogl + (rlogh - i16); - - static const float powersof2to1over16top[16] = { 0x1p+0F, 0x1.0b4p+0F, 0x1.172p+0F, 0x1.238p+0F, 0x1.306p+0F, 0x1.3dep+0F, 0x1.4bep+0F, 0x1.5aap+0F, 0x1.6ap+0F, 0x1.7ap+0F, 0x1.8acp+0F, 0x1.9c4p+0F, 0x1.ae8p+0F, 0x1.c18p+0F, 0x1.d58p+0F, 0x1.ea4p+0F }; - static const float powersof2to1over16bot[16] = { 0x0p+0F, 0x1.586cfap-12F, 0x1.7078fap-13F, 0x1.e9b9d6p-14F, 0x1.fc1464p-13F, 0x1.4c9824p-13F, 0x1.dad536p-12F, 0x1.07dd48p-12F, 0x1.3cccfep-13F, 0x1.1473ecp-12F, 0x1.ca8456p-13F, 0x1.230548p-13F, 0x1.3f32b6p-13F, 0x1.9bdd86p-12F, 0x1.8dcfbap-16F, 0x1.5f454ap-13F }; - static const float powersof2to1over16all[16] = { 0x1p+0F, 0x1.0b5586p+0F, 0x1.172b84p+0F, 0x1.2387a6p+0F, 0x1.306fep+0F, 0x1.3dea64p+0F, 0x1.4bfdaep+0F, 0x1.5ab07ep+0F, 0x1.6a09e6p+0F, 0x1.7a1148p+0F, 0x1.8ace54p+0F, 0x1.9c4918p+0F, 0x1.ae89fap+0F, 0x1.c199bep+0F, 0x1.d5818ep+0F, 0x1.ea4afap+0F }; - /* - * Coefficients generated using the command - -./auxiliary/remez.jl --suffix=f -- '-1/BigFloat(2)' '+1/BigFloat(2)' 2 0 'expm1(x*log(BigFloat(2))/16)/x' - - */ - float p = x * ( - 4.332169876512769231967668743345473181486157887703125512683507537369503902991722e-02f+x*(9.384123108485637159805511308039285411735300871134684682779057580789341719567367e-04f+x*(1.355120515540562256928614563584948866224035897564701496826514330445829352922309e-05f)) - ); - int index = (i16 & 15); - p = powersof2to1over16top[index] + (powersof2to1over16bot[index] + powersof2to1over16all[index]*p); - - if ( - fabsf(rlogi) < 126.0f - ) { - return sign * p * fhex((unsigned)((127.0f+rlogi) * 8388608.0f)); - } else if ( - fabsf(rlogi) < 192.0f - ) { - int i = rlogi; - float ret; - - ret = sign * p * - fhex((unsigned)((0x7f+i/2) * 8388608)) * - fhex((unsigned)((0x7f+i-i/2) * 8388608)); - - if ((fai(ret) << 1) == 0xFF000000) - return MATHERR_POWF_OFL(x, y, sign); - else if ((fai(ret) << 1) == 0) - return MATHERR_POWF_UFL(x, y, sign); - else - return FLOAT_CHECKDENORM(ret); - } else { - if (rlogi < 0) - return MATHERR_POWF_UFL(x, y, sign); - else - return MATHERR_POWF_OFL(x, y, sign); - } - } -} diff --git a/math/single/e_rem_pio2.c b/math/single/e_rem_pio2.c deleted file mode 100644 index 0cd3a22..0000000 --- a/math/single/e_rem_pio2.c +++ /dev/null @@ -1,268 +0,0 @@ -/* - * e_rem_pio2.c - * - * Copyright (c) 1999-2018, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#include <math.h> -#include "math_private.h" - -int __ieee754_rem_pio2(double x, double *y) { - int q; - - y[1] = 0.0; /* default */ - - /* - * Simple cases: all nicked from the fdlibm version for speed. - */ - { - static const double invpio2 = 0x1.45f306dc9c883p-1; - static const double pio2s[] = { - 0x1.921fb544p+0, /* 1.57079632673412561417e+00 */ - 0x1.0b4611a626331p-34, /* 6.07710050650619224932e-11 */ - 0x1.0b4611a6p-34, /* 6.07710050630396597660e-11 */ - 0x1.3198a2e037073p-69, /* 2.02226624879595063154e-21 */ - 0x1.3198a2ep-69, /* 2.02226624871116645580e-21 */ - 0x1.b839a252049c1p-104, /* 8.47842766036889956997e-32 */ - }; - - double z,w,t,r,fn; - int i,j,idx,n,ix,hx; - - hx = __HI(x); /* high word of x */ - ix = hx&0x7fffffff; - if(ix<=0x3fe921fb) /* |x| ~<= pi/4 , no need for reduction */ - {y[0] = x; y[1] = 0; return 0;} - if(ix<0x4002d97c) { /* |x| < 3pi/4, special case with n=+-1 */ - if(hx>0) { - z = x - pio2s[0]; - if(ix!=0x3ff921fb) { /* 33+53 bit pi is good enough */ - y[0] = z - pio2s[1]; -#ifdef bottomhalf - y[1] = (z-y[0])-pio2s[1]; -#endif - } else { /* near pi/2, use 33+33+53 bit pi */ - z -= pio2s[2]; - y[0] = z - pio2s[3]; -#ifdef bottomhalf - y[1] = (z-y[0])-pio2s[3]; -#endif - } - return 1; - } else { /* negative x */ - z = x + pio2s[0]; - if(ix!=0x3ff921fb) { /* 33+53 bit pi is good enough */ - y[0] = z + pio2s[1]; -#ifdef bottomhalf - y[1] = (z-y[0])+pio2s[1]; -#endif - } else { /* near pi/2, use 33+33+53 bit pi */ - z += pio2s[2]; - y[0] = z + pio2s[3]; -#ifdef bottomhalf - y[1] = (z-y[0])+pio2s[3]; -#endif - } - return -1; - } - } - if(ix<=0x413921fb) { /* |x| ~<= 2^19*(pi/2), medium size */ - t = fabs(x); - n = (int) (t*invpio2+0.5); - fn = (double)n; - r = t-fn*pio2s[0]; - w = fn*pio2s[1]; /* 1st round good to 85 bit */ - j = ix>>20; - idx = 1; - while (1) { - y[0] = r-w; - if (idx == 3) - break; - i = j-(((__HI(y[0]))>>20)&0x7ff); - if(i <= 33*idx-17) - break; - t = r; - w = fn*pio2s[2*idx]; - r = t-w; - w = fn*pio2s[2*idx+1]-((t-r)-w); - idx++; - } -#ifdef bottomhalf - y[1] = (r-y[0])-w; -#endif - if(hx<0) { - y[0] = -y[0]; -#ifdef bottomhalf - y[1] = -y[1]; -#endif - return -n; - } else { - return n; - } - } - } - - { - static const unsigned twooverpi[] = { - /* We start with two zero words, because they take up less - * space than the array bounds checking and special-case - * handling that would have to occur in their absence. */ - 0, 0, - /* 2/pi in hex is 0.a2f9836e... */ - 0xa2f9836e, 0x4e441529, 0xfc2757d1, 0xf534ddc0, 0xdb629599, - 0x3c439041, 0xfe5163ab, 0xdebbc561, 0xb7246e3a, 0x424dd2e0, - 0x06492eea, 0x09d1921c, 0xfe1deb1c, 0xb129a73e, 0xe88235f5, - 0x2ebb4484, 0xe99c7026, 0xb45f7e41, 0x3991d639, 0x835339f4, - 0x9c845f8b, 0xbdf9283b, 0x1ff897ff, 0xde05980f, 0xef2f118b, - 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7, 0x4f463f66, 0x9e5fea2d, - 0x7527bac7, 0xebe5f17b, 0x3d0739f7, 0x8a5292ea, 0x6bfb5fb1, - 0x1f8d5d08, 0x56033046, - }; - - /* - * Multiprecision multiplication of this nature is more - * readily done in integers than in VFP, since we can use - * UMULL (on CPUs that support it) to multiply 32 by 32 bits - * at a time whereas the VFP would only be able to do 12x12 - * without losing accuracy. - * - * So extract the mantissa of the input number as two 32-bit - * integers. - */ - unsigned mant1 = 0x00100000 | (__HI(x) & 0xFFFFF); - unsigned mant2 = __LO(x); - /* - * Now work out which part of our stored value of 2/pi we're - * supposed to be multiplying by. - * - * Let the IEEE exponent field of x be e. With its bias - * removed, (e-1023) is the index of the set bit in bit 20 - * of 'mant1' (i.e. that set bit has real place value - * 2^(e-1023)). So the lowest set bit in 'mant1', 52 bits - * further down, must have place value 2^(e-1075). - * - * We begin taking an interest in the value of 2/pi at the - * bit which multiplies by _that_ to give something with - * place value at most 2. In other words, the highest bit of - * 2/pi we're interested in is the one with place value - * 2/(2^(e-1075)) = 2^(1076-e). - * - * The bit at the top of the first zero word of the above - * array has place value 2^63. Hence, the bit we want to put - * at the top of the first word we extract from that array - * is the one at bit index n, where 63-n = 1076-e and hence - * n=e-1013. - */ - int topbitindex = ((__HI(x) >> 20) & 0x7FF) - 1013; - int wordindex = topbitindex >> 5; - int shiftup = topbitindex & 31; - int shiftdown = 32 - shiftup; - unsigned scaled[8]; - int i; - - scaled[7] = scaled[6] = 0; - - for (i = 6; i-- > 0 ;) { - /* - * Extract a word from our representation of 2/pi. - */ - unsigned word; - if (shiftup) - word = (twooverpi[wordindex + i] << shiftup) | (twooverpi[wordindex + i + 1] >> shiftdown); - else - word = twooverpi[wordindex + i]; - /* - * Multiply it by both words of our mantissa. (Should - * generate UMULLs where available.) - */ - unsigned long long mult1 = (unsigned long long)word * mant1; - unsigned long long mult2 = (unsigned long long)word * mant2; - /* - * Split those up into 32-bit chunks. - */ - unsigned bottom1 = (unsigned)mult1, top1 = (unsigned)(mult1 >> 32); - unsigned bottom2 = (unsigned)mult2, top2 = (unsigned)(mult2 >> 32); - /* - * Add those two chunks together. - */ - unsigned out1, out2, out3; - - out3 = bottom2; - out2 = top2 + bottom1; - out1 = top1 + (out2 < top2); - /* - * And finally add them to our 'scaled' array. - */ - unsigned s3 = scaled[i+2], s2 = scaled[i+1], s1; - unsigned carry; - s3 = out3 + s3; carry = (s3 < out3); - s2 = out2 + s2 + carry; carry = carry ? (s2 <= out2) : (s2 < out2); - s1 = out1 + carry; - - scaled[i+2] = s3; - scaled[i+1] = s2; - scaled[i] = s1; - } - - - /* - * The topmost set bit in mant1 is bit 20, and that has - * place value 2^(e-1023). The topmost bit (bit 31) of the - * most significant word we extracted from our twooverpi - * array had place value 2^(1076-e). So the product of those - * two bits must have place value 2^53; and that bit will - * have ended up as bit 19 of scaled[0]. Hence, the integer - * quadrant value we want to output, consisting of the bits - * with place values 2^1 and 2^0, must be 52 and 53 bits - * below that, i.e. precisely the topmost two bits of - * scaled[2]. - * - * Or, at least, it will be once we add 1/2, to round to the - * _nearest_ multiple of pi/2 rather than the next one down. - */ - q = (scaled[2] + (1<<29)) >> 30; - - /* - * Now we construct the output fraction, which is most - * simply done in the VFP. We just extract four consecutive - * bit strings from our chunk of binary data, convert them - * to doubles, equip each with an appropriate FP exponent, - * add them together, and (don't forget) multiply back up by - * pi/2. That way we don't have to work out ourselves where - * the highest fraction bit ended up. - * - * Since our displacement from the nearest multiple of pi/2 - * can be positive or negative, the topmost of these four - * values must be arranged with its 2^-1 bit at the very top - * of the word, and then treated as a _signed_ integer. - */ - int i1 = (scaled[2] << 2); - unsigned i2 = scaled[3]; - unsigned i3 = scaled[4]; - unsigned i4 = scaled[5]; - double f1 = i1, f2 = i2 * (0x1.0p-30), f3 = i3 * (0x1.0p-62), f4 = i4 * (0x1.0p-94); - - /* - * Now f1+f2+f3+f4 is a representation, potentially to - * twice double precision, of 2^32 times ((2/pi)*x minus - * some integer). So our remaining job is to multiply - * back down by (pi/2)*2^-32, and convert back to one - * single-precision output number. - */ - double ftop = f1 + (f2 + (f3 + f4)); - ftop = __SET_LO(ftop, 0); - double fbot = f4 - (((ftop-f1)-f2)-f3); - - /* ... and multiply by a prec-and-a-half value of (pi/2)*2^-32. */ - double ret = (ftop * 0x1.921fb54p-32) + (ftop * 0x1.10b4611a62633p-62 + fbot * 0x1.921fb54442d18p-32); - - /* Just before we return, take the input sign into account. */ - if (__HI(x) & 0x80000000) { - q = -q; - ret = -ret; - } - y[0] = ret; - return q; - } -} diff --git a/math/single/funder.c b/math/single/funder.c deleted file mode 100644 index ef16f68..0000000 --- a/math/single/funder.c +++ /dev/null @@ -1,51 +0,0 @@ -/* - * funder.c - manually provoke SP exceptions for mathlib - * - * Copyright (c) 2009-2018, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#include <fenv.h> -#include "math_private.h" - -__inline float __mathlib_flt_infnan2(float x, float y) -{ - return x+y; -} - -__inline float __mathlib_flt_infnan(float x) -{ - return x+x; -} - -float __mathlib_flt_underflow(void) -{ -#ifdef CLANG_EXCEPTIONS - feraiseexcept(FE_UNDERFLOW); -#endif - return 0x1p-95F * 0x1p-95F; -} - -float __mathlib_flt_overflow(void) -{ -#ifdef CLANG_EXCEPTIONS - feraiseexcept(FE_OVERFLOW); -#endif - return 0x1p+97F * 0x1p+97F; -} - -float __mathlib_flt_invalid(void) -{ -#ifdef CLANG_EXCEPTIONS - feraiseexcept(FE_INVALID); -#endif - return 0.0f / 0.0f; -} - -float __mathlib_flt_divzero(void) -{ -#ifdef CLANG_EXCEPTIONS - feraiseexcept(FE_DIVBYZERO); -#endif - return 1.0f / 0.0f; -} diff --git a/math/single/ieee_status.c b/math/single/ieee_status.c deleted file mode 100644 index ef846f4..0000000 --- a/math/single/ieee_status.c +++ /dev/null @@ -1,38 +0,0 @@ -/* - * ieee_status.c - * - * Copyright (c) 2015-2018, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#include "math_private.h" - -__inline unsigned __ieee_status(unsigned bicmask, unsigned xormask) -{ -#if defined __aarch64__ && defined __FP_FENV_EXCEPTIONS - unsigned status_word; - unsigned ret; - -#ifdef __FP_FENV_ROUNDING -# define MASK (1<<27)|FE_IEEE_FLUSHZERO|FE_IEEE_MASK_ALL_EXCEPT|FE_IEEE_ALL_EXCEPT|FE_IEEE_ROUND_MASK -#else -# define MASK (1<<27)|FE_IEEE_FLUSHZERO|FE_IEEE_MASK_ALL_EXCEPT|FE_IEEE_ALL_EXCEPT -#endif - - /* mask out read-only bits */ - bicmask &= MASK; - xormask &= MASK; - - /* modify the status word */ - __asm__ __volatile__ ("mrs %0, fpsr" : "=r" (status_word)); - ret = status_word; - status_word &= ~bicmask; - status_word ^= xormask; - __asm__ __volatile__ ("msr fpsr, %0" : : "r" (status_word)); - - /* and return what it used to be */ - return ret; -#else - return 0; -#endif -} diff --git a/math/single/math_private.h b/math/single/math_private.h deleted file mode 100644 index eef997d..0000000 --- a/math/single/math_private.h +++ /dev/null @@ -1,138 +0,0 @@ -/* - * math_private.h - * - * Copyright (c) 2015-2018, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -/* - * Header file containing some definitions and other small reusable pieces of - * code that we need in the libraries. - */ - -#ifndef __MATH_PRIVATE_H -#define __MATH_PRIVATE_H - -#include <errno.h> -#include <stdint.h> - -extern int __ieee754_rem_pio2(double, double *); -extern double __kernel_poly(const double *, int, double); - -#define __FP_IEEE -#define __FP_FENV_EXCEPTIONS -#define __FP_FENV_ROUNDING -#define __FP_INEXACT_EXCEPTION - -#define __set_errno(val) (errno = (val)) - -static __inline uint32_t __LO(double x) -{ - union {double f; uint64_t i;} u = {x}; - return (uint32_t)u.i; -} - -static __inline uint32_t __HI(double x) -{ - union {double f; uint64_t i;} u = {x}; - return u.i >> 32; -} - -static __inline double __SET_LO(double x, uint32_t lo) -{ - union {double f; uint64_t i;} u = {x}; - u.i = (u.i >> 32) << 32 | lo; - return u.f; -} - -static __inline unsigned int fai(float f) -{ - union {float f; uint32_t i;} u = {f}; - return u.i; -} - -static __inline float fhex(unsigned int n) -{ - union {uint32_t i; float f;} u = {n}; - return u.f; -} - -#define CLEARBOTTOMHALF(x) fhex((fai(x) + 0x00000800) & 0xFFFFF000) - -#define FE_IEEE_OVERFLOW (0x00000004) -#define FE_IEEE_UNDERFLOW (0x00000008) -#define FE_IEEE_FLUSHZERO (0x01000000) -#define FE_IEEE_ROUND_TONEAREST (0x00000000) -#define FE_IEEE_ROUND_UPWARD (0x00400000) -#define FE_IEEE_ROUND_DOWNWARD (0x00800000) -#define FE_IEEE_ROUND_TOWARDZERO (0x00C00000) -#define FE_IEEE_ROUND_MASK (0x00C00000) -#define FE_IEEE_MASK_INVALID (0x00000100) -#define FE_IEEE_MASK_DIVBYZERO (0x00000200) -#define FE_IEEE_MASK_OVERFLOW (0x00000400) -#define FE_IEEE_MASK_UNDERFLOW (0x00000800) -#define FE_IEEE_MASK_INEXACT (0x00001000) -#define FE_IEEE_MASK_INPUTDENORMAL (0x00008000) -#define FE_IEEE_MASK_ALL_EXCEPT (0x00009F00) -#define FE_IEEE_INVALID (0x00000001) -#define FE_IEEE_DIVBYZERO (0x00000002) -#define FE_IEEE_INEXACT (0x00000010) -#define FE_IEEE_INPUTDENORMAL (0x00000080) -#define FE_IEEE_ALL_EXCEPT (0x0000009F) - -extern double __mathlib_dbl_overflow(void); -extern float __mathlib_flt_overflow(void); -extern double __mathlib_dbl_underflow(void); -extern float __mathlib_flt_underflow(void); -extern double __mathlib_dbl_invalid(void); -extern float __mathlib_flt_invalid(void); -extern double __mathlib_dbl_divzero(void); -extern float __mathlib_flt_divzero(void); -#define DOUBLE_OVERFLOW ( __mathlib_dbl_overflow() ) -#define FLOAT_OVERFLOW ( __mathlib_flt_overflow() ) -#define DOUBLE_UNDERFLOW ( __mathlib_dbl_underflow() ) -#define FLOAT_UNDERFLOW ( __mathlib_flt_underflow() ) -#define DOUBLE_INVALID ( __mathlib_dbl_invalid() ) -#define FLOAT_INVALID ( __mathlib_flt_invalid() ) -#define DOUBLE_DIVZERO ( __mathlib_dbl_divzero() ) -#define FLOAT_DIVZERO ( __mathlib_flt_divzero() ) - -extern float __mathlib_flt_infnan(float); -extern float __mathlib_flt_infnan2(float, float); -extern double __mathlib_dbl_infnan(double); -extern double __mathlib_dbl_infnan2(double, double); -extern unsigned __ieee_status(unsigned, unsigned); - -#define FLOAT_INFNAN(x) __mathlib_flt_infnan(x) -#define FLOAT_INFNAN2(x,y) __mathlib_flt_infnan2(x,y) -#define DOUBLE_INFNAN(x) __mathlib_dbl_infnan(x) -#define DOUBLE_INFNAN2(x,y) __mathlib_dbl_infnan2(x,y) - -#define MATHERR_POWF_00(x,y) (__set_errno(EDOM), 1.0f) -#define MATHERR_POWF_INF0(x,y) (__set_errno(EDOM), 1.0f) -#define MATHERR_POWF_0NEG(x,y) (__set_errno(ERANGE), FLOAT_DIVZERO) -#define MATHERR_POWF_NEG0FRAC(x,y) (0.0f) -#define MATHERR_POWF_0NEGODD(x,y) (__set_errno(ERANGE), -FLOAT_DIVZERO) -#define MATHERR_POWF_0NEGEVEN(x,y) (__set_errno(ERANGE), FLOAT_DIVZERO) -#define MATHERR_POWF_NEGFRAC(x,y) (__set_errno(EDOM), FLOAT_INVALID) -#define MATHERR_POWF_ONEINF(x,y) (1.0f) -#define MATHERR_POWF_OFL(x,y,z) (__set_errno(ERANGE), copysignf(FLOAT_OVERFLOW,z)) -#define MATHERR_POWF_UFL(x,y,z) (__set_errno(ERANGE), copysignf(FLOAT_UNDERFLOW,z)) - -#define MATHERR_LOGF_0(x) (__set_errno(ERANGE), -FLOAT_DIVZERO) -#define MATHERR_LOGF_NEG(x) (__set_errno(EDOM), FLOAT_INVALID) - -#define MATHERR_SIN_INF(x) (__set_errno(EDOM), DOUBLE_INVALID) -#define MATHERR_SINF_INF(x) (__set_errno(EDOM), FLOAT_INVALID) -#define MATHERR_COS_INF(x) (__set_errno(EDOM), DOUBLE_INVALID) -#define MATHERR_COSF_INF(x) (__set_errno(EDOM), FLOAT_INVALID) -#define MATHERR_TAN_INF(x) (__set_errno(EDOM), DOUBLE_INVALID) -#define MATHERR_TANF_INF(x) (__set_errno(EDOM), FLOAT_INVALID) - -#define MATHERR_EXPF_UFL(x) (__set_errno(ERANGE), FLOAT_UNDERFLOW) -#define MATHERR_EXPF_OFL(x) (__set_errno(ERANGE), FLOAT_OVERFLOW) - -#define FLOAT_CHECKDENORM(x) ( (fpclassify(x) == FP_SUBNORMAL ? FLOAT_UNDERFLOW : 0), x ) -#define DOUBLE_CHECKDENORM(x) ( (fpclassify(x) == FP_SUBNORMAL ? DOUBLE_UNDERFLOW : 0), x ) - -#endif /* __MATH_PRIVATE_H */ diff --git a/math/single/poly.c b/math/single/poly.c deleted file mode 100644 index 4eaf7ff..0000000 --- a/math/single/poly.c +++ /dev/null @@ -1,25 +0,0 @@ -/* - * poly.c - * - * Copyright (c) 1998-2018, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -double __kernel_poly(const double *coeffs, int n, double x) -{ - double result = coeffs[--n]; - - while ((n & ~0x6) != 0) /* Loop until n even and < 8 */ - result = (result * x) + coeffs[--n]; - - switch (n) - { - case 6: result = (result * x) + coeffs[5]; - result = (result * x) + coeffs[4]; - case 4: result = (result * x) + coeffs[3]; - result = (result * x) + coeffs[2]; - case 2: result = (result * x) + coeffs[1]; - result = (result * x) + coeffs[0]; - } - return result; -} diff --git a/math/single/rredf.c b/math/single/rredf.c deleted file mode 100644 index a5c3122..0000000 --- a/math/single/rredf.c +++ /dev/null @@ -1,239 +0,0 @@ -/* - * rredf.c - trigonometric range reduction function - * - * Copyright (c) 2009-2018, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -/* - * This code is intended to be used as the second half of a range - * reducer whose first half is an inline function defined in - * rredf.h. Each trig function performs range reduction by invoking - * that, which handles the quickest and most common cases inline - * before handing off to this function for everything else. Thus a - * reasonable compromise is struck between speed and space. (I - * hope.) In particular, this approach avoids a function call - * overhead in the common case. - */ - -#include "math_private.h" - -#ifdef __cplusplus -extern "C" { -#endif /* __cplusplus */ - -/* - * Input values to this function: - * - x is the original user input value, unchanged by the - * first-tier reducer in the case where it hands over to us. - * - q is still the place where the caller expects us to leave the - * quadrant code. - * - k is the IEEE bit pattern of x (which it would seem a shame to - * recompute given that the first-tier reducer already went to - * the effort of extracting it from the VFP). FIXME: in softfp, - * on the other hand, it's unconscionably wasteful to replicate - * this value into a second register and we should change the - * prototype! - */ -float __mathlib_rredf2(float x, int *q, unsigned k) -{ - /* - * First, weed out infinities and NaNs, and deal with them by - * returning a negative q. - */ - if ((k << 1) >= 0xFF000000) { - *q = -1; - return x; - } - /* - * We do general range reduction by multiplying by 2/pi, and - * retaining the bottom two bits of the integer part and an - * initial chunk of the fraction below that. The integer bits - * are directly output as *q; the fraction is then multiplied - * back up by pi/2 before returning it. - * - * To get this right, we don't have to multiply by the _whole_ - * of 2/pi right from the most significant bit downwards: - * instead we can discard any bit of 2/pi with a place value - * high enough that multiplying it by the LSB of x will yield a - * place value higher than 2. Thus we can bound the required - * work by a reasonably small constant regardless of the size of - * x (unlike, for instance, the IEEE remainder operation). - * - * At the other end, however, we must take more care: it isn't - * adequate just to acquire two integer bits and 24 fraction - * bits of (2/pi)x, because if a lot of those fraction bits are - * zero then we will suffer significance loss. So we must keep - * computing fraction bits as far down as 23 bits below the - * _highest set fraction bit_. - * - * The immediate question, therefore, is what the bound on this - * end of the job will be. In other words: what is the smallest - * difference between an integer multiple of pi/2 and a - * representable IEEE single precision number larger than the - * maximum size handled by rredf.h? - * - * The most difficult cases for each exponent can readily be - * found by Tim Peters's modular minimisation algorithm, and are - * tabulated in mathlib/tests/directed/rredf.tst. The single - * worst case is the IEEE single-precision number 0x6F79BE45, - * whose numerical value is in the region of 7.7*10^28; when - * reduced mod pi/2, it attains the value 0x30DDEEA9, or about - * 0.00000000161. The highest set bit of this value is the one - * with place value 2^-30; so its lowest is 2^-53. Hence, to be - * sure of having enough fraction bits to output at full single - * precision, we must be prepared to collect up to 53 bits of - * fraction in addition to our two bits of integer part. - * - * To begin with, this means we must store the value of 2/pi to - * a precision of 128+53 = 181 bits. That's six 32-bit words. - * (Hardly a chore, unlike the equivalent problem in double - * precision!) - */ - { - static const unsigned twooverpi[] = { - /* We start with a zero word, because that takes up less - * space than the array bounds checking and special-case - * handling that would have to occur in its absence. */ - 0, - /* 2/pi in hex is 0.a2f9836e... */ - 0xa2f9836e, 0x4e441529, 0xfc2757d1, - 0xf534ddc0, 0xdb629599, 0x3c439041, - /* Again, to avoid array bounds overrun, we store a spare - * word at the end. And it would be a shame to fill it - * with zeroes when we could use more bits of 2/pi... */ - 0xfe5163ab - }; - - /* - * Multiprecision multiplication of this nature is more - * readily done in integers than in VFP, since we can use - * UMULL (on CPUs that support it) to multiply 32 by 32 bits - * at a time whereas the VFP would only be able to do 12x12 - * without losing accuracy. - * - * So extract the mantissa of the input number as a 32-bit - * integer. - */ - unsigned mantissa = 0x80000000 | (k << 8); - - /* - * Now work out which part of our stored value of 2/pi we're - * supposed to be multiplying by. - * - * Let the IEEE exponent field of x be e. With its bias - * removed, (e-127) is the index of the set bit at the top - * of 'mantissa' (i.e. that set bit has real place value - * 2^(e-127)). So the lowest set bit in 'mantissa', 23 bits - * further down, must have place value 2^(e-150). - * - * We begin taking an interest in the value of 2/pi at the - * bit which multiplies by _that_ to give something with - * place value at most 2. In other words, the highest bit of - * 2/pi we're interested in is the one with place value - * 2/(2^(e-150)) = 2^(151-e). - * - * The bit at the top of the first (zero) word of the above - * array has place value 2^31. Hence, the bit we want to put - * at the top of the first word we extract from that array - * is the one at bit index n, where 31-n = 151-e and hence - * n=e-120. - */ - int topbitindex = ((k >> 23) & 0xFF) - 120; - int wordindex = topbitindex >> 5; - int shiftup = topbitindex & 31; - int shiftdown = 32 - shiftup; - unsigned word1, word2, word3; - if (shiftup) { - word1 = (twooverpi[wordindex] << shiftup) | (twooverpi[wordindex+1] >> shiftdown); - word2 = (twooverpi[wordindex+1] << shiftup) | (twooverpi[wordindex+2] >> shiftdown); - word3 = (twooverpi[wordindex+2] << shiftup) | (twooverpi[wordindex+3] >> shiftdown); - } else { - word1 = twooverpi[wordindex]; - word2 = twooverpi[wordindex+1]; - word3 = twooverpi[wordindex+2]; - } - - /* - * Do the multiplications, and add them together. - */ - unsigned long long mult1 = (unsigned long long)word1 * mantissa; - unsigned long long mult2 = (unsigned long long)word2 * mantissa; - unsigned long long mult3 = (unsigned long long)word3 * mantissa; - - unsigned /* bottom3 = (unsigned)mult3, */ top3 = (unsigned)(mult3 >> 32); - unsigned bottom2 = (unsigned)mult2, top2 = (unsigned)(mult2 >> 32); - unsigned bottom1 = (unsigned)mult1, top1 = (unsigned)(mult1 >> 32); - - unsigned out3, out2, out1, carry; - - out3 = top3 + bottom2; carry = (out3 < top3); - out2 = top2 + bottom1 + carry; carry = carry ? (out2 <= top2) : (out2 < top2); - out1 = top1 + carry; - - /* - * The two words we multiplied to get mult1 had their top - * bits at (respectively) place values 2^(151-e) and - * 2^(e-127). The value of those two bits multiplied - * together will have ended up in bit 62 (the - * topmost-but-one bit) of mult1, i.e. bit 30 of out1. - * Hence, that bit has place value 2^(151-e+e-127) = 2^24. - * So the integer value that we want to output as q, - * consisting of the bits with place values 2^1 and 2^0, - * must be 23 and 24 bits below that, i.e. in bits 7 and 6 - * of out1. - * - * Or, at least, it will be once we add 1/2, to round to the - * _nearest_ multiple of pi/2 rather than the next one down. - */ - *q = (out1 + (1<<5)) >> 6; - - /* - * Now we construct the output fraction, which is most - * simply done in the VFP. We just extract three consecutive - * bit strings from our chunk of binary data, convert them - * to integers, equip each with an appropriate FP exponent, - * add them together, and (don't forget) multiply back up by - * pi/2. That way we don't have to work out ourselves where - * the highest fraction bit ended up. - * - * Since our displacement from the nearest multiple of pi/2 - * can be positive or negative, the topmost of these three - * values must be arranged with its 2^-1 bit at the very top - * of the word, and then treated as a _signed_ integer. - */ - { - int i1 = (out1 << 26) | ((out2 >> 19) << 13); - unsigned i2 = out2 << 13; - unsigned i3 = out3; - float f1 = i1, f2 = i2 * (1.0f/524288.0f), f3 = i3 * (1.0f/524288.0f/524288.0f); - - /* - * Now f1+f2+f3 is a representation, potentially to - * twice double precision, of 2^32 times ((2/pi)*x minus - * some integer). So our remaining job is to multiply - * back down by (pi/2)*2^-32, and convert back to one - * single-precision output number. - */ - - /* Normalise to a prec-and-a-half representation... */ - float ftop = CLEARBOTTOMHALF(f1+f2+f3), fbot = f3-((ftop-f1)-f2); - - /* ... and multiply by a prec-and-a-half value of (pi/2)*2^-32. */ - float ret = (ftop * 0x1.92p-32F) + (ftop * 0x1.fb5444p-44F + fbot * 0x1.921fb6p-32F); - - /* Just before we return, take the input sign into account. */ - if (k & 0x80000000) { - *q = 0x10000000 - *q; - ret = -ret; - } - return ret; - } - } -} - -#ifdef __cplusplus -} /* end of extern "C" */ -#endif /* __cplusplus */ - -/* end of rredf.c */ diff --git a/math/single/rredf.h b/math/single/rredf.h deleted file mode 100644 index 41d3a0c..0000000 --- a/math/single/rredf.h +++ /dev/null @@ -1,146 +0,0 @@ -/* - * rredf.h - trigonometric range reduction function written new for RVCT 4.1 - * - * Copyright (c) 2009-2018, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -/* - * This header file defines an inline function which all three of - * the single-precision trig functions (sinf, cosf, tanf) should use - * to perform range reduction. The inline function handles the - * quickest and most common cases inline, before handing off to an - * out-of-line function defined in rredf.c for everything else. Thus - * a reasonable compromise is struck between speed and space. (I - * hope.) In particular, this approach avoids a function call - * overhead in the common case. - */ - -#ifndef _included_rredf_h -#define _included_rredf_h - -#include "math_private.h" - -#ifdef __cplusplus -extern "C" { -#endif /* __cplusplus */ - -extern float __mathlib_rredf2(float x, int *q, unsigned k); - -/* - * Semantics of the function: - * - x is the single-precision input value provided by the user - * - the return value is in the range [-pi/4,pi/4], and is equal - * (within reasonable accuracy bounds) to x minus n*pi/2 for some - * integer n. (FIXME: perhaps some slippage on the output - * interval is acceptable, requiring more range from the - * following polynomial approximations but permitting more - * approximate rred decisions?) - * - *q is set to a positive value whose low two bits match those - * of n. Alternatively, it comes back as -1 indicating that the - * input value was trivial in some way (infinity, NaN, or so - * small that we can safely return sin(x)=tan(x)=x,cos(x)=1). - */ -static __inline float __mathlib_rredf(float x, int *q) -{ - /* - * First, extract the bit pattern of x as an integer, so that we - * can repeatedly compare things to it without multiple - * overheads in retrieving comparison results from the VFP. - */ - unsigned k = fai(x); - - /* - * Deal immediately with the simplest possible case, in which x - * is already within the interval [-pi/4,pi/4]. This also - * identifies the subcase of ludicrously small x. - */ - if ((k << 1) < (0x3f490fdb << 1)) { - if ((k << 1) < (0x39800000 << 1)) - *q = -1; - else - *q = 0; - return x; - } - - /* - * The next plan is to multiply x by 2/pi and convert to an - * integer, which gives us n; then we subtract n*pi/2 from x to - * get our output value. - * - * By representing pi/2 in that final step by a prec-and-a-half - * approximation, we can arrange good accuracy for n strictly - * less than 2^13 (so that an FP representation of n has twelve - * zero bits at the bottom). So our threshold for this strategy - * is 2^13 * pi/2 - pi/4, otherwise known as 8191.75 * pi/2 or - * 4095.875*pi. (Or, for those perverse people interested in - * actual numbers rather than multiples of pi/2, about 12867.5.) - */ - if (__builtin_expect((k & 0x7fffffff) < 0x46490e49, 1)) { - float nf = 0.636619772367581343f * x; - /* - * The difference between that single-precision constant and - * the real 2/pi is about 2.568e-8. Hence the product nf has a - * potential error of 2.568e-8|x| even before rounding; since - * |x| < 4096 pi, that gives us an error bound of about - * 0.0003305. - * - * nf is then rounded to single precision, with a max error of - * 1/2 ULP, and since nf goes up to just under 8192, half a - * ULP could be as big as 2^-12 ~= 0.0002441. - * - * So by the time we convert nf to an integer, it could be off - * by that much, causing the wrong integer to be selected, and - * causing us to return a value a little bit outside the - * theoretical [-pi/4,+pi/4] output interval. - * - * How much outside? Well, we subtract nf*pi/2 from x, so the - * error bounds above have be be multiplied by pi/2. And if - * both of the above sources of error suffer their worst cases - * at once, then the very largest value we could return is - * obtained by adding that lot to the interval bound pi/4 to - * get - * - * pi/4 + ((2/pi - 0f_3f22f983)*4096*pi + 2^-12) * pi/2 - * - * which comes to 0f_3f494b02. (Compare 0f_3f490fdb = pi/4.) - * - * So callers of this range reducer should be prepared to - * handle numbers up to that large. - */ -#ifdef __TARGET_FPU_SOFTVFP - nf = _frnd(nf); -#else - if (k & 0x80000000) - nf = (nf - 8388608.0f) + 8388608.0f; - else - nf = (nf + 8388608.0f) - 8388608.0f; /* round to _nearest_ integer. FIXME: use some sort of frnd in softfp */ -#endif - *q = 3 & (int)nf; -#if 0 - /* - * FIXME: now I need a bunch of special cases to avoid - * having to do the full four-word reduction every time. - * Also, adjust the comment at the top of this section! - */ - if (__builtin_expect((k & 0x7fffffff) < 0x46490e49, 1)) - return ((x - nf * 0x1.92p+0F) - nf * 0x1.fb4p-12F) - nf * 0x1.4442d2p-24F; - else -#endif - return ((x - nf * 0x1.92p+0F) - nf * 0x1.fb4p-12F) - nf * 0x1.444p-24F - nf * 0x1.68c234p-39F; - } - - /* - * That's enough to do in-line; if we're still playing, hand off - * to the out-of-line main range reducer. - */ - return __mathlib_rredf2(x, q, k); -} - -#ifdef __cplusplus -} /* end of extern "C" */ -#endif /* __cplusplus */ - -#endif /* included */ - -/* end of rredf.h */ diff --git a/math/single/s_cosf.c b/math/single/s_cosf.c deleted file mode 100644 index 0e40b52..0000000 --- a/math/single/s_cosf.c +++ /dev/null @@ -1,11 +0,0 @@ -/* - * s_cosf.c - single precision cosine function - * - * Copyright (c) 2009-2018, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#define COSINE -#include "s_sincosf.c" - -/* end of s_cosf.c */ diff --git a/math/single/s_sincosf.c b/math/single/s_sincosf.c deleted file mode 100644 index 38a8511..0000000 --- a/math/single/s_sincosf.c +++ /dev/null @@ -1,109 +0,0 @@ -/* - * s_sincosf.c - single precision sine and cosine functions - * - * Copyright (c) 2009-2018, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -/* - * Source: my own head, and Remez-generated polynomial approximations. - */ - -#include <fenv.h> -#include <math.h> -#include <errno.h> -#include "rredf.h" -#include "math_private.h" - -#ifdef __cplusplus -extern "C" { -#endif /* __cplusplus */ - -#ifndef COSINE -#define FUNCNAME sinf -#define SOFTFP_FUNCNAME __softfp_sinf -#define DO_SIN (!(q & 1)) -#define NEGATE_SIN ((q & 2)) -#define NEGATE_COS ((q & 2)) -#define TRIVIAL_RESULT(x) FLOAT_CHECKDENORM(x) -#define ERR_INF MATHERR_SINF_INF -#else -#define FUNCNAME cosf -#define SOFTFP_FUNCNAME __softfp_cosf -#define DO_SIN (q & 1) -#define NEGATE_SIN (!(q & 2)) -#define NEGATE_COS ((q & 2)) -#define TRIVIAL_RESULT(x) 1.0f -#define ERR_INF MATHERR_COSF_INF -#endif - -float FUNCNAME(float x) -{ - int q; - - /* - * Range-reduce x to the range [-pi/4,pi/4]. - */ - { - /* - * I enclose the call to __mathlib_rredf in braces so that - * the address-taken-ness of qq does not propagate - * throughout the rest of the function, for what that might - * be worth. - */ - int qq; - x = __mathlib_rredf(x, &qq); - q = qq; - } - if (__builtin_expect(q < 0, 0)) { /* this signals tiny, inf, or NaN */ - unsigned k = fai(x) << 1; - if (k < 0xFF000000) /* tiny */ - return TRIVIAL_RESULT(x); - else if (k == 0xFF000000) /* inf */ - return ERR_INF(x); - else /* NaN */ - return FLOAT_INFNAN(x); - } - - /* - * Depending on the quadrant we were in, we may have to compute - * a sine-like function (f(0)=0) or a cosine-like one (f(0)=1), - * and we may have to negate it. - */ - if (DO_SIN) { - float x2 = x*x; - /* - * Coefficients generated by the command - -./auxiliary/remez.jl --variable=x2 --suffix=f -- '0' 'atan(BigFloat(1))^2' 2 0 'x==0 ? -1/BigFloat(6) : (x->(sin(x)-x)/x^3)(sqrt(x))' 'sqrt(x^3)' - - */ - x += x * (x2 * ( - -1.666665066929417292436220415142244613956015227491999719404711781344783392564922e-01f+x2*(8.331978663157089651408875887703995477889496917296385733254577121461421466427772e-03f+x2*(-1.949563623766929906511886482584265500187314705496861877317774185883215997931494e-04f)) - )); - if (NEGATE_SIN) - x = -x; - return x; - } else { - float x2 = x*x; - /* - * Coefficients generated by the command - -./auxiliary/remez.jl --variable=x2 --suffix=f -- '0' 'atan(BigFloat(1))^2' 2 0 'x==0 ? -1/BigFloat(6) : (x->(cos(x)-1)/x^2)(sqrt(x))' 'x' - - */ - x = 1.0f + x2*( - -4.999989478137016757327030935768632852012781143541026304540837816323349768666875e-01f+x2*(4.165629457842617238353362092016541041535652603456375154392942188742496860024377e-02f+x2*(-1.35978231111049428748566568960423202948250988565693107969571193763372093404347e-03f)) - ); - if (NEGATE_COS) - x = -x; - return x; - } -} - - -#ifdef __cplusplus -} /* end of extern "C" */ -#endif /* __cplusplus */ - -/* end of sincosf.c */ diff --git a/math/single/s_sinf.c b/math/single/s_sinf.c deleted file mode 100644 index ab0f628..0000000 --- a/math/single/s_sinf.c +++ /dev/null @@ -1,11 +0,0 @@ -/* - * s_sinf.c - single precision sine function - * - * Copyright (c) 2009-2018, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#define SINE -#include "s_sincosf.c" - -/* end of s_sinf.c */ diff --git a/math/single/s_tanf.c b/math/single/s_tanf.c deleted file mode 100644 index 0e13d95..0000000 --- a/math/single/s_tanf.c +++ /dev/null @@ -1,77 +0,0 @@ -/* - * s_tanf.c - single precision tangent function - * - * Copyright (c) 2009-2018, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -/* - * Source: my own head, and Remez-generated polynomial approximations. - */ - -#include <math.h> -#include "math_private.h" -#include <errno.h> -#include <fenv.h> -#include "rredf.h" - -#ifdef __cplusplus -extern "C" { -#endif /* __cplusplus */ - -float tanf(float x) -{ - int q; - - /* - * Range-reduce x to the range [-pi/4,pi/4]. - */ - { - /* - * I enclose the call to __mathlib_rredf in braces so that - * the address-taken-ness of qq does not propagate - * throughout the rest of the function, for what that might - * be worth. - */ - int qq; - x = __mathlib_rredf(x, &qq); - q = qq; - } - if (__builtin_expect(q < 0, 0)) { /* this signals tiny, inf, or NaN */ - unsigned k = fai(x) << 1; - if (k < 0xFF000000) /* tiny */ - return FLOAT_CHECKDENORM(x); - else if (k == 0xFF000000) /* inf */ - return MATHERR_TANF_INF(x); - else /* NaN */ - return FLOAT_INFNAN(x); - } - - /* - * We use a direct polynomial approximation for tan(x) on - * [-pi/4,pi/4], and then take the negative reciprocal of the - * result if we're in an interval surrounding an odd rather than - * even multiple of pi/2. - * - * Coefficients generated by the command - -./auxiliary/remez.jl --variable=x2 --suffix=f -- '0' '(pi/BigFloat(4))^2' 5 0 'x==0 ? 1/BigFloat(3) : (tan(sqrt(x))-sqrt(x))/sqrt(x^3)' 'sqrt(x^3)' - - */ - { - float x2 = x*x; - x += x * (x2 * ( - 3.333294809182307633621540045249152105330074691488121206914336806061620616979305e-01f+x2*(1.334274588580033216191949445078951865160600494428914956688702429547258497367525e-01f+x2*(5.315177279765676178198868818834880279286012428084733419724267810723468887753723e-02f+x2*(2.520300881849204519070372772571624013984546591252791443673871814078418474596388e-02f+x2*(2.051177187082974766686645514206648277055233230110624602600687812103764075834307e-03f+x2*(9.943421494628597182458186353995299429948224864648292162238582752158235742046109e-03f))))) - )); - if (q & 1) - x = -1.0f/x; - - return x; - } -} - -#ifdef __cplusplus -} /* end of extern "C" */ -#endif /* __cplusplus */ - -/* end of s_tanf.c */ diff --git a/math/tanf.c b/math/tanf.c deleted file mode 100644 index d3799f5..0000000 --- a/math/tanf.c +++ /dev/null @@ -1,3 +0,0 @@ -#if WANT_SINGLEPREC -#include "single/s_tanf.c" -#endif |