From da55ef9510a53822b5706c61ad97795828999c80 Mon Sep 17 00:00:00 2001 From: George Lander Date: Thu, 19 Nov 2015 12:05:06 +0000 Subject: Initial release of Optimized Routines --- math/CMakeLists.txt | 60 +++++ math/dunder.c | 66 +++++ math/e_expf.c | 132 ++++++++++ math/e_logf.c | 169 ++++++++++++ math/e_powf.c | 673 ++++++++++++++++++++++++++++++++++++++++++++++++ math/e_rem_pio2.c | 283 ++++++++++++++++++++ math/funder.c | 65 +++++ math/ieee_status.c | 52 ++++ math/include/arm_math.h | 33 +++ math/k_cos.c | 107 ++++++++ math/k_sin.c | 88 +++++++ math/k_tan.c | 154 +++++++++++ math/math_private.h | 137 ++++++++++ math/poly.c | 39 +++ math/rredf.c | 253 ++++++++++++++++++ math/rredf.h | 145 +++++++++++ math/s_cos.c | 98 +++++++ math/s_cosf.c | 25 ++ math/s_sin.c | 100 +++++++ math/s_sincosf.c | 123 +++++++++ math/s_sinf.c | 25 ++ math/s_tan.c | 136 ++++++++++ math/s_tanf.c | 91 +++++++ 23 files changed, 3054 insertions(+) create mode 100644 math/CMakeLists.txt create mode 100644 math/dunder.c create mode 100644 math/e_expf.c create mode 100644 math/e_logf.c create mode 100644 math/e_powf.c create mode 100644 math/e_rem_pio2.c create mode 100644 math/funder.c create mode 100644 math/ieee_status.c create mode 100644 math/include/arm_math.h create mode 100644 math/k_cos.c create mode 100644 math/k_sin.c create mode 100644 math/k_tan.c create mode 100644 math/math_private.h create mode 100644 math/poly.c create mode 100644 math/rredf.c create mode 100644 math/rredf.h create mode 100644 math/s_cos.c create mode 100644 math/s_cosf.c create mode 100644 math/s_sin.c create mode 100644 math/s_sincosf.c create mode 100644 math/s_sinf.c create mode 100644 math/s_tan.c create mode 100644 math/s_tanf.c (limited to 'math') diff --git a/math/CMakeLists.txt b/math/CMakeLists.txt new file mode 100644 index 0000000..86c7b18 --- /dev/null +++ b/math/CMakeLists.txt @@ -0,0 +1,60 @@ +# CMakeLists.txt +# +# Copyright (C) 2015, ARM Limited, All Rights Reserved +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# This file is part of the Optimized Routines project + +# Use -frounding-math so gcc doesn't constant fold away expressions that raise +# exceptions in funder.c and dunder.c. +set_source_files_properties(funder.c dunder.c PROPERTIES COMPILE_FLAGS -frounding-math) + +# Add files to the library. +set(mathlib_src + e_expf.c e_logf.c e_powf.c + e_rem_pio2.c + dunder.c funder.c ieee_status.c poly.c rredf.c + k_cos.c s_cos.c + s_cosf.c + k_sin.c s_sin.c + s_sinf.c + k_tan.c s_tan.c + s_tanf.c) + +add_library(mathlib SHARED ${mathlib_src}) +# Build a static library for cross-testing purposes (QEMU). +add_library(mathlib_static STATIC ${mathlib_src}) + +# Set CPU-specific tuning and compatibility flags. +if (MATHBENCH_TARGET_CPU MATCHES "A57") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=cortex-a57") +elseif (MATHBENCH_TARGET_CPU MATCHES "A53") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=cortex-a53") +else () + # Tuning for other CPUs is not supported yet. +endif () + +# Link against libc. +target_link_libraries(mathlib c) +target_link_libraries(mathlib_static c) + +# Install the shared library to the build directory. +install(TARGETS mathlib DESTINATION ${CMAKE_BINARY_DIR}/lib) +install(TARGETS mathlib_static DESTINATION ${CMAKE_BINARY_DIR}/lib) +configure_file( + ${PROJECT_SOURCE_DIR}/math/include/arm_math.h + ${PROJECT_BINARY_DIR}/include/arm_math.h + COPYONLY + ) diff --git a/math/dunder.c b/math/dunder.c new file mode 100644 index 0000000..8576bfd --- /dev/null +++ b/math/dunder.c @@ -0,0 +1,66 @@ +/* + * dunder.c - manually provoke FP exceptions for mathlib + * + * Copyright (C) 2009-2015, ARM Limited, All Rights Reserved + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * This file is part of the Optimized Routines project + */ + + +#include "math_private.h" +#include + +__inline double __mathlib_dbl_infnan(double x) +{ + return x+x; +} + +__inline double __mathlib_dbl_infnan2(double x, double y) +{ + return x+y; +} + +double __mathlib_dbl_underflow(void) +{ +#ifdef CLANG_EXCEPTIONS + feraiseexcept(FE_UNDERFLOW); +#endif + return 0x1p-767 * 0x1p-767; +} + +double __mathlib_dbl_overflow(void) +{ +#ifdef CLANG_EXCEPTIONS + feraiseexcept(FE_OVERFLOW); +#endif + return 0x1p+769 * 0x1p+769; +} + +double __mathlib_dbl_invalid(void) +{ +#ifdef CLANG_EXCEPTIONS + feraiseexcept(FE_INVALID); +#endif + return 0.0 / 0.0; +} + +double __mathlib_dbl_divzero(void) +{ +#ifdef CLANG_EXCEPTIONS + feraiseexcept(FE_DIVBYZERO); +#endif + return 1.0 / 0.0; +} diff --git a/math/e_expf.c b/math/e_expf.c new file mode 100644 index 0000000..49f1ea6 --- /dev/null +++ b/math/e_expf.c @@ -0,0 +1,132 @@ +/* + * e_expf.c - single-precision exp function + * + * Copyright (C) 2009-2015, ARM Limited, All Rights Reserved + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * This file is part of the Optimized Routines project + */ + +/* + * Algorithm was once taken from Cody & Waite, but has been munged + * out of all recognition by SGT. + */ + +#include "arm_math.h" +#include "math_private.h" +#include +#include + +float +ARM__expf(float X) +{ + int N; float XN, g, Rg, Result; + unsigned ix = fai(X), edgecaseflag = 0; + + /* + * Handle infinities, NaNs and big numbers. + */ + if (__builtin_expect((ix << 1) - 0x67000000 > 0x85500000 - 0x67000000, 0)) { + if (!(0x7f800000 & ~ix)) { + if (ix == 0xff800000) + return 0.0f; + else + return FLOAT_INFNAN(X);/* do the right thing with both kinds of NaN and with +inf */ + } else if ((ix << 1) < 0x67000000) { + return 1.0f; /* magnitude so small the answer can't be distinguished from 1 */ + } else if ((ix << 1) > 0x85a00000) { + __set_errno(ERANGE); + if (ix & 0x80000000) { + return FLOAT_UNDERFLOW; + } else { + return FLOAT_OVERFLOW; + } + } else { + edgecaseflag = 1; + } + } + + /* + * Split the input into an integer multiple of log(2)/4, and a + * fractional part. + */ + XN = X * (4.0f*1.4426950408889634074f); +#ifdef __TARGET_FPU_SOFTVFP + XN = _frnd(XN); + N = (int)XN; +#else + N = (int)(XN + (ix & 0x80000000 ? -0.5f : 0.5f)); + XN = N; +#endif + g = (X - XN * 0x1.62ep-3F) - XN * 0x1.0bfbe8p-17F; /* prec-and-a-half representation of log(2)/4 */ + + /* + * Now we compute exp(X) in, conceptually, three parts: + * - a pure power of two which we get from N>>2 + * - exp(g) for g in [-log(2)/8,+log(2)/8], which we compute + * using a Remez-generated polynomial approximation + * - exp(k*log(2)/4) (aka 2^(k/4)) for k in [0..3], which we + * get from a lookup table in precision-and-a-half and + * multiply by g. + * + * We gain a bit of extra precision by the fact that actually + * our polynomial approximation gives us exp(g)-1, and we add + * the 1 back on by tweaking the prec-and-a-half multiplication + * step. + * + * Coefficients generated by the command + +./auxiliary/remez.jl --variable=g --suffix=f -- '-log(BigFloat(2))/8' '+log(BigFloat(2))/8' 3 0 '(expm1(x))/x' + + */ + Rg = g * ( + 9.999999412829185331953781321128516523408059996430919985217971370689774264850229e-01f+g*(4.999999608551332693833317084753864837160947932961832943901913087652889900683833e-01f+g*(1.667292360203016574303631953046104769969440903672618034272397630620346717392378e-01f+g*(4.168230895653321517750133783431970715648192153539929404872173693978116154823859e-02f))) + ); + + /* + * Do the table lookup and combine it with Rg, to get our final + * answer apart from the exponent. + */ + { + static const float twotokover4top[4] = { 0x1p+0F, 0x1.306p+0F, 0x1.6ap+0F, 0x1.ae8p+0F }; + static const float twotokover4bot[4] = { 0x0p+0F, 0x1.fc1464p-13F, 0x1.3cccfep-13F, 0x1.3f32b6p-13F }; + static const float twotokover4all[4] = { 0x1p+0F, 0x1.306fep+0F, 0x1.6a09e6p+0F, 0x1.ae89fap+0F }; + int index = (N & 3); + Rg = twotokover4top[index] + (twotokover4bot[index] + twotokover4all[index]*Rg); + N >>= 2; + } + + /* + * Combine the output exponent and mantissa, and return. + */ + if (__builtin_expect(edgecaseflag, 0)) { + Result = fhex(((N/2) << 23) + 0x3f800000); + Result *= Rg; + Result *= fhex(((N-N/2) << 23) + 0x3f800000); + /* + * Step not mentioned in C&W: set errno reliably. + */ + if (fai(Result) == 0) + return MATHERR_EXPF_UFL(Result); + if (fai(Result) == 0x7f800000) + return MATHERR_EXPF_OFL(Result); + return FLOAT_CHECKDENORM(Result); + } else { + Result = fhex(N * 8388608.0f + (float)0x3f800000); + Result *= Rg; + } + + return Result; +} diff --git a/math/e_logf.c b/math/e_logf.c new file mode 100644 index 0000000..484b02d --- /dev/null +++ b/math/e_logf.c @@ -0,0 +1,169 @@ +/* + * e_logf.c - single precision log function + * + * Copyright (C) 2009-2015, ARM Limited, All Rights Reserved + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * This file is part of the Optimized Routines project + */ + +/* + * Algorithm was once taken from Cody & Waite, but has been munged + * out of all recognition by SGT. + */ + +#include "arm_math.h" +#include "math_private.h" +#include +#include + + +float +ARM__logf(float X) +{ + int N = 0; + int aindex; + float a, x, s; + unsigned ix = fai(X); + + if (__builtin_expect((ix - 0x00800000) >= 0x7f800000 - 0x00800000, 0)) { + if ((ix << 1) > 0xff000000) /* NaN */ + return FLOAT_INFNAN(X); + if (ix == 0x7f800000) /* +inf */ + return X; + if (X < 0) { /* anything negative */ + return MATHERR_LOGF_NEG(X); + } + if (X == 0) { + return MATHERR_LOGF_0(X); + } + /* That leaves denormals. */ + N = -23; + X *= 0x1p+23F; + ix = fai(X); + } + + /* + * Separate X into three parts: + * - 2^N for some integer N + * - a number a of the form (1+k/8) for k=0,...,7 + * - a residual which we compute as s = (x-a)/(x+a), for + * x=X/2^N. + * + * We pick the _nearest_ (N,a) pair, so that (x-a) has magnitude + * at most 1/16. Hence, we must round things that are just + * _below_ a power of two up to the next power of two, so this + * isn't as simple as extracting the raw exponent of the FP + * number. Instead we must grab the exponent together with the + * top few bits of the mantissa, and round (in integers) there. + */ + { + int rounded = ix + 0x00080000; + int Nnew = (rounded >> 23) - 127; + aindex = (rounded >> 20) & 7; + a = fhex(0x3f800000 + (aindex << 20)); + N += Nnew; + x = fhex(ix - (Nnew << 23)); + } + + if (!N && !aindex) { + /* + * Use an alternative strategy for very small |x|, which + * avoids the 1ULP of relative error introduced in the + * computation of s. If our nearest (N,a) pair is N=0,a=1, + * that means we have -1/32 < x-a < 1/16, on which interval + * the ordinary series for log(1+z) (setting z-x-a) will + * converge adequately fast; so we can simply find an + * approximation to log(1+z)/z good on that interval and + * scale it by z on the way out. + * + * Coefficients generated by the command + +./auxiliary/remez.jl --variable=z --suffix=f -- '-1/BigFloat(32)' '+1/BigFloat(16)' 3 0 '(log1p(x)-x)/x^2' + + */ + float z = x - 1.0f; + float p = z*z * ( + -4.999999767382730053173434595877399055021398381370452534949864039404089549132551e-01f+z*(3.333416379155995401749506866323446447523793085809161350343357014272193712456391e-01f+z*(-2.501299948811686421962724839011563450757435183422532362736159418564644404218257e-01f+z*(1.903576945606738444146078468935429697455230136403008172485495359631510244557255e-01f))) + ); + + return z + p; + } + + /* + * Now we have N, a and x correct, so that |x-a| <= 1/16. + * Compute s. + * + * (Since |x+a| >= 2, this means that |s| will be at most 1/32.) + */ + s = (x - a) / (x + a); + + /* + * The point of computing s = (x-a)/(x+a) was that this makes x + * equal to a * (1+s)/(1-s). So we can now compute log(x) by + * means of computing log((1+s)/(1-s)) (which has a more + * efficiently converging series), and adding log(a) which we + * obtain from a lookup table. + * + * So our full answer to log(X) is now formed by adding together + * N*log(2) + log(a) + log((1+s)/(1-s)). + * + * Now log((1+s)/(1-s)) has the exact Taylor series + * + * 2s + 2s^3/3 + 2s^5/5 + ... + * + * and what we do is to compute all but the first term of that + * as a polynomial approximation in s^2, then add on the first + * term - and all the other bits and pieces above - in + * precision-and-a-half so as to keep the total error down. + */ + { + float s2 = s*s; + + /* + * We want a polynomial L(s^2) such that + * + * 2s + s^3*L(s^2) = log((1+s)/(1-s)) + * + * => L(s^2) = (log((1+s)/(1-s)) - 2s) / s^3 + * + * => L(z) = (log((1+sqrt(z))/(1-sqrt(z))) - 2*sqrt(z)) / sqrt(z)^3 + * + * The required range of the polynomial is only [0,1/32^2]. + * + * Our accuracy requirement for the polynomial approximation + * is that we don't want to introduce any error more than + * about 2^-23 times the _top_ bit of s. But the value of + * the polynomial has magnitude about s^3; and since |s| < + * 2^-5, this gives us |s^3/s| < 2^-10. In other words, + * our approximation only needs to be accurate to 13 bits or + * so before its error is lost in the noise when we add it + * to everything else. + * + * Coefficients generated by the command + +./auxiliary/remez.jl --variable=s2 --suffix=f -- '0' '1/BigFloat(32^2)' 1 0 '(abs(x) < 1e-20 ? BigFloat(2)/3 + 2*x/5 + 2*x^2/7 + 2*x^3/9 : (log((1+sqrt(x))/(1-sqrt(x)))-2*sqrt(x))/sqrt(x^3))' + + */ + float p = s * s2 * ( + 6.666666325680271091157649745099739739798210281016897722498744752867165138320995e-01f+s2*(4.002792299542401431889592846825025487338520940900492146195427243856292349188402e-01f) + ); + + static const float log2hi = 0x1.62ep-1F, log2lo = 0x1.0bfbe8p-15F; + static const float logahi[8] = { 0x0p+0F, 0x1.e26p-4F, 0x1.c8ep-3F, 0x1.46p-2F, 0x1.9f2p-2F, 0x1.f12p-2F, 0x1.1e8p-1F, 0x1.41cp-1F }; + static const float logalo[8] = { 0x0p+0F, 0x1.076e2ap-16F, 0x1.f7c79ap-15F, 0x1.8bc21cp-14F, 0x1.23eccp-14F, 0x1.1ebf5ep-15F, 0x1.7d79c2p-15F, 0x1.8fe846p-13F }; + return (N*log2hi + logahi[aindex]) + (2.0f*s + (N*log2lo + logalo[aindex] + p)); + } +} diff --git a/math/e_powf.c b/math/e_powf.c new file mode 100644 index 0000000..3fa5d5d --- /dev/null +++ b/math/e_powf.c @@ -0,0 +1,673 @@ +/* + * e_powf.c - single-precision power function + * + * Copyright (C) 2009-2015, ARM Limited, All Rights Reserved + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * This file is part of the Optimized Routines project + */ + +#include "arm_math.h" +#include "math_private.h" +#include +#include + +float +ARM__powf(float x, float y) +{ + float logh, logl; + float rlogh, rlogl; + float sign = 1.0f; + int expadjust = 0; + unsigned ix, iy; + + ix = fai(x); + iy = fai(y); + + if (__builtin_expect((ix - 0x00800000) >= (0x7f800000 - 0x00800000) || + ((iy << 1) + 0x02000000) < 0x40000000, 0)) { + /* + * The above test rules out, as quickly as I can see how to, + * all possible inputs except for a normalised positive x + * being raised to the power of a normalised (and not + * excessively small) y. That's the fast-path case: if + * that's what the user wants, we can skip all of the + * difficult special-case handling. + * + * Now we must identify, as efficiently as we can, cases + * which will return to the fast path with a little tidying + * up. These are, in order of likelihood and hence of + * processing: + * + * - a normalised _negative_ x raised to the power of a + * non-zero finite y. Having identified this case, we + * must categorise y into one of the three categories + * 'odd integer', 'even integer' and 'non-integer'; for + * the last of these we return an error, while for the + * other two we rejoin the main code path having rendered + * x positive and stored an appropriate sign to append to + * the eventual result. + * + * - a _denormal_ x raised to the power of a non-zero + * finite y, in which case we multiply it up by a power + * of two to renormalise it, store an appropriate + * adjustment for its base-2 logarithm, and depending on + * the sign of y either return straight to the main code + * path or go via the categorisation of y above. + * + * - any of the above kinds of x raised to the power of a + * zero, denormal, nearly-denormal or nearly-infinite y, + * in which case we must do the checks on x as above but + * otherwise the algorithm goes through basically + * unchanged. Denormal and very tiny y values get scaled + * up to something not in range of accidental underflow + * when split into prec-and-a-half format; very large y + * values get scaled down by a factor of two to prevent + * CLEARBOTTOMHALF's round-up from overflowing them to + * infinity. (Of course the _output_ will overflow either + * way - the largest y value that can possibly yield a + * finite result is well below this range anyway - so + * this is a safe change.) + */ + if (__builtin_expect(((iy << 1) + 0x02000000) >= 0x40000000, 1)) { /* normalised and sensible y */ + y_ok_check_x: + + if (__builtin_expect((ix - 0x80800000) < (0xff800000 - 0x80800000), 1)) { /* normal but negative x */ + y_ok_x_negative: + + x = fabsf(x); + ix = fai(x); + + /* + * Determine the parity of y, if it's an integer at + * all. + */ + { + int yexp, yunitsbit; + + /* + * Find the exponent of y. + */ + yexp = (iy >> 23) & 0xFF; + /* + * Numbers with an exponent smaller than 0x7F + * are strictly smaller than 1, and hence must + * be fractional. + */ + if (yexp < 0x7F) + return MATHERR_POWF_NEGFRAC(x,y); + /* + * Numbers with an exponent at least 0x97 are by + * definition even integers. + */ + if (yexp >= 0x97) + goto mainpath; /* rejoin main code, giving positive result */ + /* + * In between, we must check the mantissa. + * + * Note that this case includes yexp==0x7f, + * which means 1 point something. In this case, + * the 'units bit' we're testing is semantically + * the lowest bit of the exponent field, not the + * leading 1 on the mantissa - but fortunately, + * that bit position will just happen to contain + * the 1 that we would wish it to, because the + * exponent describing that particular case just + * happens to be odd. + */ + yunitsbit = 0x96 - yexp; + if (iy & ((1 << yunitsbit)-1)) + return MATHERR_POWF_NEGFRAC(x,y); + else if (iy & (1 << yunitsbit)) + sign = -1.0f; /* y is odd; result should be negative */ + goto mainpath; /* now we can rejoin the main code */ + } + } else if (__builtin_expect((ix << 1) != 0 && (ix << 1) < 0x01000000, 0)) { /* denormal x */ + /* + * Renormalise x. + */ + x *= 0x1p+27F; + ix = fai(x); + /* + * Set expadjust to compensate for that. + */ + expadjust = -27; + + /* Now we need to handle negative x as above. */ + if (ix & 0x80000000) + goto y_ok_x_negative; + else + goto mainpath; + } else if ((ix - 0x00800000) < (0x7f800000 - 0x00800000)) { + /* normal positive x, back here from denormal-y case below */ + goto mainpath; + } + } else if (((iy << 1) + 0x02000000) >= 0x02000000) { /* denormal, nearly-denormal or zero y */ + if (y == 0.0F) { + /* + * y == 0. Any finite x returns 1 here. (Quiet NaNs + * do too, but we handle that below since we don't + * mind doing them more slowly.) + */ + if ((ix << 1) != 0 && (ix << 1) < 0xFF000000) + return 1.0f; + } else { + /* + * Denormal or very very small y. In this situation + * we have to be a bit careful, because when we + * break up y into precision-and-a-half later on we + * risk working with denormals and triggering + * underflow exceptions within this function that + * aren't related to the smallness of the output. So + * here we convert all such y values into a standard + * small-but-not-too-small value which will give the + * same output. + * + * What value should that be? Well, we work in + * 16*log2(x) below (equivalently, log to the base + * 2^{1/16}). So the maximum magnitude of that for + * any finite x is about 2416 (= 16 * (128+23), for + * log of the smallest denormal x), i.e. certainly + * less than 2^12. If multiplying that by y gives + * anything of magnitude less than 2^-32 (and even + * that's being generous), the final output will be + * indistinguishable from 1. So any value of y with + * magnitude less than 2^-(32+12) = 2^-44 is + * completely indistinguishable from any other such + * value. Hence we got here in the first place by + * checking the exponent of y against 64 (i.e. -63, + * counting the exponent bias), so we might as well + * normalise all tiny y values to the same threshold + * of 2^-64. + */ + iy = 0x1f800000 | (iy & 0x80000000); /* keep the sign; that's important */ + y = fhex(iy); + } + goto y_ok_check_x; + } else if (((iy << 1) + 0x02000000) < 0x01000000) { /* y in top finite exponent bracket */ + y = fhex(fai(y) - 0x00800000); /* scale down by a factor of 2 */ + goto y_ok_check_x; + } + + /* + * Having dealt with the above cases, we now know that + * either x is zero, infinite or NaN, or y is infinite or + * NaN, or both. We can deal with all of those cases without + * ever rejoining the main code path. + */ + if ((unsigned)(((ix & 0x7FFFFFFF) - 0x7f800001) < 0x7fc00000 - 0x7f800001) || + (unsigned)(((iy & 0x7FFFFFFF) - 0x7f800001) < 0x7fc00000 - 0x7f800001)) { + /* + * At least one signalling NaN. Do a token arithmetic + * operation on the two operands to provoke an exception + * and return the appropriate QNaN. + */ + return FLOAT_INFNAN2(x,y); + } else if (ix==0x3f800000 || (iy << 1)==0) { + /* + * C99 says that 1^anything and anything^0 should both + * return 1, _even for a NaN_. I modify that slightly to + * apply only to QNaNs (which doesn't violate C99, since + * C99 doesn't specify anything about SNaNs at all), + * because I can't bring myself not to throw an + * exception on an SNaN since its _entire purpose_ is to + * throw an exception whenever touched. + */ + return 1.0f; + } else + if (((ix & 0x7FFFFFFF) > 0x7f800000) || + ((iy & 0x7FFFFFFF) > 0x7f800000)) { + /* + * At least one QNaN. Do a token arithmetic operation on + * the two operands to get the right one to propagate to + * the output. + */ + return FLOAT_INFNAN2(x,y); + } else if (ix == 0x7f800000) { + /* + * x = +infinity. Return +infinity for positive y, +0 + * for negative y, and 1 for zero y. + */ + if (!(iy << 1)) + return MATHERR_POWF_INF0(x,y); + else if (iy & 0x80000000) + return 0.0f; + else + return INFINITY; + } else { + /* + * Repeat the parity analysis of y above, returning 1 + * (odd), 2 (even) or 0 (fraction). + */ + int ypar, yexp, yunitsbit; + yexp = (iy >> 23) & 0xFF; + if (yexp < 0x7F) + ypar = 0; + else if (yexp >= 0x97) + ypar = 2; + else { + yunitsbit = 0x96 - yexp; + if (iy & ((1 << yunitsbit)-1)) + ypar = 0; + else if (iy & (1 << yunitsbit)) + ypar = 1; + else + ypar = 2; + } + + if (ix == 0xff800000) { + /* + * x = -infinity. We return infinity or zero + * depending on whether y is positive or negative, + * and the sign is negative iff y is an odd integer. + * (SGT: I don't like this, but it's what C99 + * mandates.) + */ + if (!(iy & 0x80000000)) { + if (ypar == 1) + return -INFINITY; + else + return INFINITY; + } else { + if (ypar == 1) + return -0.0f; + else + return +0.0f; + } + } else if (ix == 0) { + /* + * x = +0. We return +0 for all positive y including + * infinity; a divide-by-zero-like error for all + * negative y including infinity; and an 0^0 error + * for zero y. + */ + if ((iy << 1) == 0) + return MATHERR_POWF_00(x,y); + else if (iy & 0x80000000) + return MATHERR_POWF_0NEGEVEN(x,y); + else + return +0.0f; + } else if (ix == 0x80000000) { + /* + * x = -0. We return errors in almost all cases (the + * exception being positive integer y, in which case + * we return a zero of the appropriate sign), but + * the errors are almost all different. Gah. + */ + if ((iy << 1) == 0) + return MATHERR_POWF_00(x,y); + else if (iy == 0x7f800000) + return MATHERR_POWF_NEG0FRAC(x,y); + else if (iy == 0xff800000) + return MATHERR_POWF_0NEG(x,y); + else if (iy & 0x80000000) + return (ypar == 0 ? MATHERR_POWF_0NEG(x,y) : + ypar == 1 ? MATHERR_POWF_0NEGODD(x,y) : + /* ypar == 2 ? */ MATHERR_POWF_0NEGEVEN(x,y)); + else + return (ypar == 0 ? MATHERR_POWF_NEG0FRAC(x,y) : + ypar == 1 ? -0.0f : + /* ypar == 2 ? */ +0.0f); + } else { + /* + * Now we know y is an infinity of one sign or the + * other and x is finite and nonzero. If x == -1 (+1 + * is already ruled out), we return +1; otherwise + * C99 mandates that we return either +0 or +inf, + * the former iff exactly one of |x| < 1 and y<0 is + * true. + */ + if (ix == 0xbf800000) { + return +1.0f; + } else if (!((ix << 1) < 0x7f000000) ^ !(iy & 0x80000000)) { + return +0.0f; + } + else { + return INFINITY; + } + } + } + } + + mainpath: + +#define PHMULTIPLY(rh,rl, xh,xl, yh,yl) do { \ + float tmph, tmpl; \ + tmph = (xh) * (yh); \ + tmpl = (xh) * (yl) + (xl) * ((yh)+(yl)); \ +/* printf("PHMULTIPLY: tmp=%08x+%08x\n", fai(tmph), fai(tmpl)); */ \ + (rh) = CLEARBOTTOMHALF(tmph + tmpl); \ + (rl) = tmpl + (tmph - (rh)); \ +} while (0) + +/* + * Same as the PHMULTIPLY macro above, but bounds the absolute value + * of rh+rl. In multiplications uncontrolled enough that rh can go + * infinite, we can get an IVO exception from the subtraction tmph - + * rh, so we should spot that case in advance and avoid it. + */ +#define PHMULTIPLY_SATURATE(rh,rl, xh,xl, yh,yl, bound) do { \ + float tmph, tmpl; \ + tmph = (xh) * (yh); \ + if (fabsf(tmph) > (bound)) { \ + (rh) = copysignf((bound),(tmph)); \ + (rl) = 0.0f; \ + } else { \ + tmpl = (xh) * (yl) + (xl) * ((yh)+(yl)); \ + (rh) = CLEARBOTTOMHALF(tmph + tmpl); \ + (rl) = tmpl + (tmph - (rh)); \ + } \ + } while (0) + + /* + * Determine log2 of x to relative prec-and-a-half, as logh + + * logl. + * + * Well, we're actually computing 16*log2(x), so that it's the + * right size for the subsequently fiddly messing with powers of + * 2^(1/16) in the exp step at the end. + */ + if (__builtin_expect((ix - 0x3f7ff000) <= (0x3f801000 - 0x3f7ff000), 0)) { + /* + * For x this close to 1, we write x = 1 + t and then + * compute t - t^2/2 + t^3/3 - t^4/4; and the neat bit is + * that t itself, being the bottom half of an input + * mantissa, is in half-precision already, so the output is + * naturally in canonical prec-and-a-half form. + */ + float t = x - 1.0; + float lnh, lnl; + /* + * Compute natural log of x in prec-and-a-half. + */ + lnh = t; + lnl = - (t * t) * ((1.0f/2.0f) - t * ((1.0f/3.0f) - t * (1.0f/4.0f))); + + /* + * Now we must scale by 16/log(2), still in prec-and-a-half, + * to turn this from natural log(x) into 16*log2(x). + */ + PHMULTIPLY(logh, logl, lnh, lnl, 0x1.716p+4F, -0x1.7135a8p-9F); + } else { + /* + * For all other x, we start by normalising to [1,2), and + * then dividing that further into subintervals. For each + * subinterval we pick a number a in that interval, compute + * s = (x-a)/(x+a) in precision-and-a-half, and then find + * the log base 2 of (1+s)/(1-s), still in precision-and-a- + * half. + * + * Why would we do anything so silly? For two main reasons. + * + * Firstly, if s = (x-a)/(x+a), then a bit of algebra tells + * us that x = a * (1+s)/(1-s); so once we've got + * log2((1+s)/(1-s)), we need only add on log2(a) and then + * we've got log2(x). So this lets us treat all our + * subintervals in essentially the same way, rather than + * requiring a separate approximation for each one; the only + * correction factor we need is to store a table of the + * base-2 logs of all our values of a. + * + * Secondly, log2((1+s)/(1-s)) is a nice thing to compute, + * once we've got s. Switching to natural logarithms for the + * moment (it's only a scaling factor to sort that out at + * the end), we write it as the difference of two logs: + * + * log((1+s)/(1-s)) = log(1+s) - log(1-s) + * + * Now recall that Taylor series expansion gives us + * + * log(1+s) = s - s^2/2 + s^3/3 - ... + * + * and therefore we also have + * + * log(1-s) = -s - s^2/2 - s^3/3 - ... + * + * These series are exactly the same except that the odd + * terms (s, s^3 etc) have flipped signs; so subtracting the + * latter from the former gives us + * + * log(1+s) - log(1-s) = 2s + 2s^3/3 + 2s^5/5 + ... + * + * which requires only half as many terms to be computed + * before the powers of s get too small to see. Then, of + * course, we have to scale the result by 1/log(2) to + * convert natural logs into logs base 2. + * + * To compute the above series in precision-and-a-half, we + * first extract a factor of 2s (which we can multiply back + * in later) so that we're computing 1 + s^2/3 + s^4/5 + ... + * and then observe that if s starts off small enough to + * make s^2/3 at most 2^-12, we need only compute the first + * couple of terms in laborious prec-and-a-half, and can + * delegate everything after that to a simple polynomial + * approximation whose error will end up at the bottom of + * the low word of the result. + * + * How many subintervals does that mean we need? + * + * To go back to s = (x-a)/(x+a). Let x = a + e, for some + * positive e. Then |s| = |e| / |2a+e| <= |e/2a|. So suppose + * we have n subintervals of equal width covering the space + * from 1 to 2. If a is at the centre of each interval, then + * we have e at most 1/2n and a can equal any of 1, 1+1/n, + * 1+2/n, ... 1+(n-1)/n. In that case, clearly the largest + * value of |e/2a| is given by the largest e (i.e. 1/2n) and + * the smallest a (i.e. 1); so |s| <= 1/4n. Hence, when we + * know how big we're prepared to let s be, we simply make + * sure 1/4n is at most that. + * + * And if we want s^2/3 to be at most 2^-12, then that means + * s^2 is at most 3*2^-12, so that s is at most sqrt(3)*2^-6 + * = 0.02706. To get 1/4n smaller than that, we need to have + * n>=9.23; so we'll set n=16 (for ease of bit-twiddling), + * and then s is at most 1/64. + */ + int n, i; + float a, ax, sh, sl, lsh, lsl; + + /* + * Let ax be x normalised to a single exponent range. + * However, the exponent range in question is not a simple + * one like [1,2). What we do is to round up the top four + * bits of the mantissa, so that the top 1/32 of each + * natural exponent range rounds up to the next one and is + * treated as a displacement from the lowest a in that + * range. + * + * So this piece of bit-twiddling gets us our input exponent + * and our subinterval index. + */ + n = (ix + 0x00040000) >> 19; + i = n & 15; + n = ((n >> 4) & 0xFF) - 0x7F; + ax = fhex(ix - (n << 23)); + n += expadjust; + + /* + * Compute the subinterval centre a. + */ + a = 1.0f + i * (1.0f/16.0f); + + /* + * Compute s = (ax-a)/(ax+a), in precision-and-a-half. + */ + { + float u, vh, vl, vapprox, rvapprox; + + u = ax - a; /* exact numerator */ + vapprox = ax + a; /* approximate denominator */ + vh = CLEARBOTTOMHALF(vapprox); + vl = (a - vh) + ax; /* vh+vl is exact denominator */ + rvapprox = 1.0f/vapprox; /* approximate reciprocal of denominator */ + + sh = CLEARBOTTOMHALF(u * rvapprox); + sl = ((u - sh*vh) - sh*vl) * rvapprox; + } + + /* + * Now compute log2(1+s) - log2(1-s). We do this in several + * steps. + * + * By polynomial approximation, we compute + * + * log(1+s) - log(1-s) + * p = ------------------- - 1 + * 2s + * + * in single precision only, using a single-precision + * approximation to s. This polynomial has s^2 as its + * lowest-order term, so we expect the result to be in + * [0,2^-12). + * + * Then we form a prec-and-a-half number out of 1 and p, + * which is therefore equal to (log(1+s) - log(1-s))/(2s). + * + * Finally, we do two prec-and-a-half multiplications: one + * by s itself, and one by the constant 32/log(2). + */ + { + float s = sh + sl; + float s2 = s*s; + /* + * p is actually a polynomial in s^2, with the first + * term constrained to zero. In other words, treated on + * its own terms, we're computing p(s^2) such that p(x) + * is an approximation to the sum of the series 1/3 + + * x/5 + x^2/7 + ..., valid on the range [0, 1/40^2]. + */ + float p = s2 * (0.33333332920177422f + s2 * 0.20008275183621479f); + float th, tl; + + PHMULTIPLY(th,tl, 1.0f,p, sh,sl); + PHMULTIPLY(lsh,lsl, th,tl, 0x1.716p+5F,-0x1.7135a8p-8F); + } + + /* + * And our final answer for 16*log2(x) is equal to 16n (from + * the exponent), plus lsh+lsl (the result of the above + * computation), plus 16*log2(a) which we must look up in a + * table. + */ + { + struct f2 { float h, l; }; + static const struct f2 table[16] = { + /* + * When constructing this table, we have to be sure + * that we produce the same values of a which will + * be produced by the computation above. Ideally, I + * would tell Perl to actually do its _arithmetic_ + * in single precision here; but I don't know a way + * to do that, so instead I just scrupulously + * convert every intermediate value to and from SP. + */ + // perl -e 'for ($i=0; $i<16; $i++) { $v = unpack "f", pack "f", 1/16.0; $a = unpack "f", pack "f", $i * $v; $a = unpack "f", pack "f", $a+1.0; $l = 16*log($a)/log(2); $top = unpack "f", pack "f", int($l*256.0+0.5)/256.0; $bot = unpack "f", pack "f", $l - $top; printf "{0f_%08X,0f_%08X}, ", unpack "VV", pack "ff", $top, $bot; } print "\n"' | fold -s -w56 | sed 's/^/ /' + {0x0p+0F,0x0p+0F}, {0x1.66p+0F,0x1.fb7d64p-11F}, + {0x1.5cp+1F,0x1.a39fbep-15F}, {0x1.fcp+1F,-0x1.f4a37ep-10F}, + {0x1.49cp+2F,-0x1.87b432p-10F}, {0x1.91cp+2F,-0x1.15db84p-12F}, + {0x1.d68p+2F,-0x1.583f9ap-11F}, {0x1.0c2p+3F,-0x1.f5fe54p-10F}, + {0x1.2b8p+3F,0x1.a39fbep-16F}, {0x1.49ap+3F,0x1.e12f34p-11F}, + {0x1.66ap+3F,0x1.1c8f12p-18F}, {0x1.828p+3F,0x1.3ab7cep-14F}, + {0x1.9d6p+3F,-0x1.30158p-12F}, {0x1.b74p+3F,0x1.291eaap-10F}, + {0x1.d06p+3F,-0x1.8125b4p-10F}, {0x1.e88p+3F,0x1.8d66c4p-10F}, + }; + float lah = table[i].h, lal = table[i].l; + float fn = 16*n; + logh = CLEARBOTTOMHALF(lsl + lal + lsh + lah + fn); + logl = lsl - ((((logh - fn) - lah) - lsh) - lal); + } + } + + /* + * Now we have 16*log2(x), multiply it by y in prec-and-a-half. + */ + { + float yh, yl; + int savedexcepts; + + yh = CLEARBOTTOMHALF(y); + yl = y - yh; + + /* This multiplication could become infinite, so to avoid IVO + * in PHMULTIPLY we bound the output at 4096, which is big + * enough to allow any non-overflowing case through + * unmodified. Also, we must mask out the OVF exception, which + * we won't want left in the FP status word in the case where + * rlogh becomes huge and _negative_ (since that will be an + * underflow from the perspective of powf's return value, not + * an overflow). */ + savedexcepts = __ieee_status(0,0) & (FE_IEEE_OVERFLOW | FE_IEEE_UNDERFLOW); + PHMULTIPLY_SATURATE(rlogh, rlogl, logh, logl, yh, yl, 4096.0f); + __ieee_status(FE_IEEE_OVERFLOW | FE_IEEE_UNDERFLOW, savedexcepts); + } + + /* + * And raise 2 to the power of whatever that gave. Again, this + * is done in three parts: the fractional part of our input is + * fed through a polynomial approximation, all but the bottom + * four bits of the integer part go straight into the exponent, + * and the bottom four bits of the integer part index into a + * lookup table of powers of 2^(1/16) in prec-and-a-half. + */ + { + float rlog = rlogh + rlogl; + int i16 = (rlog + (rlog < 0 ? -0.5f : +0.5f)); + float rlogi = i16 >> 4; + + float x = rlogl + (rlogh - i16); + + static const float powersof2to1over16top[16] = { 0x1p+0F, 0x1.0b4p+0F, 0x1.172p+0F, 0x1.238p+0F, 0x1.306p+0F, 0x1.3dep+0F, 0x1.4bep+0F, 0x1.5aap+0F, 0x1.6ap+0F, 0x1.7ap+0F, 0x1.8acp+0F, 0x1.9c4p+0F, 0x1.ae8p+0F, 0x1.c18p+0F, 0x1.d58p+0F, 0x1.ea4p+0F }; + static const float powersof2to1over16bot[16] = { 0x0p+0F, 0x1.586cfap-12F, 0x1.7078fap-13F, 0x1.e9b9d6p-14F, 0x1.fc1464p-13F, 0x1.4c9824p-13F, 0x1.dad536p-12F, 0x1.07dd48p-12F, 0x1.3cccfep-13F, 0x1.1473ecp-12F, 0x1.ca8456p-13F, 0x1.230548p-13F, 0x1.3f32b6p-13F, 0x1.9bdd86p-12F, 0x1.8dcfbap-16F, 0x1.5f454ap-13F }; + static const float powersof2to1over16all[16] = { 0x1p+0F, 0x1.0b5586p+0F, 0x1.172b84p+0F, 0x1.2387a6p+0F, 0x1.306fep+0F, 0x1.3dea64p+0F, 0x1.4bfdaep+0F, 0x1.5ab07ep+0F, 0x1.6a09e6p+0F, 0x1.7a1148p+0F, 0x1.8ace54p+0F, 0x1.9c4918p+0F, 0x1.ae89fap+0F, 0x1.c199bep+0F, 0x1.d5818ep+0F, 0x1.ea4afap+0F }; + /* + * Coefficients generated using the command + +./auxiliary/remez.jl --suffix=f -- '-1/BigFloat(2)' '+1/BigFloat(2)' 2 0 'expm1(x*log(BigFloat(2))/16)/x' + + */ + float p = x * ( + 4.332169876512769231967668743345473181486157887703125512683507537369503902991722e-02f+x*(9.384123108485637159805511308039285411735300871134684682779057580789341719567367e-04f+x*(1.355120515540562256928614563584948866224035897564701496826514330445829352922309e-05f)) + ); + int index = (i16 & 15); + p = powersof2to1over16top[index] + (powersof2to1over16bot[index] + powersof2to1over16all[index]*p); + + if ( + fabsf(rlogi) < 126.0f + ) { + return sign * p * fhex((unsigned)((127.0f+rlogi) * 8388608.0f)); + } else if ( + fabsf(rlogi) < 192.0f + ) { + int i = rlogi; + float ret; + + ret = sign * p * + fhex((unsigned)((0x7f+i/2) * 8388608)) * + fhex((unsigned)((0x7f+i-i/2) * 8388608)); + + if ((fai(ret) << 1) == 0xFF000000) + return MATHERR_POWF_OFL(x, y, sign); + else if ((fai(ret) << 1) == 0) + return MATHERR_POWF_UFL(x, y, sign); + else + return FLOAT_CHECKDENORM(ret); + } else { + if (rlogi < 0) + return MATHERR_POWF_UFL(x, y, sign); + else + return MATHERR_POWF_OFL(x, y, sign); + } + } +} diff --git a/math/e_rem_pio2.c b/math/e_rem_pio2.c new file mode 100644 index 0000000..5506378 --- /dev/null +++ b/math/e_rem_pio2.c @@ -0,0 +1,283 @@ +/* + * e_rem_pio2.c + * + * Copyright (C) 1999-2015, ARM Limited, All Rights Reserved + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * This file is part of the Optimized Routines project + */ + +#include "arm_math.h" +#include "math_private.h" +#include + +int ARM__ieee754_rem_pio2(double x, double *y) { + int q; + + y[1] = 0.0; /* default */ + + /* + * Simple cases: all nicked from the fdlibm version for speed. + */ + { + static const double invpio2 = 0x1.45f306dc9c883p-1; + static const double pio2s[] = { + 0x1.921fb544p+0, /* 1.57079632673412561417e+00 */ + 0x1.0b4611a626331p-34, /* 6.07710050650619224932e-11 */ + 0x1.0b4611a6p-34, /* 6.07710050630396597660e-11 */ + 0x1.3198a2e037073p-69, /* 2.02226624879595063154e-21 */ + 0x1.3198a2ep-69, /* 2.02226624871116645580e-21 */ + 0x1.b839a252049c1p-104, /* 8.47842766036889956997e-32 */ + }; + + double z,w,t,r,fn; + int i,j,idx,n,ix,hx; + + hx = __HI(x); /* high word of x */ + ix = hx&0x7fffffff; + if(ix<=0x3fe921fb) /* |x| ~<= pi/4 , no need for reduction */ + {y[0] = x; y[1] = 0; return 0;} + if(ix<0x4002d97c) { /* |x| < 3pi/4, special case with n=+-1 */ + if(hx>0) { + z = x - pio2s[0]; + if(ix!=0x3ff921fb) { /* 33+53 bit pi is good enough */ + y[0] = z - pio2s[1]; +#ifdef bottomhalf + y[1] = (z-y[0])-pio2s[1]; +#endif + } else { /* near pi/2, use 33+33+53 bit pi */ + z -= pio2s[2]; + y[0] = z - pio2s[3]; +#ifdef bottomhalf + y[1] = (z-y[0])-pio2s[3]; +#endif + } + return 1; + } else { /* negative x */ + z = x + pio2s[0]; + if(ix!=0x3ff921fb) { /* 33+53 bit pi is good enough */ + y[0] = z + pio2s[1]; +#ifdef bottomhalf + y[1] = (z-y[0])+pio2s[1]; +#endif + } else { /* near pi/2, use 33+33+53 bit pi */ + z += pio2s[2]; + y[0] = z + pio2s[3]; +#ifdef bottomhalf + y[1] = (z-y[0])+pio2s[3]; +#endif + } + return -1; + } + } + if(ix<=0x413921fb) { /* |x| ~<= 2^19*(pi/2), medium size */ + t = fabs(x); + n = (int) (t*invpio2+0.5); + fn = (double)n; + r = t-fn*pio2s[0]; + w = fn*pio2s[1]; /* 1st round good to 85 bit */ + j = ix>>20; + idx = 1; + while (1) { + y[0] = r-w; + if (idx == 3) + break; + i = j-(((__HI(y[0]))>>20)&0x7ff); + if(i <= 33*idx-17) + break; + t = r; + w = fn*pio2s[2*idx]; + r = t-w; + w = fn*pio2s[2*idx+1]-((t-r)-w); + idx++; + } +#ifdef bottomhalf + y[1] = (r-y[0])-w; +#endif + if(hx<0) { + y[0] = -y[0]; +#ifdef bottomhalf + y[1] = -y[1]; +#endif + return -n; + } else { + return n; + } + } + } + + { + static const unsigned twooverpi[] = { + /* We start with two zero words, because they take up less + * space than the array bounds checking and special-case + * handling that would have to occur in their absence. */ + 0, 0, + /* 2/pi in hex is 0.a2f9836e... */ + 0xa2f9836e, 0x4e441529, 0xfc2757d1, 0xf534ddc0, 0xdb629599, + 0x3c439041, 0xfe5163ab, 0xdebbc561, 0xb7246e3a, 0x424dd2e0, + 0x06492eea, 0x09d1921c, 0xfe1deb1c, 0xb129a73e, 0xe88235f5, + 0x2ebb4484, 0xe99c7026, 0xb45f7e41, 0x3991d639, 0x835339f4, + 0x9c845f8b, 0xbdf9283b, 0x1ff897ff, 0xde05980f, 0xef2f118b, + 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7, 0x4f463f66, 0x9e5fea2d, + 0x7527bac7, 0xebe5f17b, 0x3d0739f7, 0x8a5292ea, 0x6bfb5fb1, + 0x1f8d5d08, + }; + + /* + * Multiprecision multiplication of this nature is more + * readily done in integers than in VFP, since we can use + * UMULL (on CPUs that support it) to multiply 32 by 32 bits + * at a time whereas the VFP would only be able to do 12x12 + * without losing accuracy. + * + * So extract the mantissa of the input number as two 32-bit + * integers. + */ + unsigned mant1 = 0x00100000 | (__HI(x) & 0xFFFFF); + unsigned mant2 = __LO(x); + /* + * Now work out which part of our stored value of 2/pi we're + * supposed to be multiplying by. + * + * Let the IEEE exponent field of x be e. With its bias + * removed, (e-1023) is the index of the set bit in bit 20 + * of 'mant1' (i.e. that set bit has real place value + * 2^(e-1023)). So the lowest set bit in 'mant1', 52 bits + * further down, must have place value 2^(e-1075). + * + * We begin taking an interest in the value of 2/pi at the + * bit which multiplies by _that_ to give something with + * place value at most 2. In other words, the highest bit of + * 2/pi we're interested in is the one with place value + * 2/(2^(e-1075)) = 2^(1076-e). + * + * The bit at the top of the first zero word of the above + * array has place value 2^63. Hence, the bit we want to put + * at the top of the first word we extract from that array + * is the one at bit index n, where 63-n = 1076-e and hence + * n=e-1013. + */ + int topbitindex = ((__HI(x) >> 20) & 0x7FF) - 1013; + int wordindex = topbitindex >> 5; + int shiftup = topbitindex & 31; + int shiftdown = 32 - shiftup; + unsigned scaled[8]; + int i; + + scaled[7] = scaled[6] = 0; + + for (i = 6; i-- > 0 ;) { + /* + * Extract a word from our representation of 2/pi. + */ + unsigned word; + if (shiftup) + word = (twooverpi[wordindex + i] << shiftup) | (twooverpi[wordindex + i + 1] >> shiftdown); + else + word = twooverpi[wordindex + i]; + /* + * Multiply it by both words of our mantissa. (Should + * generate UMULLs where available.) + */ + unsigned long long mult1 = (unsigned long long)word * mant1; + unsigned long long mult2 = (unsigned long long)word * mant2; + /* + * Split those up into 32-bit chunks. + */ + unsigned bottom1 = (unsigned)mult1, top1 = (unsigned)(mult1 >> 32); + unsigned bottom2 = (unsigned)mult2, top2 = (unsigned)(mult2 >> 32); + /* + * Add those two chunks together. + */ + unsigned out1, out2, out3; + + out3 = bottom2; + out2 = top2 + bottom1; + out1 = top1 + (out2 < top2); + /* + * And finally add them to our 'scaled' array. + */ + unsigned s3 = scaled[i+2], s2 = scaled[i+1], s1; + unsigned carry; + s3 = out3 + s3; carry = (s3 < out3); + s2 = out2 + s2 + carry; carry = carry ? (s2 <= out2) : (s2 < out2); + s1 = out1 + carry; + + scaled[i+2] = s3; + scaled[i+1] = s2; + scaled[i] = s1; + } + + + /* + * The topmost set bit in mant1 is bit 20, and that has + * place value 2^(e-1023). The topmost bit (bit 31) of the + * most significant word we extracted from our twooverpi + * array had place value 2^(1076-e). So the product of those + * two bits must have place value 2^53; and that bit will + * have ended up as bit 19 of scaled[0]. Hence, the integer + * quadrant value we want to output, consisting of the bits + * with place values 2^1 and 2^0, must be 52 and 53 bits + * below that, i.e. precisely the topmost two bits of + * scaled[2]. + * + * Or, at least, it will be once we add 1/2, to round to the + * _nearest_ multiple of pi/2 rather than the next one down. + */ + q = (scaled[2] + (1<<29)) >> 30; + + /* + * Now we construct the output fraction, which is most + * simply done in the VFP. We just extract four consecutive + * bit strings from our chunk of binary data, convert them + * to doubles, equip each with an appropriate FP exponent, + * add them together, and (don't forget) multiply back up by + * pi/2. That way we don't have to work out ourselves where + * the highest fraction bit ended up. + * + * Since our displacement from the nearest multiple of pi/2 + * can be positive or negative, the topmost of these four + * values must be arranged with its 2^-1 bit at the very top + * of the word, and then treated as a _signed_ integer. + */ + int i1 = (scaled[2] << 2); + unsigned i2 = scaled[3]; + unsigned i3 = scaled[4]; + unsigned i4 = scaled[5]; + double f1 = i1, f2 = i2 * (0x1.0p-30), f3 = i3 * (0x1.0p-62), f4 = i4 * (0x1.0p-94); + + /* + * Now f1+f2+f3+f4 is a representation, potentially to + * twice double precision, of 2^32 times ((2/pi)*x minus + * some integer). So our remaining job is to multiply + * back down by (pi/2)*2^-32, and convert back to one + * single-precision output number. + */ + double ftop = f1 + (f2 + (f3 + f4)); + __LO(ftop) = 0; + double fbot = f4 - (((ftop-f1)-f2)-f3); + + /* ... and multiply by a prec-and-a-half value of (pi/2)*2^-32. */ + double ret = (ftop * 0x1.921fb54p-32) + (ftop * 0x1.10b4611a62633p-62 + fbot * 0x1.921fb54442d18p-32); + + /* Just before we return, take the input sign into account. */ + if (__HI(x) & 0x80000000) { + q = -q; + ret = -ret; + } + y[0] = ret; + return q; + } +} diff --git a/math/funder.c b/math/funder.c new file mode 100644 index 0000000..7c086d1 --- /dev/null +++ b/math/funder.c @@ -0,0 +1,65 @@ +/* + * funder.c - manually provoke SP exceptions for mathlib + * + * Copyright (C) 2009-2015, ARM Limited, All Rights Reserved + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * This file is part of the Optimized Routines project + */ + +#include "math_private.h" +#include + +__inline float __mathlib_flt_infnan2(float x, float y) +{ + return x+y; +} + +__inline float __mathlib_flt_infnan(float x) +{ + return x+x; +} + +float __mathlib_flt_underflow(void) +{ +#ifdef CLANG_EXCEPTIONS + feraiseexcept(FE_UNDERFLOW); +#endif + return 0x1p-95F * 0x1p-95F; +} + +float __mathlib_flt_overflow(void) +{ +#ifdef CLANG_EXCEPTIONS + feraiseexcept(FE_OVERFLOW); +#endif + return 0x1p+97F * 0x1p+97F; +} + +float __mathlib_flt_invalid(void) +{ +#ifdef CLANG_EXCEPTIONS + feraiseexcept(FE_INVALID); +#endif + return 0.0f / 0.0f; +} + +float __mathlib_flt_divzero(void) +{ +#ifdef CLANG_EXCEPTIONS + feraiseexcept(FE_DIVBYZERO); +#endif + return 1.0f / 0.0f; +} diff --git a/math/ieee_status.c b/math/ieee_status.c new file mode 100644 index 0000000..85fa48a --- /dev/null +++ b/math/ieee_status.c @@ -0,0 +1,52 @@ +/* + * ieee_status.c + * + * Copyright (C) 2015, ARM Limited, All Rights Reserved + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * This file is part of the Optimized Routines project + */ + +#include "math_private.h" + +__inline unsigned __ieee_status(unsigned bicmask, unsigned xormask) +{ +#ifdef __FP_FENV_EXCEPTIONS + unsigned status_word; + unsigned ret; + +#ifdef __FP_FENV_ROUNDING +# define MASK (1<<27)|FE_IEEE_FLUSHZERO|FE_IEEE_MASK_ALL_EXCEPT|FE_IEEE_ALL_EXCEPT|FE_IEEE_ROUND_MASK +#else +# define MASK (1<<27)|FE_IEEE_FLUSHZERO|FE_IEEE_MASK_ALL_EXCEPT|FE_IEEE_ALL_EXCEPT +#endif + + /* mask out read-only bits */ + bicmask &= MASK; + xormask &= MASK; + + /* modify the status word */ + __asm__ __volatile__ ("mrs %0, fpsr" : "=r" (status_word)); + ret = status_word; + status_word &= ~bicmask; + status_word ^= xormask; + __asm__ __volatile__ ("msr fpsr, %0" : : "r" (status_word)); + + /* and return what it used to be */ + return ret; +#else + return 0; +#endif +} diff --git a/math/include/arm_math.h b/math/include/arm_math.h new file mode 100644 index 0000000..c0a70d9 --- /dev/null +++ b/math/include/arm_math.h @@ -0,0 +1,33 @@ +/* + * arm_math.h - ARM math library function definitions + * + * Copyright (C) 2015, ARM Limited, All Rights Reserved + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * This file is part of the Optimized Routines project + */ + +/* Double precision */ +__attribute__((const)) double ARM__sin(double); +__attribute__((const)) double ARM__cos(double); +double ARM__tan(double); + +/* Single precision */ +float ARM__sinf(float); +float ARM__cosf(float); +float ARM__tanf(float); +float ARM__expf(float); +float ARM__logf(float); +float ARM__powf(float, float); diff --git a/math/k_cos.c b/math/k_cos.c new file mode 100644 index 0000000..fa4d31b --- /dev/null +++ b/math/k_cos.c @@ -0,0 +1,107 @@ +/* + * k_cos.c + * + * Copyright (C) 1998-2015, ARM Limited, All Rights Reserved + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * This file is part of the Optimized Routines project + */ + +/* @(#)k_cos.c 1.3 95/01/18 */ +/* + * ==================================================== + * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. + * + * Developed at SunSoft, a Sun Microsystems, Inc. business. + * Permission to use, copy, modify, and distribute this + * software is freely granted, provided that this notice + * is preserved. + * ==================================================== + */ + +/* + * __kernel_cos( x, y ) + * kernel cos function on [-pi/4, pi/4], pi/4 ~ 0.785398164 + * Input x is assumed to be bounded by ~pi/4 in magnitude. + * Input y is the tail of x. + * + * Algorithm + * 1. Since cos(-x) = cos(x), we need only to consider positive x. + * 2. if x < 2^-27 (hx<0x3e400000 0), return 1 with inexact if x!=0. + * 3. cos(x) is approximated by a polynomial of degree 14 on + * [0,pi/4] + * 4 14 + * cos(x) ~ 1 - x*x/2 + C1*x + ... + C6*x + * where the remez error is + * + * | 2 4 6 8 10 12 14 | -58 + * |cos(x)-(1-.5*x +C1*x +C2*x +C3*x +C4*x +C5*x +C6*x )| <= 2 + * | | + * + * 4 6 8 10 12 14 + * 4. let r = C1*x +C2*x +C3*x +C4*x +C5*x +C6*x , then + * cos(x) = 1 - x*x/2 + r + * since cos(x+y) ~ cos(x) - sin(x)*y + * ~ cos(x) - x*y, + * a correction term is necessary in cos(x) and hence + * cos(x+y) = 1 - (x*x/2 - (r - x*y)) + * For better accuracy when x > 0.3, let qx = |x|/4 with + * the last 32 bits mask off, and if x > 0.78125, let qx = 0.28125. + * Then + * cos(x+y) = (1-qx) - ((x*x/2-qx) - (r-x*y)). + * Note that 1-qx and (x*x/2-qx) is EXACT here, and the + * magnitude of the latter is at least a quarter of x*x/2, + * thus, reducing the rounding error in the subtraction. + */ + +#include "arm_math.h" +#include "math_private.h" + +static const double +one = 0x1p+0, /* 1.00000000000000000000e+00 */ +C[] = { 0x1.555555555554cp-5, /* 4.16666666666666019037e-02 */ + -0x1.6c16c16c15177p-10, /* -1.38888888888741095749e-03 */ + 0x1.a01a019cb159p-16, /* 2.48015872894767294178e-05 */ + -0x1.27e4f809c52adp-22, /* -2.75573143513906633035e-07 */ + 0x1.1ee9ebdb4b1c4p-29, /* 2.08757232129817482790e-09 */ + -0x1.8fae9be8838d4p-37, /* -1.13596475577881948265e-11 */ }; + +double +ARM__kernel_cos(double x, double y) +{ + double a,hz,z,r,qx; + int ix; + ix = __HI(x)&0x7fffffff; /* ix = |x|'s high word*/ + if(ix<0x3e400000) { /* if x < 2**27 */ + if(((int)x)==0) return one; /* generate inexact */ + } + z = x*x; + r = z*ARM__kernel_poly(C,6,z); + r = z*r - x*y; + z = 0.5 * z; + if(ix < 0x3FD33333) /* if |x| < 0.3 */ + return one - (z - r); + else { + if(ix > 0x3fe90000) { /* x > 0.78125 */ + qx = 0.28125; + } else { + __HI(qx) = ix-0x00200000; /* x/4 */ + __LO(qx) = 0; + } + hz = z-qx; + a = one-qx; + return a - (hz - r); + } +} diff --git a/math/k_sin.c b/math/k_sin.c new file mode 100644 index 0000000..3b97028 --- /dev/null +++ b/math/k_sin.c @@ -0,0 +1,88 @@ +/* + * k_sin.c + * + * Copyright (C) 1998-2015, ARM Limited, All Rights Reserved + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * This file is part of the Optimized Routines project + */ + +/* @(#)k_sin.c 1.3 95/01/18 */ +/* + * ==================================================== + * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. + * + * Developed at SunSoft, a Sun Microsystems, Inc. business. + * Permission to use, copy, modify, and distribute this + * software is freely granted, provided that this notice + * is preserved. + * ==================================================== + */ + +/* __kernel_sin( x, y, iy) + * kernel sin function on [-pi/4, pi/4], pi/4 ~ 0.7854 + * Input x is assumed to be bounded by ~pi/4 in magnitude. + * Input y is the tail of x. + * Input iy indicates whether y is 0. (if iy=0, y assume to be 0). + * + * Algorithm + * 1. Since sin(-x) = -sin(x), we need only to consider positive x. + * 2. if x < 2^-27 (hx<0x3e400000 0), return x with inexact if x!=0. + * 3. sin(x) is approximated by a polynomial of degree 13 on + * [0,pi/4] + * 3 13 + * sin(x) ~ x + S1*x + ... + S6*x + * where + * + * |sin(x) 2 4 6 8 10 12 | -58 + * |----- - (1+S1*x +S2*x +S3*x +S4*x +S5*x +S6*x )| <= 2 + * | x | + * + * 4. sin(x+y) = sin(x) + sin'(x')*y + * ~ sin(x) + (1-x*x/2)*y + * For better accuracy, let + * 3 2 2 2 2 + * r = x *(S2+x *(S3+x *(S4+x *(S5+x *S6)))) + * then 3 2 + * sin(x) = x + (S1*x + (x *(r-y/2)+y)) + */ + +#include "arm_math.h" +#include "math_private.h" +#include + +static const double + half = 0x1p-1, /* 5.00000000000000000000e-01 */ + S1 = -0x1.5555555555549p-3, /* -1.66666666666666324348e-01 */ + S[] = { 0x1.111111110f8a6p-7, /* 8.33333333332248946124e-03 */ + -0x1.a01a019c161d5p-13, /* -1.98412698298579493134e-04 */ + 0x1.71de357b1fe7dp-19, /* 2.75573137070700676789e-06 */ + -0x1.ae5e68a2b9cebp-26, /* -2.50507602534068634195e-08 */ + 0x1.5d93a5acfd57cp-33, /* 1.58969099521155010221e-10 */ }; + +double +ARM__kernel_sin(double x, double y, int iy) +{ + double z,r,v; + int ix; + ix = __HI(x)&0x7fffffff; /* high word of x */ + if(ix<0x3e400000) /* |x| < 2**-27 */ + return DOUBLE_CHECKDENORM(x); + z = x*x; + v = z*x; + r = ARM__kernel_poly(S,5,z); + if(iy==0) return x+v*(S1+z*r); + else return x-((z*(half*y-v*r)-y)-v*S1); +} diff --git a/math/k_tan.c b/math/k_tan.c new file mode 100644 index 0000000..50c1201 --- /dev/null +++ b/math/k_tan.c @@ -0,0 +1,154 @@ +/* + * k_tan.c + * + * Copyright (C) 1998-2015, ARM Limited, All Rights Reserved + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * This file is part of the Optimized Routines project + */ + +/* @(#)k_tan.c 1.3 95/01/18 */ +/* + * ==================================================== + * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. + * + * Developed at SunSoft, a Sun Microsystems, Inc. business. + * Permission to use, copy, modify, and distribute this + * software is freely granted, provided that this notice + * is preserved. + * ==================================================== + */ + +/* __kernel_tan( x, y, k ) + * kernel tan function on [-pi/4, pi/4], pi/4 ~ 0.7854 + * Input x is assumed to be bounded by ~pi/4 in magnitude. + * Input y is the tail of x. + * Input k indicates whether tan (if k=1) or + * -1/tan (if k= -1) is returned. + * + * Algorithm + * 1. Since tan(-x) = -tan(x), we need only to consider positive x. + * 2. if x < 2^-28 (hx<0x3e300000 0), return x with inexact if x!=0. + * 3. tan(x) is approximated by a odd polynomial of degree 27 on + * [0,0.67434] + * 3 27 + * tan(x) ~ x + T1*x + ... + T13*x + * where + * + * |tan(x) 2 4 26 | -59.2 + * |----- - (1+T1*x +T2*x +.... +T13*x )| <= 2 + * | x | + * + * Note: tan(x+y) = tan(x) + tan'(x)*y + * ~ tan(x) + (1+x*x)*y + * Therefore, for better accuracy in computing tan(x+y), let + * 3 2 2 2 2 + * r = x *(T2+x *(T3+x *(...+x *(T12+x *T13)))) + * then + * 3 2 + * tan(x+y) = x + (T1*x + (x *(r+y)+y)) + * + * 4. For x in [0.67434,pi/4], let y = pi/4 - x, then + * tan(x) = tan(pi/4-y) = (1-tan(y))/(1+tan(y)) + * = 1 - 2*(tan(y) - (tan(y)^2)/(1+tan(y))) + */ + +#include +#include "math_private.h" + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ +#ifdef __STDC__ +static const double +#else +static double +#endif +one = 0x1p+0, /* 1.00000000000000000000e+00 */ +pio4 = 0x1.921fb54442d18p-1, /* 7.85398163397448278999e-01 */ +pio4lo= 0x1.1a62633145c07p-55, /* 3.06161699786838301793e-17 */ +T0 = 0x1.5555555555563p-2, /* 3.33333333333334091986e-01 */ +Todd[] = { + 0x1.111111110fe7ap-3, /* 1.33333333333201242699e-01 */ + 0x1.664f48406d637p-6, /* 2.18694882948595424599e-02 */ + 0x1.d6d22c9560328p-9, /* 3.59207910759131235356e-03 */ + 0x1.344d8f2f26501p-11, /* 5.88041240820264096874e-04 */ + 0x1.47e88a03792a6p-14, /* 7.81794442939557092300e-05 */ + -0x1.375cbdb605373p-16, /* -1.85586374855275456654e-05 */ +}, +Teven[] = { + 0x1.ba1ba1bb341fep-5, /* 5.39682539762260521377e-02 */ + 0x1.226e3e96e8493p-7, /* 8.86323982359930005737e-03 */ + 0x1.7dbc8fee08315p-10, /* 1.45620945432529025516e-03 */ + 0x1.026f71a8d1068p-12, /* 2.46463134818469906812e-04 */ + 0x1.2b80f32f0a7e9p-14, /* 7.14072491382608190305e-05 */ + 0x1.b2a7074bf7ad4p-16, /* 2.59073051863633712884e-05 */ +}; + + +double ARM__kernel_tan(double x, double y, int iy) +{ + double z,r,v,w,s; + int ix,hx; + hx = __HI(x); /* high word of x */ + ix = hx&0x7fffffff; /* high word of |x| */ + if(ix<0x3e300000) { /* x < 2**-28 */ + if((int)x==0) { /* generate inexact */ + if(((ix|__LO(x))|(iy+1))==0) return one/fabs(x); + else return (iy==1)? DOUBLE_CHECKDENORM(x): -one/x; + } + } + if(ix>=0x3FE59428) { /* |x|>=0.6744 */ + if(hx<0) {x = -x; y = -y;} + z = pio4-x; + w = pio4lo-y; + x = z+w; y = 0.0; + } + z = x*x; + w = z*z; + /* Break x^5*(T[1]+x^2*T[2]+...) into + * x^5(T[1]+x^4*T[3]+...+x^20*T[11]) + + * x^5(x^2*(T[2]+x^4*T[4]+...+x^22*[T12])) + */ + r = ARM__kernel_poly(Todd, 6, w); + v = z*ARM__kernel_poly(Teven, 6, w); + s = z*x; + r = y + z*(s*(r+v)+y); + r += T0*s; + w = x+r; + if(ix>=0x3FE59428) { + v = (double)iy; + return (double)(1-((hx>>30)&2))*(v-2.0*(x-(w*w/(w+v)-r))); + } + if(iy==1) return w; + else { /* if allow error up to 2 ulp, + simply return -1.0/(x+r) here */ + /* compute -1.0/(x+r) accurately */ + double a,t; + z = w; + __LO(z) = 0; + v = r-(z - x); /* z+v = r+x */ + t = a = -1.0/w; /* a = -1.0/w */ + __LO(t) = 0; + s = 1.0+t*z; + return t+a*(s+t*v); + } +} + +#ifdef __cplusplus +} /* end of extern "C" */ +#endif /* __cplusplus */ + +/* end of tan_i.c */ diff --git a/math/math_private.h b/math/math_private.h new file mode 100644 index 0000000..75f222b --- /dev/null +++ b/math/math_private.h @@ -0,0 +1,137 @@ +/* + * math_private.h + * + * Copyright (C) 2015, ARM Limited, All Rights Reserved + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * This file is part of the Optimized Routines project + */ + +/* + * Header file containing some definitions and other small reusable pieces of + * code that we need in the libraries. + */ + +#ifndef __MATH_PRIVATE_H +#define __MATH_PRIVATE_H + +#include +#include + +extern int ARM__ieee754_rem_pio2(double, double *); +extern double ARM__kernel_sin(double, double, int); +extern double ARM__kernel_cos(double, double); +extern double ARM__kernel_poly(const double *, int, double); + +#define __FP_IEEE +#define __FP_FENV_EXCEPTIONS +#define __FP_FENV_ROUNDING +#define __FP_INEXACT_EXCEPTION + +#define __set_errno(val) (errno = (val)) + +#define __FLT(x) (*(unsigned *)&(x)) +#if defined(__ARM_BIG_ENDIAN) || defined(__BIG_ENDIAN) +# define __LO(x) (*(1 + (unsigned *)&(x))) +# define __HI(x) (*(unsigned *)&(x)) +#else /* !defined(__ARM_BIG_ENDIAN) && !defined(__BIG_ENDIAN) */ +# define __HI(x) (*(1 + (unsigned *)&(x))) +# define __LO(x) (*(unsigned *)&(x)) +#endif /* !defined(__ARM_BIG_ENDIAN) && !defined(__BIG_ENDIAN) */ + +// FIXME: Implement these without type punning. +static __inline unsigned int fai(float f) { return __FLT(f); } +static __inline float fhex(unsigned int n) { float f; __FLT(f) = n; return f; } + +#define CLEARBOTTOMHALF(x) fhex((fai(x) + 0x00000800) & 0xFFFFF000) + +#define FE_IEEE_OVERFLOW (0x00000004) +#define FE_IEEE_UNDERFLOW (0x00000008) +#define FE_IEEE_FLUSHZERO (0x01000000) +#define FE_IEEE_ROUND_TONEAREST (0x00000000) +#define FE_IEEE_ROUND_UPWARD (0x00400000) +#define FE_IEEE_ROUND_DOWNWARD (0x00800000) +#define FE_IEEE_ROUND_TOWARDZERO (0x00C00000) +#define FE_IEEE_ROUND_MASK (0x00C00000) +#define FE_IEEE_MASK_INVALID (0x00000100) +#define FE_IEEE_MASK_DIVBYZERO (0x00000200) +#define FE_IEEE_MASK_OVERFLOW (0x00000400) +#define FE_IEEE_MASK_UNDERFLOW (0x00000800) +#define FE_IEEE_MASK_INEXACT (0x00001000) +#define FE_IEEE_MASK_INPUTDENORMAL (0x00008000) +#define FE_IEEE_MASK_ALL_EXCEPT (0x00009F00) +#define FE_IEEE_INVALID (0x00000001) +#define FE_IEEE_DIVBYZERO (0x00000002) +#define FE_IEEE_INEXACT (0x00000010) +#define FE_IEEE_INPUTDENORMAL (0x00000080) +#define FE_IEEE_ALL_EXCEPT (0x0000009F) + +extern double __mathlib_dbl_overflow(void); +extern float __mathlib_flt_overflow(void); +extern double __mathlib_dbl_underflow(void); +extern float __mathlib_flt_underflow(void); +extern double __mathlib_dbl_invalid(void); +extern float __mathlib_flt_invalid(void); +extern double __mathlib_dbl_divzero(void); +extern float __mathlib_flt_divzero(void); +#define DOUBLE_OVERFLOW ( __mathlib_dbl_overflow() ) +#define FLOAT_OVERFLOW ( __mathlib_flt_overflow() ) +#define DOUBLE_UNDERFLOW ( __mathlib_dbl_underflow() ) +#define FLOAT_UNDERFLOW ( __mathlib_flt_underflow() ) +#define DOUBLE_INVALID ( __mathlib_dbl_invalid() ) +#define FLOAT_INVALID ( __mathlib_flt_invalid() ) +#define DOUBLE_DIVZERO ( __mathlib_dbl_divzero() ) +#define FLOAT_DIVZERO ( __mathlib_flt_divzero() ) + +extern float __mathlib_flt_infnan(float); +extern float __mathlib_flt_infnan2(float, float); +extern double __mathlib_dbl_infnan(double); +extern double __mathlib_dbl_infnan2(double, double); +extern unsigned __ieee_status(unsigned, unsigned); +extern double ARM__kernel_tan(double, double, int); + +#define FLOAT_INFNAN(x) __mathlib_flt_infnan(x) +#define FLOAT_INFNAN2(x,y) __mathlib_flt_infnan2(x,y) +#define DOUBLE_INFNAN(x) __mathlib_dbl_infnan(x) +#define DOUBLE_INFNAN2(x,y) __mathlib_dbl_infnan2(x,y) + +#define MATHERR_POWF_00(x,y) (__set_errno(EDOM), 1.0f) +#define MATHERR_POWF_INF0(x,y) (__set_errno(EDOM), 1.0f) +#define MATHERR_POWF_0NEG(x,y) (__set_errno(ERANGE), FLOAT_DIVZERO) +#define MATHERR_POWF_NEG0FRAC(x,y) (0.0f) +#define MATHERR_POWF_0NEGODD(x,y) (__set_errno(ERANGE), -FLOAT_DIVZERO) +#define MATHERR_POWF_0NEGEVEN(x,y) (__set_errno(ERANGE), FLOAT_DIVZERO) +#define MATHERR_POWF_NEGFRAC(x,y) (__set_errno(EDOM), FLOAT_INVALID) +#define MATHERR_POWF_ONEINF(x,y) (1.0f) +#define MATHERR_POWF_OFL(x,y,z) (__set_errno(ERANGE), copysignf(FLOAT_OVERFLOW,z)) +#define MATHERR_POWF_UFL(x,y,z) (__set_errno(ERANGE), copysignf(FLOAT_UNDERFLOW,z)) + +#define MATHERR_LOGF_0(x) (__set_errno(ERANGE), -FLOAT_DIVZERO) +#define MATHERR_LOGF_NEG(x) (__set_errno(EDOM), FLOAT_INVALID) + +#define MATHERR_SIN_INF(x) (__set_errno(EDOM), DOUBLE_INVALID) +#define MATHERR_SINF_INF(x) (__set_errno(EDOM), FLOAT_INVALID) +#define MATHERR_COS_INF(x) (__set_errno(EDOM), DOUBLE_INVALID) +#define MATHERR_COSF_INF(x) (__set_errno(EDOM), FLOAT_INVALID) +#define MATHERR_TAN_INF(x) (__set_errno(EDOM), DOUBLE_INVALID) +#define MATHERR_TANF_INF(x) (__set_errno(EDOM), FLOAT_INVALID) + +#define MATHERR_EXPF_UFL(x) (__set_errno(ERANGE), FLOAT_UNDERFLOW) +#define MATHERR_EXPF_OFL(x) (__set_errno(ERANGE), FLOAT_OVERFLOW) + +#define FLOAT_CHECKDENORM(x) ( (fpclassify(x) == FP_SUBNORMAL ? FLOAT_UNDERFLOW : 0), x ) +#define DOUBLE_CHECKDENORM(x) ( (fpclassify(x) == FP_SUBNORMAL ? DOUBLE_UNDERFLOW : 0), x ) + +#endif /* __MATH_PRIVATE_H */ diff --git a/math/poly.c b/math/poly.c new file mode 100644 index 0000000..afb5368 --- /dev/null +++ b/math/poly.c @@ -0,0 +1,39 @@ +/* + * poly.c + * + * Copyright (C) 1998-2015, ARM Limited, All Rights Reserved + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * This file is part of the Optimized Routines project + */ + +double ARM__kernel_poly(const double *coeffs, int n, double x) +{ + double result = coeffs[--n]; + + while ((n & ~0x6) != 0) /* Loop until n even and < 8 */ + result = (result * x) + coeffs[--n]; + + switch (n) + { + case 6: result = (result * x) + coeffs[5]; + result = (result * x) + coeffs[4]; + case 4: result = (result * x) + coeffs[3]; + result = (result * x) + coeffs[2]; + case 2: result = (result * x) + coeffs[1]; + result = (result * x) + coeffs[0]; + } + return result; +} diff --git a/math/rredf.c b/math/rredf.c new file mode 100644 index 0000000..69ab35a --- /dev/null +++ b/math/rredf.c @@ -0,0 +1,253 @@ +/* + * rredf.c - trigonometric range reduction function + * + * Copyright (C) 2009-2015, ARM Limited, All Rights Reserved + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * This file is part of the Optimized Routines project + */ + +/* + * This code is intended to be used as the second half of a range + * reducer whose first half is an inline function defined in + * rredf.h. Each trig function performs range reduction by invoking + * that, which handles the quickest and most common cases inline + * before handing off to this function for everything else. Thus a + * reasonable compromise is struck between speed and space. (I + * hope.) In particular, this approach avoids a function call + * overhead in the common case. + */ + +#include "math_private.h" + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/* + * Input values to this function: + * - x is the original user input value, unchanged by the + * first-tier reducer in the case where it hands over to us. + * - q is still the place where the caller expects us to leave the + * quadrant code. + * - k is the IEEE bit pattern of x (which it would seem a shame to + * recompute given that the first-tier reducer already went to + * the effort of extracting it from the VFP). FIXME: in softfp, + * on the other hand, it's unconscionably wasteful to replicate + * this value into a second register and we should change the + * prototype! + */ +float ARM__mathlib_rredf2(float x, int *q, unsigned k) +{ + /* + * First, weed out infinities and NaNs, and deal with them by + * returning a negative q. + */ + if ((k << 1) >= 0xFF000000) { + *q = -1; + return x; + } + /* + * We do general range reduction by multiplying by 2/pi, and + * retaining the bottom two bits of the integer part and an + * initial chunk of the fraction below that. The integer bits + * are directly output as *q; the fraction is then multiplied + * back up by pi/2 before returning it. + * + * To get this right, we don't have to multiply by the _whole_ + * of 2/pi right from the most significant bit downwards: + * instead we can discard any bit of 2/pi with a place value + * high enough that multiplying it by the LSB of x will yield a + * place value higher than 2. Thus we can bound the required + * work by a reasonably small constant regardless of the size of + * x (unlike, for instance, the IEEE remainder operation). + * + * At the other end, however, we must take more care: it isn't + * adequate just to acquire two integer bits and 24 fraction + * bits of (2/pi)x, because if a lot of those fraction bits are + * zero then we will suffer significance loss. So we must keep + * computing fraction bits as far down as 23 bits below the + * _highest set fraction bit_. + * + * The immediate question, therefore, is what the bound on this + * end of the job will be. In other words: what is the smallest + * difference between an integer multiple of pi/2 and a + * representable IEEE single precision number larger than the + * maximum size handled by rredf.h? + * + * The most difficult cases for each exponent can readily be + * found by Tim Peters's modular minimisation algorithm, and are + * tabulated in mathlib/tests/directed/rredf.tst. The single + * worst case is the IEEE single-precision number 0x6F79BE45, + * whose numerical value is in the region of 7.7*10^28; when + * reduced mod pi/2, it attains the value 0x30DDEEA9, or about + * 0.00000000161. The highest set bit of this value is the one + * with place value 2^-30; so its lowest is 2^-53. Hence, to be + * sure of having enough fraction bits to output at full single + * precision, we must be prepared to collect up to 53 bits of + * fraction in addition to our two bits of integer part. + * + * To begin with, this means we must store the value of 2/pi to + * a precision of 128+53 = 181 bits. That's six 32-bit words. + * (Hardly a chore, unlike the equivalent problem in double + * precision!) + */ + { + static const unsigned twooverpi[] = { + /* We start with a zero word, because that takes up less + * space than the array bounds checking and special-case + * handling that would have to occur in its absence. */ + 0, + /* 2/pi in hex is 0.a2f9836e... */ + 0xa2f9836e, 0x4e441529, 0xfc2757d1, + 0xf534ddc0, 0xdb629599, 0x3c439041, + /* Again, to avoid array bounds overrun, we store a spare + * word at the end. And it would be a shame to fill it + * with zeroes when we could use more bits of 2/pi... */ + 0xfe5163ab + }; + + /* + * Multiprecision multiplication of this nature is more + * readily done in integers than in VFP, since we can use + * UMULL (on CPUs that support it) to multiply 32 by 32 bits + * at a time whereas the VFP would only be able to do 12x12 + * without losing accuracy. + * + * So extract the mantissa of the input number as a 32-bit + * integer. + */ + unsigned mantissa = 0x80000000 | (k << 8); + + /* + * Now work out which part of our stored value of 2/pi we're + * supposed to be multiplying by. + * + * Let the IEEE exponent field of x be e. With its bias + * removed, (e-127) is the index of the set bit at the top + * of 'mantissa' (i.e. that set bit has real place value + * 2^(e-127)). So the lowest set bit in 'mantissa', 23 bits + * further down, must have place value 2^(e-150). + * + * We begin taking an interest in the value of 2/pi at the + * bit which multiplies by _that_ to give something with + * place value at most 2. In other words, the highest bit of + * 2/pi we're interested in is the one with place value + * 2/(2^(e-150)) = 2^(151-e). + * + * The bit at the top of the first (zero) word of the above + * array has place value 2^31. Hence, the bit we want to put + * at the top of the first word we extract from that array + * is the one at bit index n, where 31-n = 151-e and hence + * n=e-120. + */ + int topbitindex = ((k >> 23) & 0xFF) - 120; + int wordindex = topbitindex >> 5; + int shiftup = topbitindex & 31; + int shiftdown = 32 - shiftup; + unsigned word1, word2, word3; + if (shiftup) { + word1 = (twooverpi[wordindex] << shiftup) | (twooverpi[wordindex+1] >> shiftdown); + word2 = (twooverpi[wordindex+1] << shiftup) | (twooverpi[wordindex+2] >> shiftdown); + word3 = (twooverpi[wordindex+2] << shiftup) | (twooverpi[wordindex+3] >> shiftdown); + } else { + word1 = twooverpi[wordindex]; + word2 = twooverpi[wordindex+1]; + word3 = twooverpi[wordindex+2]; + } + + /* + * Do the multiplications, and add them together. + */ + unsigned long long mult1 = (unsigned long long)word1 * mantissa; + unsigned long long mult2 = (unsigned long long)word2 * mantissa; + unsigned long long mult3 = (unsigned long long)word3 * mantissa; + + unsigned /* bottom3 = (unsigned)mult3, */ top3 = (unsigned)(mult3 >> 32); + unsigned bottom2 = (unsigned)mult2, top2 = (unsigned)(mult2 >> 32); + unsigned bottom1 = (unsigned)mult1, top1 = (unsigned)(mult1 >> 32); + + unsigned out3, out2, out1, carry; + + out3 = top3 + bottom2; carry = (out3 < top3); + out2 = top2 + bottom1 + carry; carry = carry ? (out2 <= top2) : (out2 < top2); + out1 = top1 + carry; + + /* + * The two words we multiplied to get mult1 had their top + * bits at (respectively) place values 2^(151-e) and + * 2^(e-127). The value of those two bits multiplied + * together will have ended up in bit 62 (the + * topmost-but-one bit) of mult1, i.e. bit 30 of out1. + * Hence, that bit has place value 2^(151-e+e-127) = 2^24. + * So the integer value that we want to output as q, + * consisting of the bits with place values 2^1 and 2^0, + * must be 23 and 24 bits below that, i.e. in bits 7 and 6 + * of out1. + * + * Or, at least, it will be once we add 1/2, to round to the + * _nearest_ multiple of pi/2 rather than the next one down. + */ + *q = (out1 + (1<<5)) >> 6; + + /* + * Now we construct the output fraction, which is most + * simply done in the VFP. We just extract three consecutive + * bit strings from our chunk of binary data, convert them + * to integers, equip each with an appropriate FP exponent, + * add them together, and (don't forget) multiply back up by + * pi/2. That way we don't have to work out ourselves where + * the highest fraction bit ended up. + * + * Since our displacement from the nearest multiple of pi/2 + * can be positive or negative, the topmost of these three + * values must be arranged with its 2^-1 bit at the very top + * of the word, and then treated as a _signed_ integer. + */ + { + int i1 = (out1 << 26) | ((out2 >> 19) << 13); + unsigned i2 = out2 << 13; + unsigned i3 = out3; + float f1 = i1, f2 = i2 * (1.0f/524288.0f), f3 = i3 * (1.0f/524288.0f/524288.0f); + + /* + * Now f1+f2+f3 is a representation, potentially to + * twice double precision, of 2^32 times ((2/pi)*x minus + * some integer). So our remaining job is to multiply + * back down by (pi/2)*2^-32, and convert back to one + * single-precision output number. + */ + + /* Normalise to a prec-and-a-half representation... */ + float ftop = CLEARBOTTOMHALF(f1+f2+f3), fbot = f3-((ftop-f1)-f2); + + /* ... and multiply by a prec-and-a-half value of (pi/2)*2^-32. */ + float ret = (ftop * 0x1.92p-32F) + (ftop * 0x1.fb5444p-44F + fbot * 0x1.921fb6p-32F); + + /* Just before we return, take the input sign into account. */ + if (k & 0x80000000) { + *q = 0x10000000 - *q; + ret = -ret; + } + return ret; + } + } +} + +#ifdef __cplusplus +} /* end of extern "C" */ +#endif /* __cplusplus */ + +/* end of rredf.c */ diff --git a/math/rredf.h b/math/rredf.h new file mode 100644 index 0000000..1afa141 --- /dev/null +++ b/math/rredf.h @@ -0,0 +1,145 @@ +/* rredf.h - trigonometric range reduction function written new for RVCT 4.1 + * + * Copyright 2009 ARM Limited. All rights reserved. + * + */ + +/* + * This header file defines an inline function which all three of + * the single-precision trig functions (sinf, cosf, tanf) should use + * to perform range reduction. The inline function handles the + * quickest and most common cases inline, before handing off to an + * out-of-line function defined in rredf.c for everything else. Thus + * a reasonable compromise is struck between speed and space. (I + * hope.) In particular, this approach avoids a function call + * overhead in the common case. + */ + +#ifndef _included_rredf_h +#define _included_rredf_h + +#include "math_private.h" + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +extern float ARM__mathlib_rredf2(float x, int *q, unsigned k); + +/* + * Semantics of the function: + * - x is the single-precision input value provided by the user + * - the return value is in the range [-pi/4,pi/4], and is equal + * (within reasonable accuracy bounds) to x minus n*pi/2 for some + * integer n. (FIXME: perhaps some slippage on the output + * interval is acceptable, requiring more range from the + * following polynomial approximations but permitting more + * approximate rred decisions?) + * - *q is set to a positive value whose low two bits match those + * of n. Alternatively, it comes back as -1 indicating that the + * input value was trivial in some way (infinity, NaN, or so + * small that we can safely return sin(x)=tan(x)=x,cos(x)=1). + */ +static __inline float ARM__mathlib_rredf(float x, int *q) +{ + /* + * First, extract the bit pattern of x as an integer, so that we + * can repeatedly compare things to it without multiple + * overheads in retrieving comparison results from the VFP. + */ + unsigned k = fai(x); + + /* + * Deal immediately with the simplest possible case, in which x + * is already within the interval [-pi/4,pi/4]. This also + * identifies the subcase of ludicrously small x. + */ + if ((k << 1) < (0x3f490fdb << 1)) { + if ((k << 1) < (0x39800000 << 1)) + *q = -1; + else + *q = 0; + return x; + } + + /* + * The next plan is to multiply x by 2/pi and convert to an + * integer, which gives us n; then we subtract n*pi/2 from x to + * get our output value. + * + * By representing pi/2 in that final step by a prec-and-a-half + * approximation, we can arrange good accuracy for n strictly + * less than 2^13 (so that an FP representation of n has twelve + * zero bits at the bottom). So our threshold for this strategy + * is 2^13 * pi/2 - pi/4, otherwise known as 8191.75 * pi/2 or + * 4095.875*pi. (Or, for those perverse people interested in + * actual numbers rather than multiples of pi/2, about 12867.5.) + */ + if (__builtin_expect((k & 0x7fffffff) < 0x46490e49, 1)) { + float nf = 0.636619772367581343f * x; + /* + * The difference between that single-precision constant and + * the real 2/pi is about 2.568e-8. Hence the product nf has a + * potential error of 2.568e-8|x| even before rounding; since + * |x| < 4096 pi, that gives us an error bound of about + * 0.0003305. + * + * nf is then rounded to single precision, with a max error of + * 1/2 ULP, and since nf goes up to just under 8192, half a + * ULP could be as big as 2^-12 ~= 0.0002441. + * + * So by the time we convert nf to an integer, it could be off + * by that much, causing the wrong integer to be selected, and + * causing us to return a value a little bit outside the + * theoretical [-pi/4,+pi/4] output interval. + * + * How much outside? Well, we subtract nf*pi/2 from x, so the + * error bounds above have be be multiplied by pi/2. And if + * both of the above sources of error suffer their worst cases + * at once, then the very largest value we could return is + * obtained by adding that lot to the interval bound pi/4 to + * get + * + * pi/4 + ((2/pi - 0f_3f22f983)*4096*pi + 2^-12) * pi/2 + * + * which comes to 0f_3f494b02. (Compare 0f_3f490fdb = pi/4.) + * + * So callers of this range reducer should be prepared to + * handle numbers up to that large. + */ +#ifdef __TARGET_FPU_SOFTVFP + nf = _frnd(nf); +#else + if (k & 0x80000000) + nf = (nf - 8388608.0f) + 8388608.0f; + else + nf = (nf + 8388608.0f) - 8388608.0f; /* round to _nearest_ integer. FIXME: use some sort of frnd in softfp */ +#endif + *q = 3 & (int)nf; +#if 0 + /* + * FIXME: now I need a bunch of special cases to avoid + * having to do the full four-word reduction every time. + * Also, adjust the comment at the top of this section! + */ + if (__builtin_expect((k & 0x7fffffff) < 0x46490e49, 1)) + return ((x - nf * 0x1.92p+0F) - nf * 0x1.fb4p-12F) - nf * 0x1.4442d2p-24F; + else +#endif + return ((x - nf * 0x1.92p+0F) - nf * 0x1.fb4p-12F) - nf * 0x1.444p-24F - nf * 0x1.68c234p-39F; + } + + /* + * That's enough to do in-line; if we're still playing, hand off + * to the out-of-line main range reducer. + */ + return ARM__mathlib_rredf2(x, q, k); +} + +#ifdef __cplusplus +} /* end of extern "C" */ +#endif /* __cplusplus */ + +#endif /* included */ + +/* end of rredf.h */ diff --git a/math/s_cos.c b/math/s_cos.c new file mode 100644 index 0000000..5b500a7 --- /dev/null +++ b/math/s_cos.c @@ -0,0 +1,98 @@ +/* + * s_cos.c - double precision cosine function + * + * Copyright (C) 1998-2015, ARM Limited, All Rights Reserved + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * This file is part of the Optimized Routines project + */ + +/* @(#)s_cos.c 1.3 95/01/18 */ +/* + * ==================================================== + * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. + * + * Developed at SunSoft, a Sun Microsystems, Inc. business. + * Permission to use, copy, modify, and distribute this + * software is freely granted, provided that this notice + * is preserved. + * ==================================================== + */ + +/* cos(x) + * Return cosine function of x. + * + * kernel function: + * __kernel_sin ... sine function on [-pi/4,pi/4] + * __kernel_cos ... cosine function on [-pi/4,pi/4] + * __ieee754_rem_pio2 ... argument reduction routine + * + * Method. + * Let S,C and T denote the sin, cos and tan respectively on + * [-PI/4, +PI/4]. Reduce the argument x to y1+y2 = x-k*pi/2 + * in [-pi/4 , +pi/4], and let n = k mod 4. + * We have + * + * n sin(x) cos(x) tan(x) + * ---------------------------------------------------------- + * 0 S C T + * 1 C -S -1/T + * 2 -S -C T + * 3 -C S -1/T + * ---------------------------------------------------------- + * + * Special cases: + * Let trig be any of sin, cos, or tan. + * trig(+-INF) is NaN, with signals; + * trig(NaN) is that NaN; + * + * Accuracy: + * TRIG(x) returns trig(x) nearly rounded + */ + +#include "arm_math.h" +#include "math_private.h" + +__attribute__((const)) double +ARM__cos(double x) +{ + double y[2],z=0.0; + int n, ix; + + /* High word of x. */ + ix = __HI(x); + + /* |x| ~< pi/4 */ + ix &= 0x7fffffff; + if(ix <= 0x3fe921fb) return ARM__kernel_cos(x,z); + + /* cos(Inf) */ + else if (ix==0x7ff00000 && !__LO(x)) + return MATHERR_COS_INF(x); + /* cos(NaN) */ + else if (ix>=0x7ff00000) + return DOUBLE_INFNAN(x); + + /* argument reduction needed */ + else { + n = ARM__ieee754_rem_pio2(x,y); + switch(n&3) { + case 0: return ARM__kernel_cos(y[0],y[1]); + case 1: return -ARM__kernel_sin(y[0],y[1],1); + case 2: return -ARM__kernel_cos(y[0],y[1]); + default: return ARM__kernel_sin(y[0],y[1],1); + } + } +} diff --git a/math/s_cosf.c b/math/s_cosf.c new file mode 100644 index 0000000..ee47a7b --- /dev/null +++ b/math/s_cosf.c @@ -0,0 +1,25 @@ +/* + * s_cosf.c - single precision cosine function + * + * Copyright (C) 2009-2015, ARM Limited, All Rights Reserved + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * This file is part of the Optimized Routines project + */ + +#define COSINE +#include "s_sincosf.c" + +/* end of s_cosf.c */ diff --git a/math/s_sin.c b/math/s_sin.c new file mode 100644 index 0000000..1794470 --- /dev/null +++ b/math/s_sin.c @@ -0,0 +1,100 @@ +/* + * s_sin.c - double precision sine function + * + * Copyright (C) 1998-2015, ARM Limited, All Rights Reserved + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * This file is part of the Optimized Routines project + */ + +/* @(#)s_sin.c 1.3 95/01/18 */ +/* + * ==================================================== + * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. + * + * Developed at SunSoft, a Sun Microsystems, Inc. business. + * Permission to use, copy, modify, and distribute this + * software is freely granted, provided that this notice + * is preserved. + * ==================================================== + */ + +/* sin(x) + * Return sine function of x. + * + * kernel function: + * __kernel_sin ... sine function on [-pi/4,pi/4] + * __kernel_cos ... cose function on [-pi/4,pi/4] + * __ieee754_rem_pio2 ... argument reduction routine + * + * Method. + * Let S,C and T denote the sin, cos and tan respectively on + * [-PI/4, +PI/4]. Reduce the argument x to y1+y2 = x-k*pi/2 + * in [-pi/4 , +pi/4], and let n = k mod 4. + * We have + * + * n sin(x) cos(x) tan(x) + * ---------------------------------------------------------- + * 0 S C T + * 1 C -S -1/T + * 2 -S -C T + * 3 -C S -1/T + * ---------------------------------------------------------- + * + * Special cases: + * Let trig be any of sin, cos, or tan. + * trig(+-INF) is NaN, with signals; + * trig(NaN) is that NaN; + * + * Accuracy: + * TRIG(x) returns trig(x) nearly rounded + */ + + +#include "arm_math.h" +#include "math_private.h" +#include + +__attribute__((const)) double +ARM__sin(double x) +{ + double y[2],z=0.0; + int n, ix; + + /* High word of x. */ + ix = __HI(x); + + /* |x| ~< pi/4 */ + ix &= 0x7fffffff; + if(ix <= 0x3fe921fb) return ARM__kernel_sin(x,z,0); + + /* sin(Inf) */ + else if (ix==0x7ff00000 && !__LO(x)) + return MATHERR_SIN_INF(x); + /* sin(NaN) */ + else if (ix>=0x7ff00000) + return DOUBLE_INFNAN(x); + + /* argument reduction needed */ + else { + n = ARM__ieee754_rem_pio2(x,y); + switch(n&3) { + case 0: return ARM__kernel_sin(y[0],y[1],1); + case 1: return ARM__kernel_cos(y[0],y[1]); + case 2: return -ARM__kernel_sin(y[0],y[1],1); + default: return -ARM__kernel_cos(y[0],y[1]); + } + } +} diff --git a/math/s_sincosf.c b/math/s_sincosf.c new file mode 100644 index 0000000..8f6b9f4 --- /dev/null +++ b/math/s_sincosf.c @@ -0,0 +1,123 @@ +/* + * s_sincosf.c - single precision sine and cosine functions + * + * Copyright (C) 2009-2015, ARM Limited, All Rights Reserved + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * This file is part of the Optimized Routines project + */ + +/* + * Source: my own head, and Remez-generated polynomial approximations. + */ + +#include "rredf.h" +#include +#include "math_private.h" +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +#ifndef COSINE +#define FUNCNAME ARM__sinf +#define SOFTFP_FUNCNAME __softfp_sinf +#define DO_SIN (!(q & 1)) +#define NEGATE_SIN ((q & 2)) +#define NEGATE_COS ((q & 2)) +#define TRIVIAL_RESULT(x) FLOAT_CHECKDENORM(x) +#define ERR_INF MATHERR_SINF_INF +#else +#define FUNCNAME ARM__cosf +#define SOFTFP_FUNCNAME __softfp_cosf +#define DO_SIN (q & 1) +#define NEGATE_SIN (!(q & 2)) +#define NEGATE_COS ((q & 2)) +#define TRIVIAL_RESULT(x) 1.0f +#define ERR_INF MATHERR_COSF_INF +#endif + +float FUNCNAME(float x) +{ + int q; + + /* + * Range-reduce x to the range [-pi/4,pi/4]. + */ + { + /* + * I enclose the call to ARM__mathlib_rredf in braces so that + * the address-taken-ness of qq does not propagate + * throughout the rest of the function, for what that might + * be worth. + */ + int qq; + x = ARM__mathlib_rredf(x, &qq); + q = qq; + } + if (__builtin_expect(q < 0, 0)) { /* this signals tiny, inf, or NaN */ + unsigned k = fai(x) << 1; + if (k < 0xFF000000) /* tiny */ + return TRIVIAL_RESULT(x); + else if (k == 0xFF000000) /* inf */ + return ERR_INF(x); + else /* NaN */ + return FLOAT_INFNAN(x); + } + + /* + * Depending on the quadrant we were in, we may have to compute + * a sine-like function (f(0)=0) or a cosine-like one (f(0)=1), + * and we may have to negate it. + */ + if (DO_SIN) { + float x2 = x*x; + /* + * Coefficients generated by the command + +./auxiliary/remez.jl --variable=x2 --suffix=f -- '0' 'atan(BigFloat(1))^2' 2 0 'x==0 ? -1/BigFloat(6) : (x->(sin(x)-x)/x^3)(sqrt(x))' 'sqrt(x^3)' + + */ + x += x * (x2 * ( + -1.666665066929417292436220415142244613956015227491999719404711781344783392564922e-01f+x2*(8.331978663157089651408875887703995477889496917296385733254577121461421466427772e-03f+x2*(-1.949563623766929906511886482584265500187314705496861877317774185883215997931494e-04f)) + )); + if (NEGATE_SIN) + x = -x; + return x; + } else { + float x2 = x*x; + /* + * Coefficients generated by the command + +./auxiliary/remez.jl --variable=x2 --suffix=f -- '0' 'atan(BigFloat(1))^2' 2 0 'x==0 ? -1/BigFloat(6) : (x->(cos(x)-1)/x^2)(sqrt(x))' 'x' + + */ + x = 1.0f + x2*( + -4.999989478137016757327030935768632852012781143541026304540837816323349768666875e-01f+x2*(4.165629457842617238353362092016541041535652603456375154392942188742496860024377e-02f+x2*(-1.35978231111049428748566568960423202948250988565693107969571193763372093404347e-03f)) + ); + if (NEGATE_COS) + x = -x; + return x; + } +} + + +#ifdef __cplusplus +} /* end of extern "C" */ +#endif /* __cplusplus */ + +/* end of sincosf.c */ diff --git a/math/s_sinf.c b/math/s_sinf.c new file mode 100644 index 0000000..8c4cd1f --- /dev/null +++ b/math/s_sinf.c @@ -0,0 +1,25 @@ +/* + * s_sinf.c - single precision sine function + * + * Copyright (C) 2009-2015, ARM Limited, All Rights Reserved + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * This file is part of the Optimized Routines project + */ + +#define SINE +#include "s_sincosf.c" + +/* end of s_sinf.c */ diff --git a/math/s_tan.c b/math/s_tan.c new file mode 100644 index 0000000..284bcfd --- /dev/null +++ b/math/s_tan.c @@ -0,0 +1,136 @@ +/* + * s_tan.c - double precision tangent function + * + * Copyright (C) 1998-2015, ARM Limited, All Rights Reserved + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * This file is part of the Optimized Routines project + */ + +/* @(#)s_tan.c 1.3 95/01/18 */ +/* + * ==================================================== + * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. + * + * Developed at SunSoft, a Sun Microsystems, Inc. business. + * Permission to use, copy, modify, and distribute this + * software is freely granted, provided that this notice + * is preserved. + * ==================================================== + */ + +/* tan(x) + * Return tangent function of x. + * + * kernel function: + * __kernel_tan ... tangent function on [-pi/4,pi/4] + * __ieee754_rem_pio2 ... argument reduction routine + * + * Method. + * Let S,C and T denote the sin, cos and tan respectively on + * [-PI/4, +PI/4]. Reduce the argument x to y1+y2 = x-k*pi/2 + * in [-pi/4 , +pi/4], and let n = k mod 4. + * We have + * + * n sin(x) cos(x) tan(x) + * ---------------------------------------------------------- + * 0 S C T + * 1 C -S -1/T + * 2 -S -C T + * 3 -C S -1/T + * ---------------------------------------------------------- + * + * Special cases: + * Let trig be any of sin, cos, or tan. + * trig(+-INF) is NaN, with signals; + * trig(NaN) is that NaN; + * + * Accuracy: + * TRIG(x) returns trig(x) nearly rounded + */ + +/* + * David Seal once wondered about the possibility of tan() + * overflowing. It's perfectly possible in principle: if you + * provide an input number x _very_ close to (k+1/2)pi for some + * integer k, then cos(x) will be denormal and sin(x) will be + * effectively 1, and hence tan(x) will be just off the top of the + * representable number range. + * + * Fortunately, using Tim Peters' modmin algorithm[1], it's + * practically feasible to search _all_ the representable double + * precision numbers and find the ones which are closest to a + * multiple-and-a-half of pi. The algorithm only works on a range + * of _equally spaced_ numbers, so of course you need to do a + * separate search for each possible exponent, but it's a very fast + * algorithm, 2048 times negligible time still doesn't take very + * long, and this is still feasible. + * + * I (Simon Tatham) actually did this, and it turns out that the + * minimum negative value of cos(x) is 0x3c214ae7.2e6ba22e.f46 + * (that's expressed in double precision with a few extra bits), + * and the minimum positive value is 0xbc3f54f5.227a4e83.fbf. The + * `double' inputs which generate those values are + * 0x7506ac5b.262ca1ff and 0x416b951f.1572eba5 respectively. + * + * Hence, no representable double has a cosine smaller than about + * 2^-60, and accordingly tan() can never return anything bigger + * than about 2^60, which doesn't even overflow in _single_ + * precision let alone double. We are safe. + * + * [1] A Program for Testing IEEE Decimal-Binary Conversion, Vern + * Paxson and Prof W. Kahan, available from + * ftp://ftp.ee.lbl.gov/testbase-report.ps.Z + */ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +double ARM__tan(double x) +{ + double y[2],z=0.0; + int n, ix; + + /* High word of x. */ + ix = __HI(x); + + /* |x| ~< pi/4 */ + ix &= 0x7fffffff; + if(ix <= 0x3fe921fb) return ARM__kernel_tan(x,z,1); + + /* tan(Inf) */ + else if (ix==0x7ff00000 && !__LO(x)) + return MATHERR_TAN_INF(x); + /* tan(NaN) */ + else if (ix>=0x7ff00000) + return DOUBLE_INFNAN(x); + + /* argument reduction needed */ + else { + n = ARM__ieee754_rem_pio2(x,y); + return ARM__kernel_tan(y[0],y[1],1-((n&1)<<1)); /* 1 -- n even + -1 -- n odd */ + } +} + +#ifdef __cplusplus +} /* end of extern "C" */ +#endif /* __cplusplus */ + +/* end of tan.c */ diff --git a/math/s_tanf.c b/math/s_tanf.c new file mode 100644 index 0000000..cfc08da --- /dev/null +++ b/math/s_tanf.c @@ -0,0 +1,91 @@ +/* + * s_tanf.c - single precision tangent function + * + * Copyright (C) 2009-2015, ARM Limited, All Rights Reserved + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * This file is part of the Optimized Routines project + */ + +/* + * Source: my own head, and Remez-generated polynomial approximations. + */ + +#include +#include "math_private.h" +#include +#include +#include "rredf.h" + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +float ARM__tanf(float x) +{ + int q; + + /* + * Range-reduce x to the range [-pi/4,pi/4]. + */ + { + /* + * I enclose the call to __mathlib_rredf in braces so that + * the address-taken-ness of qq does not propagate + * throughout the rest of the function, for what that might + * be worth. + */ + int qq; + x = ARM__mathlib_rredf(x, &qq); + q = qq; + } + if (__builtin_expect(q < 0, 0)) { /* this signals tiny, inf, or NaN */ + unsigned k = fai(x) << 1; + if (k < 0xFF000000) /* tiny */ + return FLOAT_CHECKDENORM(x); + else if (k == 0xFF000000) /* inf */ + return MATHERR_TANF_INF(x); + else /* NaN */ + return FLOAT_INFNAN(x); + } + + /* + * We use a direct polynomial approximation for tan(x) on + * [-pi/4,pi/4], and then take the negative reciprocal of the + * result if we're in an interval surrounding an odd rather than + * even multiple of pi/2. + * + * Coefficients generated by the command + +./auxiliary/remez.jl --variable=x2 --suffix=f -- '0' '(pi/BigFloat(4))^2' 5 0 'x==0 ? 1/BigFloat(3) : (tan(sqrt(x))-sqrt(x))/sqrt(x^3)' 'sqrt(x^3)' + + */ + { + float x2 = x*x; + x += x * (x2 * ( + 3.333294809182307633621540045249152105330074691488121206914336806061620616979305e-01f+x2*(1.334274588580033216191949445078951865160600494428914956688702429547258497367525e-01f+x2*(5.315177279765676178198868818834880279286012428084733419724267810723468887753723e-02f+x2*(2.520300881849204519070372772571624013984546591252791443673871814078418474596388e-02f+x2*(2.051177187082974766686645514206648277055233230110624602600687812103764075834307e-03f+x2*(9.943421494628597182458186353995299429948224864648292162238582752158235742046109e-03f))))) + )); + if (q & 1) + x = -1.0f/x; + + return x; + } +} + +#ifdef __cplusplus +} /* end of extern "C" */ +#endif /* __cplusplus */ + +/* end of s_tanf.c */ -- cgit v1.2.3