diff options
Diffstat (limited to 'src/main/java/org/apache/commons/math3/distribution/KolmogorovSmirnovDistribution.java')
-rw-r--r-- | src/main/java/org/apache/commons/math3/distribution/KolmogorovSmirnovDistribution.java | 338 |
1 files changed, 338 insertions, 0 deletions
diff --git a/src/main/java/org/apache/commons/math3/distribution/KolmogorovSmirnovDistribution.java b/src/main/java/org/apache/commons/math3/distribution/KolmogorovSmirnovDistribution.java new file mode 100644 index 0000000..3ee007f --- /dev/null +++ b/src/main/java/org/apache/commons/math3/distribution/KolmogorovSmirnovDistribution.java @@ -0,0 +1,338 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.math3.distribution; + +import org.apache.commons.math3.exception.MathArithmeticException; +import org.apache.commons.math3.exception.NotStrictlyPositiveException; +import org.apache.commons.math3.exception.NumberIsTooLargeException; +import org.apache.commons.math3.exception.util.LocalizedFormats; +import org.apache.commons.math3.fraction.BigFraction; +import org.apache.commons.math3.fraction.BigFractionField; +import org.apache.commons.math3.fraction.FractionConversionException; +import org.apache.commons.math3.linear.Array2DRowFieldMatrix; +import org.apache.commons.math3.linear.Array2DRowRealMatrix; +import org.apache.commons.math3.linear.FieldMatrix; +import org.apache.commons.math3.linear.RealMatrix; +import org.apache.commons.math3.util.FastMath; + +import java.io.Serializable; +import java.math.BigDecimal; + +/** + * Implementation of the Kolmogorov-Smirnov distribution. + * + * <p>Treats the distribution of the two-sided {@code P(D_n < d)} where {@code D_n = sup_x |G(x) - + * G_n (x)|} for the theoretical cdf {@code G} and the empirical cdf {@code G_n}. + * + * <p>This implementation is based on [1] with certain quick decisions for extreme values given in + * [2]. + * + * <p>In short, when wanting to evaluate {@code P(D_n < d)}, the method in [1] is to write {@code d + * = (k - h) / n} for positive integer {@code k} and {@code 0 <= h < 1}. Then {@code P(D_n < d) = + * (n! / n^n) * t_kk}, where {@code t_kk} is the {@code (k, k)}'th entry in the special matrix + * {@code H^n}, i.e. {@code H} to the {@code n}'th power. + * + * <p>References: + * + * <ul> + * <li>[1] <a href="http://www.jstatsoft.org/v08/i18/">Evaluating Kolmogorov's Distribution</a> by + * George Marsaglia, Wai Wan Tsang, and Jingbo Wang + * <li>[2] <a href="http://www.jstatsoft.org/v39/i11/">Computing the Two-Sided Kolmogorov-Smirnov + * Distribution</a> by Richard Simard and Pierre L'Ecuyer + * </ul> + * + * Note that [1] contains an error in computing h, refer to <a + * href="https://issues.apache.org/jira/browse/MATH-437">MATH-437</a> for details. + * + * @see <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">Kolmogorov-Smirnov test + * (Wikipedia)</a> + * @deprecated to be removed in version 4.0 - use {@link + * org.apache.commons.math3.stat.inference.KolmogorovSmirnovTest} + */ +public class KolmogorovSmirnovDistribution implements Serializable { + + /** Serializable version identifier. */ + private static final long serialVersionUID = -4670676796862967187L; + + /** Number of observations. */ + private int n; + + /** + * @param n Number of observations + * @throws NotStrictlyPositiveException if {@code n <= 0} + */ + public KolmogorovSmirnovDistribution(int n) throws NotStrictlyPositiveException { + if (n <= 0) { + throw new NotStrictlyPositiveException( + LocalizedFormats.NOT_POSITIVE_NUMBER_OF_SAMPLES, n); + } + + this.n = n; + } + + /** + * Calculates {@code P(D_n < d)} using method described in [1] with quick decisions for extreme + * values given in [2] (see above). The result is not exact as with {@link + * KolmogorovSmirnovDistribution#cdfExact(double)} because calculations are based on {@code + * double} rather than {@link org.apache.commons.math3.fraction.BigFraction}. + * + * @param d statistic + * @return the two-sided probability of {@code P(D_n < d)} + * @throws MathArithmeticException if algorithm fails to convert {@code h} to a {@link + * org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as {@code (k - h) + * / m} for integer {@code k, m} and {@code 0 <= h < 1}. + */ + public double cdf(double d) throws MathArithmeticException { + return this.cdf(d, false); + } + + /** + * Calculates {@code P(D_n < d)} using method described in [1] with quick decisions for extreme + * values given in [2] (see above). The result is exact in the sense that BigFraction/BigReal is + * used everywhere at the expense of very slow execution time. Almost never choose this in real + * applications unless you are very sure; this is almost solely for verification purposes. + * Normally, you would choose {@link KolmogorovSmirnovDistribution#cdf(double)} + * + * @param d statistic + * @return the two-sided probability of {@code P(D_n < d)} + * @throws MathArithmeticException if algorithm fails to convert {@code h} to a {@link + * org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as {@code (k - h) + * / m} for integer {@code k, m} and {@code 0 <= h < 1}. + */ + public double cdfExact(double d) throws MathArithmeticException { + return this.cdf(d, true); + } + + /** + * Calculates {@code P(D_n < d)} using method described in [1] with quick decisions for extreme + * values given in [2] (see above). + * + * @param d statistic + * @param exact whether the probability should be calculated exact using {@link + * org.apache.commons.math3.fraction.BigFraction} everywhere at the expense of very slow + * execution time, or if {@code double} should be used convenient places to gain speed. + * Almost never choose {@code true} in real applications unless you are very sure; {@code + * true} is almost solely for verification purposes. + * @return the two-sided probability of {@code P(D_n < d)} + * @throws MathArithmeticException if algorithm fails to convert {@code h} to a {@link + * org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as {@code (k - h) + * / m} for integer {@code k, m} and {@code 0 <= h < 1}. + */ + public double cdf(double d, boolean exact) throws MathArithmeticException { + + final double ninv = 1 / ((double) n); + final double ninvhalf = 0.5 * ninv; + + if (d <= ninvhalf) { + + return 0; + + } else if (ninvhalf < d && d <= ninv) { + + double res = 1; + double f = 2 * d - ninv; + + // n! f^n = n*f * (n-1)*f * ... * 1*x + for (int i = 1; i <= n; ++i) { + res *= i * f; + } + + return res; + + } else if (1 - ninv <= d && d < 1) { + + return 1 - 2 * FastMath.pow(1 - d, n); + + } else if (1 <= d) { + + return 1; + } + + return exact ? exactK(d) : roundedK(d); + } + + /** + * Calculates the exact value of {@code P(D_n < d)} using method described in [1] and {@link + * org.apache.commons.math3.fraction.BigFraction} (see above). + * + * @param d statistic + * @return the two-sided probability of {@code P(D_n < d)} + * @throws MathArithmeticException if algorithm fails to convert {@code h} to a {@link + * org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as {@code (k - h) + * / m} for integer {@code k, m} and {@code 0 <= h < 1}. + */ + private double exactK(double d) throws MathArithmeticException { + + final int k = (int) FastMath.ceil(n * d); + + final FieldMatrix<BigFraction> H = this.createH(d); + final FieldMatrix<BigFraction> Hpower = H.power(n); + + BigFraction pFrac = Hpower.getEntry(k - 1, k - 1); + + for (int i = 1; i <= n; ++i) { + pFrac = pFrac.multiply(i).divide(n); + } + + /* + * BigFraction.doubleValue converts numerator to double and the + * denominator to double and divides afterwards. That gives NaN quite + * easy. This does not (scale is the number of digits): + */ + return pFrac.bigDecimalValue(20, BigDecimal.ROUND_HALF_UP).doubleValue(); + } + + /** + * Calculates {@code P(D_n < d)} using method described in [1] and doubles (see above). + * + * @param d statistic + * @return the two-sided probability of {@code P(D_n < d)} + * @throws MathArithmeticException if algorithm fails to convert {@code h} to a {@link + * org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as {@code (k - h) + * / m} for integer {@code k, m} and {@code 0 <= h < 1}. + */ + private double roundedK(double d) throws MathArithmeticException { + + final int k = (int) FastMath.ceil(n * d); + final FieldMatrix<BigFraction> HBigFraction = this.createH(d); + final int m = HBigFraction.getRowDimension(); + + /* + * Here the rounding part comes into play: use + * RealMatrix instead of FieldMatrix<BigFraction> + */ + final RealMatrix H = new Array2DRowRealMatrix(m, m); + + for (int i = 0; i < m; ++i) { + for (int j = 0; j < m; ++j) { + H.setEntry(i, j, HBigFraction.getEntry(i, j).doubleValue()); + } + } + + final RealMatrix Hpower = H.power(n); + + double pFrac = Hpower.getEntry(k - 1, k - 1); + + for (int i = 1; i <= n; ++i) { + pFrac *= (double) i / (double) n; + } + + return pFrac; + } + + /*** + * Creates {@code H} of size {@code m x m} as described in [1] (see above). + * + * @param d statistic + * @return H matrix + * @throws NumberIsTooLargeException if fractional part is greater than 1 + * @throws FractionConversionException if algorithm fails to convert + * {@code h} to a {@link org.apache.commons.math3.fraction.BigFraction} in + * expressing {@code d} as {@code (k - h) / m} for integer {@code k, m} and + * {@code 0 <= h < 1}. + */ + private FieldMatrix<BigFraction> createH(double d) + throws NumberIsTooLargeException, FractionConversionException { + + int k = (int) FastMath.ceil(n * d); + + int m = 2 * k - 1; + double hDouble = k - n * d; + + if (hDouble >= 1) { + throw new NumberIsTooLargeException(hDouble, 1.0, false); + } + + BigFraction h = null; + + try { + h = new BigFraction(hDouble, 1.0e-20, 10000); + } catch (FractionConversionException e1) { + try { + h = new BigFraction(hDouble, 1.0e-10, 10000); + } catch (FractionConversionException e2) { + h = new BigFraction(hDouble, 1.0e-5, 10000); + } + } + + final BigFraction[][] Hdata = new BigFraction[m][m]; + + /* + * Start by filling everything with either 0 or 1. + */ + for (int i = 0; i < m; ++i) { + for (int j = 0; j < m; ++j) { + if (i - j + 1 < 0) { + Hdata[i][j] = BigFraction.ZERO; + } else { + Hdata[i][j] = BigFraction.ONE; + } + } + } + + /* + * Setting up power-array to avoid calculating the same value twice: + * hPowers[0] = h^1 ... hPowers[m-1] = h^m + */ + final BigFraction[] hPowers = new BigFraction[m]; + hPowers[0] = h; + for (int i = 1; i < m; ++i) { + hPowers[i] = h.multiply(hPowers[i - 1]); + } + + /* + * First column and last row has special values (each other reversed). + */ + for (int i = 0; i < m; ++i) { + Hdata[i][0] = Hdata[i][0].subtract(hPowers[i]); + Hdata[m - 1][i] = Hdata[m - 1][i].subtract(hPowers[m - i - 1]); + } + + /* + * [1] states: "For 1/2 < h < 1 the bottom left element of the matrix + * should be (1 - 2*h^m + (2h - 1)^m )/m!" Since 0 <= h < 1, then if h > + * 1/2 is sufficient to check: + */ + if (h.compareTo(BigFraction.ONE_HALF) == 1) { + Hdata[m - 1][0] = Hdata[m - 1][0].add(h.multiply(2).subtract(1).pow(m)); + } + + /* + * Aside from the first column and last row, the (i, j)-th element is + * 1/(i - j + 1)! if i - j + 1 >= 0, else 0. 1's and 0's are already + * put, so only division with (i - j + 1)! is needed in the elements + * that have 1's. There is no need to calculate (i - j + 1)! and then + * divide - small steps avoid overflows. + * + * Note that i - j + 1 > 0 <=> i + 1 > j instead of j'ing all the way to + * m. Also note that it is started at g = 2 because dividing by 1 isn't + * really necessary. + */ + for (int i = 0; i < m; ++i) { + for (int j = 0; j < i + 1; ++j) { + if (i - j + 1 > 0) { + for (int g = 2; g <= i - j + 1; ++g) { + Hdata[i][j] = Hdata[i][j].divide(g); + } + } + } + } + + return new Array2DRowFieldMatrix<BigFraction>(BigFractionField.getInstance(), Hdata); + } +} |