diff options
Diffstat (limited to 'src/main/java/org/apache/commons/math3/stat/correlation')
7 files changed, 1548 insertions, 0 deletions
diff --git a/src/main/java/org/apache/commons/math3/stat/correlation/Covariance.java b/src/main/java/org/apache/commons/math3/stat/correlation/Covariance.java new file mode 100644 index 0000000..c462401 --- /dev/null +++ b/src/main/java/org/apache/commons/math3/stat/correlation/Covariance.java @@ -0,0 +1,295 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math3.stat.correlation; + +import org.apache.commons.math3.exception.MathIllegalArgumentException; +import org.apache.commons.math3.exception.NotStrictlyPositiveException; +import org.apache.commons.math3.exception.util.LocalizedFormats; +import org.apache.commons.math3.linear.RealMatrix; +import org.apache.commons.math3.linear.BlockRealMatrix; +import org.apache.commons.math3.stat.descriptive.moment.Mean; +import org.apache.commons.math3.stat.descriptive.moment.Variance; + +/** + * Computes covariances for pairs of arrays or columns of a matrix. + * + * <p>The constructors that take <code>RealMatrix</code> or + * <code>double[][]</code> arguments generate covariance matrices. The + * columns of the input matrices are assumed to represent variable values.</p> + * + * <p>The constructor argument <code>biasCorrected</code> determines whether or + * not computed covariances are bias-corrected.</p> + * + * <p>Unbiased covariances are given by the formula</p> + * <code>cov(X, Y) = Σ[(x<sub>i</sub> - E(X))(y<sub>i</sub> - E(Y))] / (n - 1)</code> + * where <code>E(X)</code> is the mean of <code>X</code> and <code>E(Y)</code> + * is the mean of the <code>Y</code> values. + * + * <p>Non-bias-corrected estimates use <code>n</code> in place of <code>n - 1</code> + * + * @since 2.0 + */ +public class Covariance { + + /** covariance matrix */ + private final RealMatrix covarianceMatrix; + + /** + * Create an empty covariance matrix. + */ + /** Number of observations (length of covariate vectors) */ + private final int n; + + /** + * Create a Covariance with no data + */ + public Covariance() { + super(); + covarianceMatrix = null; + n = 0; + } + + /** + * Create a Covariance matrix from a rectangular array + * whose columns represent covariates. + * + * <p>The <code>biasCorrected</code> parameter determines whether or not + * covariance estimates are bias-corrected.</p> + * + * <p>The input array must be rectangular with at least one column + * and two rows.</p> + * + * @param data rectangular array with columns representing covariates + * @param biasCorrected true means covariances are bias-corrected + * @throws MathIllegalArgumentException if the input data array is not + * rectangular with at least two rows and one column. + * @throws NotStrictlyPositiveException if the input data array is not + * rectangular with at least one row and one column. + */ + public Covariance(double[][] data, boolean biasCorrected) + throws MathIllegalArgumentException, NotStrictlyPositiveException { + this(new BlockRealMatrix(data), biasCorrected); + } + + /** + * Create a Covariance matrix from a rectangular array + * whose columns represent covariates. + * + * <p>The input array must be rectangular with at least one column + * and two rows</p> + * + * @param data rectangular array with columns representing covariates + * @throws MathIllegalArgumentException if the input data array is not + * rectangular with at least two rows and one column. + * @throws NotStrictlyPositiveException if the input data array is not + * rectangular with at least one row and one column. + */ + public Covariance(double[][] data) + throws MathIllegalArgumentException, NotStrictlyPositiveException { + this(data, true); + } + + /** + * Create a covariance matrix from a matrix whose columns + * represent covariates. + * + * <p>The <code>biasCorrected</code> parameter determines whether or not + * covariance estimates are bias-corrected.</p> + * + * <p>The matrix must have at least one column and two rows</p> + * + * @param matrix matrix with columns representing covariates + * @param biasCorrected true means covariances are bias-corrected + * @throws MathIllegalArgumentException if the input matrix does not have + * at least two rows and one column + */ + public Covariance(RealMatrix matrix, boolean biasCorrected) + throws MathIllegalArgumentException { + checkSufficientData(matrix); + n = matrix.getRowDimension(); + covarianceMatrix = computeCovarianceMatrix(matrix, biasCorrected); + } + + /** + * Create a covariance matrix from a matrix whose columns + * represent covariates. + * + * <p>The matrix must have at least one column and two rows</p> + * + * @param matrix matrix with columns representing covariates + * @throws MathIllegalArgumentException if the input matrix does not have + * at least two rows and one column + */ + public Covariance(RealMatrix matrix) throws MathIllegalArgumentException { + this(matrix, true); + } + + /** + * Returns the covariance matrix + * + * @return covariance matrix + */ + public RealMatrix getCovarianceMatrix() { + return covarianceMatrix; + } + + /** + * Returns the number of observations (length of covariate vectors) + * + * @return number of observations + */ + public int getN() { + return n; + } + + /** + * Compute a covariance matrix from a matrix whose columns represent + * covariates. + * @param matrix input matrix (must have at least one column and two rows) + * @param biasCorrected determines whether or not covariance estimates are bias-corrected + * @return covariance matrix + * @throws MathIllegalArgumentException if the matrix does not contain sufficient data + */ + protected RealMatrix computeCovarianceMatrix(RealMatrix matrix, boolean biasCorrected) + throws MathIllegalArgumentException { + int dimension = matrix.getColumnDimension(); + Variance variance = new Variance(biasCorrected); + RealMatrix outMatrix = new BlockRealMatrix(dimension, dimension); + for (int i = 0; i < dimension; i++) { + for (int j = 0; j < i; j++) { + double cov = covariance(matrix.getColumn(i), matrix.getColumn(j), biasCorrected); + outMatrix.setEntry(i, j, cov); + outMatrix.setEntry(j, i, cov); + } + outMatrix.setEntry(i, i, variance.evaluate(matrix.getColumn(i))); + } + return outMatrix; + } + + /** + * Create a covariance matrix from a matrix whose columns represent + * covariates. Covariances are computed using the bias-corrected formula. + * @param matrix input matrix (must have at least one column and two rows) + * @return covariance matrix + * @throws MathIllegalArgumentException if matrix does not contain sufficient data + * @see #Covariance + */ + protected RealMatrix computeCovarianceMatrix(RealMatrix matrix) + throws MathIllegalArgumentException { + return computeCovarianceMatrix(matrix, true); + } + + /** + * Compute a covariance matrix from a rectangular array whose columns represent + * covariates. + * @param data input array (must have at least one column and two rows) + * @param biasCorrected determines whether or not covariance estimates are bias-corrected + * @return covariance matrix + * @throws MathIllegalArgumentException if the data array does not contain sufficient + * data + * @throws NotStrictlyPositiveException if the input data array is not + * rectangular with at least one row and one column. + */ + protected RealMatrix computeCovarianceMatrix(double[][] data, boolean biasCorrected) + throws MathIllegalArgumentException, NotStrictlyPositiveException { + return computeCovarianceMatrix(new BlockRealMatrix(data), biasCorrected); + } + + /** + * Create a covariance matrix from a rectangular array whose columns represent + * covariates. Covariances are computed using the bias-corrected formula. + * @param data input array (must have at least one column and two rows) + * @return covariance matrix + * @throws MathIllegalArgumentException if the data array does not contain sufficient data + * @throws NotStrictlyPositiveException if the input data array is not + * rectangular with at least one row and one column. + * @see #Covariance + */ + protected RealMatrix computeCovarianceMatrix(double[][] data) + throws MathIllegalArgumentException, NotStrictlyPositiveException { + return computeCovarianceMatrix(data, true); + } + + /** + * Computes the covariance between the two arrays. + * + * <p>Array lengths must match and the common length must be at least 2.</p> + * + * @param xArray first data array + * @param yArray second data array + * @param biasCorrected if true, returned value will be bias-corrected + * @return returns the covariance for the two arrays + * @throws MathIllegalArgumentException if the arrays lengths do not match or + * there is insufficient data + */ + public double covariance(final double[] xArray, final double[] yArray, boolean biasCorrected) + throws MathIllegalArgumentException { + Mean mean = new Mean(); + double result = 0d; + int length = xArray.length; + if (length != yArray.length) { + throw new MathIllegalArgumentException( + LocalizedFormats.DIMENSIONS_MISMATCH_SIMPLE, length, yArray.length); + } else if (length < 2) { + throw new MathIllegalArgumentException( + LocalizedFormats.INSUFFICIENT_OBSERVED_POINTS_IN_SAMPLE, length, 2); + } else { + double xMean = mean.evaluate(xArray); + double yMean = mean.evaluate(yArray); + for (int i = 0; i < length; i++) { + double xDev = xArray[i] - xMean; + double yDev = yArray[i] - yMean; + result += (xDev * yDev - result) / (i + 1); + } + } + return biasCorrected ? result * ((double) length / (double)(length - 1)) : result; + } + + /** + * Computes the covariance between the two arrays, using the bias-corrected + * formula. + * + * <p>Array lengths must match and the common length must be at least 2.</p> + * + * @param xArray first data array + * @param yArray second data array + * @return returns the covariance for the two arrays + * @throws MathIllegalArgumentException if the arrays lengths do not match or + * there is insufficient data + */ + public double covariance(final double[] xArray, final double[] yArray) + throws MathIllegalArgumentException { + return covariance(xArray, yArray, true); + } + + /** + * Throws MathIllegalArgumentException if the matrix does not have at least + * one column and two rows. + * @param matrix matrix to check + * @throws MathIllegalArgumentException if the matrix does not contain sufficient data + * to compute covariance + */ + private void checkSufficientData(final RealMatrix matrix) throws MathIllegalArgumentException { + int nRows = matrix.getRowDimension(); + int nCols = matrix.getColumnDimension(); + if (nRows < 2 || nCols < 1) { + throw new MathIllegalArgumentException( + LocalizedFormats.INSUFFICIENT_ROWS_AND_COLUMNS, + nRows, nCols); + } + } +} diff --git a/src/main/java/org/apache/commons/math3/stat/correlation/KendallsCorrelation.java b/src/main/java/org/apache/commons/math3/stat/correlation/KendallsCorrelation.java new file mode 100644 index 0000000..d38cf71 --- /dev/null +++ b/src/main/java/org/apache/commons/math3/stat/correlation/KendallsCorrelation.java @@ -0,0 +1,272 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math3.stat.correlation; + +import org.apache.commons.math3.exception.DimensionMismatchException; +import org.apache.commons.math3.linear.BlockRealMatrix; +import org.apache.commons.math3.linear.MatrixUtils; +import org.apache.commons.math3.linear.RealMatrix; +import org.apache.commons.math3.util.FastMath; +import org.apache.commons.math3.util.Pair; + +import java.util.Arrays; +import java.util.Comparator; + +/** + * Implementation of Kendall's Tau-b rank correlation</a>. + * <p> + * A pair of observations (x<sub>1</sub>, y<sub>1</sub>) and + * (x<sub>2</sub>, y<sub>2</sub>) are considered <i>concordant</i> if + * x<sub>1</sub> < x<sub>2</sub> and y<sub>1</sub> < y<sub>2</sub> + * or x<sub>2</sub> < x<sub>1</sub> and y<sub>2</sub> < y<sub>1</sub>. + * The pair is <i>discordant</i> if x<sub>1</sub> < x<sub>2</sub> and + * y<sub>2</sub> < y<sub>1</sub> or x<sub>2</sub> < x<sub>1</sub> and + * y<sub>1</sub> < y<sub>2</sub>. If either x<sub>1</sub> = x<sub>2</sub> + * or y<sub>1</sub> = y<sub>2</sub>, the pair is neither concordant nor + * discordant. + * <p> + * Kendall's Tau-b is defined as: + * <pre> + * tau<sub>b</sub> = (n<sub>c</sub> - n<sub>d</sub>) / sqrt((n<sub>0</sub> - n<sub>1</sub>) * (n<sub>0</sub> - n<sub>2</sub>)) + * </pre> + * <p> + * where: + * <ul> + * <li>n<sub>0</sub> = n * (n - 1) / 2</li> + * <li>n<sub>c</sub> = Number of concordant pairs</li> + * <li>n<sub>d</sub> = Number of discordant pairs</li> + * <li>n<sub>1</sub> = sum of t<sub>i</sub> * (t<sub>i</sub> - 1) / 2 for all i</li> + * <li>n<sub>2</sub> = sum of u<sub>j</sub> * (u<sub>j</sub> - 1) / 2 for all j</li> + * <li>t<sub>i</sub> = Number of tied values in the i<sup>th</sup> group of ties in x</li> + * <li>u<sub>j</sub> = Number of tied values in the j<sup>th</sup> group of ties in y</li> + * </ul> + * <p> + * This implementation uses the O(n log n) algorithm described in + * William R. Knight's 1966 paper "A Computer Method for Calculating + * Kendall's Tau with Ungrouped Data" in the Journal of the American + * Statistical Association. + * + * @see <a href="http://en.wikipedia.org/wiki/Kendall_tau_rank_correlation_coefficient"> + * Kendall tau rank correlation coefficient (Wikipedia)</a> + * @see <a href="http://www.jstor.org/stable/2282833">A Computer + * Method for Calculating Kendall's Tau with Ungrouped Data</a> + * + * @since 3.3 + */ +public class KendallsCorrelation { + + /** correlation matrix */ + private final RealMatrix correlationMatrix; + + /** + * Create a KendallsCorrelation instance without data. + */ + public KendallsCorrelation() { + correlationMatrix = null; + } + + /** + * Create a KendallsCorrelation from a rectangular array + * whose columns represent values of variables to be correlated. + * + * @param data rectangular array with columns representing variables + * @throws IllegalArgumentException if the input data array is not + * rectangular with at least two rows and two columns. + */ + public KendallsCorrelation(double[][] data) { + this(MatrixUtils.createRealMatrix(data)); + } + + /** + * Create a KendallsCorrelation from a RealMatrix whose columns + * represent variables to be correlated. + * + * @param matrix matrix with columns representing variables to correlate + */ + public KendallsCorrelation(RealMatrix matrix) { + correlationMatrix = computeCorrelationMatrix(matrix); + } + + /** + * Returns the correlation matrix. + * + * @return correlation matrix + */ + public RealMatrix getCorrelationMatrix() { + return correlationMatrix; + } + + /** + * Computes the Kendall's Tau rank correlation matrix for the columns of + * the input matrix. + * + * @param matrix matrix with columns representing variables to correlate + * @return correlation matrix + */ + public RealMatrix computeCorrelationMatrix(final RealMatrix matrix) { + int nVars = matrix.getColumnDimension(); + RealMatrix outMatrix = new BlockRealMatrix(nVars, nVars); + for (int i = 0; i < nVars; i++) { + for (int j = 0; j < i; j++) { + double corr = correlation(matrix.getColumn(i), matrix.getColumn(j)); + outMatrix.setEntry(i, j, corr); + outMatrix.setEntry(j, i, corr); + } + outMatrix.setEntry(i, i, 1d); + } + return outMatrix; + } + + /** + * Computes the Kendall's Tau rank correlation matrix for the columns of + * the input rectangular array. The columns of the array represent values + * of variables to be correlated. + * + * @param matrix matrix with columns representing variables to correlate + * @return correlation matrix + */ + public RealMatrix computeCorrelationMatrix(final double[][] matrix) { + return computeCorrelationMatrix(new BlockRealMatrix(matrix)); + } + + /** + * Computes the Kendall's Tau rank correlation coefficient between the two arrays. + * + * @param xArray first data array + * @param yArray second data array + * @return Returns Kendall's Tau rank correlation coefficient for the two arrays + * @throws DimensionMismatchException if the arrays lengths do not match + */ + public double correlation(final double[] xArray, final double[] yArray) + throws DimensionMismatchException { + + if (xArray.length != yArray.length) { + throw new DimensionMismatchException(xArray.length, yArray.length); + } + + final int n = xArray.length; + final long numPairs = sum(n - 1); + + @SuppressWarnings("unchecked") + Pair<Double, Double>[] pairs = new Pair[n]; + for (int i = 0; i < n; i++) { + pairs[i] = new Pair<Double, Double>(xArray[i], yArray[i]); + } + + Arrays.sort(pairs, new Comparator<Pair<Double, Double>>() { + /** {@inheritDoc} */ + public int compare(Pair<Double, Double> pair1, Pair<Double, Double> pair2) { + int compareFirst = pair1.getFirst().compareTo(pair2.getFirst()); + return compareFirst != 0 ? compareFirst : pair1.getSecond().compareTo(pair2.getSecond()); + } + }); + + long tiedXPairs = 0; + long tiedXYPairs = 0; + long consecutiveXTies = 1; + long consecutiveXYTies = 1; + Pair<Double, Double> prev = pairs[0]; + for (int i = 1; i < n; i++) { + final Pair<Double, Double> curr = pairs[i]; + if (curr.getFirst().equals(prev.getFirst())) { + consecutiveXTies++; + if (curr.getSecond().equals(prev.getSecond())) { + consecutiveXYTies++; + } else { + tiedXYPairs += sum(consecutiveXYTies - 1); + consecutiveXYTies = 1; + } + } else { + tiedXPairs += sum(consecutiveXTies - 1); + consecutiveXTies = 1; + tiedXYPairs += sum(consecutiveXYTies - 1); + consecutiveXYTies = 1; + } + prev = curr; + } + tiedXPairs += sum(consecutiveXTies - 1); + tiedXYPairs += sum(consecutiveXYTies - 1); + + long swaps = 0; + @SuppressWarnings("unchecked") + Pair<Double, Double>[] pairsDestination = new Pair[n]; + for (int segmentSize = 1; segmentSize < n; segmentSize <<= 1) { + for (int offset = 0; offset < n; offset += 2 * segmentSize) { + int i = offset; + final int iEnd = FastMath.min(i + segmentSize, n); + int j = iEnd; + final int jEnd = FastMath.min(j + segmentSize, n); + + int copyLocation = offset; + while (i < iEnd || j < jEnd) { + if (i < iEnd) { + if (j < jEnd) { + if (pairs[i].getSecond().compareTo(pairs[j].getSecond()) <= 0) { + pairsDestination[copyLocation] = pairs[i]; + i++; + } else { + pairsDestination[copyLocation] = pairs[j]; + j++; + swaps += iEnd - i; + } + } else { + pairsDestination[copyLocation] = pairs[i]; + i++; + } + } else { + pairsDestination[copyLocation] = pairs[j]; + j++; + } + copyLocation++; + } + } + final Pair<Double, Double>[] pairsTemp = pairs; + pairs = pairsDestination; + pairsDestination = pairsTemp; + } + + long tiedYPairs = 0; + long consecutiveYTies = 1; + prev = pairs[0]; + for (int i = 1; i < n; i++) { + final Pair<Double, Double> curr = pairs[i]; + if (curr.getSecond().equals(prev.getSecond())) { + consecutiveYTies++; + } else { + tiedYPairs += sum(consecutiveYTies - 1); + consecutiveYTies = 1; + } + prev = curr; + } + tiedYPairs += sum(consecutiveYTies - 1); + + final long concordantMinusDiscordant = numPairs - tiedXPairs - tiedYPairs + tiedXYPairs - 2 * swaps; + final double nonTiedPairsMultiplied = (numPairs - tiedXPairs) * (double) (numPairs - tiedYPairs); + return concordantMinusDiscordant / FastMath.sqrt(nonTiedPairsMultiplied); + } + + /** + * Returns the sum of the number from 1 .. n according to Gauss' summation formula: + * \[ \sum\limits_{k=1}^n k = \frac{n(n + 1)}{2} \] + * + * @param n the summation end + * @return the sum of the number from 1 to n + */ + private static long sum(long n) { + return n * (n + 1) / 2l; + } +} diff --git a/src/main/java/org/apache/commons/math3/stat/correlation/PearsonsCorrelation.java b/src/main/java/org/apache/commons/math3/stat/correlation/PearsonsCorrelation.java new file mode 100644 index 0000000..53d17ab --- /dev/null +++ b/src/main/java/org/apache/commons/math3/stat/correlation/PearsonsCorrelation.java @@ -0,0 +1,330 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math3.stat.correlation; + +import org.apache.commons.math3.distribution.TDistribution; +import org.apache.commons.math3.exception.util.LocalizedFormats; +import org.apache.commons.math3.exception.MathIllegalArgumentException; +import org.apache.commons.math3.exception.NullArgumentException; +import org.apache.commons.math3.exception.DimensionMismatchException; +import org.apache.commons.math3.linear.RealMatrix; +import org.apache.commons.math3.linear.BlockRealMatrix; +import org.apache.commons.math3.stat.regression.SimpleRegression; +import org.apache.commons.math3.util.FastMath; + +/** + * Computes Pearson's product-moment correlation coefficients for pairs of arrays + * or columns of a matrix. + * + * <p>The constructors that take <code>RealMatrix</code> or + * <code>double[][]</code> arguments generate correlation matrices. The + * columns of the input matrices are assumed to represent variable values. + * Correlations are given by the formula</p> + * + * <p><code>cor(X, Y) = Σ[(x<sub>i</sub> - E(X))(y<sub>i</sub> - E(Y))] / [(n - 1)s(X)s(Y)]</code> + * where <code>E(X)</code> is the mean of <code>X</code>, <code>E(Y)</code> + * is the mean of the <code>Y</code> values and s(X), s(Y) are standard deviations.</p> + * + * <p>To compute the correlation coefficient for a single pair of arrays, use {@link #PearsonsCorrelation()} + * to construct an instance with no data and then {@link #correlation(double[], double[])}. + * Correlation matrices can also be computed directly from an instance with no data using + * {@link #computeCorrelationMatrix(double[][])}. In order to use {@link #getCorrelationMatrix()}, + * {@link #getCorrelationPValues()}, or {@link #getCorrelationStandardErrors()}; however, one of the + * constructors supplying data or a covariance matrix must be used to create the instance.</p> + * + * @since 2.0 + */ +public class PearsonsCorrelation { + + /** correlation matrix */ + private final RealMatrix correlationMatrix; + + /** number of observations */ + private final int nObs; + + /** + * Create a PearsonsCorrelation instance without data. + */ + public PearsonsCorrelation() { + super(); + correlationMatrix = null; + nObs = 0; + } + + /** + * Create a PearsonsCorrelation from a rectangular array + * whose columns represent values of variables to be correlated. + * + * Throws MathIllegalArgumentException if the input array does not have at least + * two columns and two rows. Pairwise correlations are set to NaN if one + * of the correlates has zero variance. + * + * @param data rectangular array with columns representing variables + * @throws MathIllegalArgumentException if the input data array is not + * rectangular with at least two rows and two columns. + * @see #correlation(double[], double[]) + */ + public PearsonsCorrelation(double[][] data) { + this(new BlockRealMatrix(data)); + } + + /** + * Create a PearsonsCorrelation from a RealMatrix whose columns + * represent variables to be correlated. + * + * Throws MathIllegalArgumentException if the matrix does not have at least + * two columns and two rows. Pairwise correlations are set to NaN if one + * of the correlates has zero variance. + * + * @param matrix matrix with columns representing variables to correlate + * @throws MathIllegalArgumentException if the matrix does not contain sufficient data + * @see #correlation(double[], double[]) + */ + public PearsonsCorrelation(RealMatrix matrix) { + nObs = matrix.getRowDimension(); + correlationMatrix = computeCorrelationMatrix(matrix); + } + + /** + * Create a PearsonsCorrelation from a {@link Covariance}. The correlation + * matrix is computed by scaling the Covariance's covariance matrix. + * The Covariance instance must have been created from a data matrix with + * columns representing variable values. + * + * @param covariance Covariance instance + */ + public PearsonsCorrelation(Covariance covariance) { + RealMatrix covarianceMatrix = covariance.getCovarianceMatrix(); + if (covarianceMatrix == null) { + throw new NullArgumentException(LocalizedFormats.COVARIANCE_MATRIX); + } + nObs = covariance.getN(); + correlationMatrix = covarianceToCorrelation(covarianceMatrix); + } + + /** + * Create a PearsonsCorrelation from a covariance matrix. The correlation + * matrix is computed by scaling the covariance matrix. + * + * @param covarianceMatrix covariance matrix + * @param numberOfObservations the number of observations in the dataset used to compute + * the covariance matrix + */ + public PearsonsCorrelation(RealMatrix covarianceMatrix, int numberOfObservations) { + nObs = numberOfObservations; + correlationMatrix = covarianceToCorrelation(covarianceMatrix); + } + + /** + * Returns the correlation matrix. + * + * <p>This method will return null if the argumentless constructor was used + * to create this instance, even if {@link #computeCorrelationMatrix(double[][])} + * has been called before it is activated.</p> + * + * @return correlation matrix + */ + public RealMatrix getCorrelationMatrix() { + return correlationMatrix; + } + + /** + * Returns a matrix of standard errors associated with the estimates + * in the correlation matrix.<br/> + * <code>getCorrelationStandardErrors().getEntry(i,j)</code> is the standard + * error associated with <code>getCorrelationMatrix.getEntry(i,j)</code> + * + * <p>The formula used to compute the standard error is <br/> + * <code>SE<sub>r</sub> = ((1 - r<sup>2</sup>) / (n - 2))<sup>1/2</sup></code> + * where <code>r</code> is the estimated correlation coefficient and + * <code>n</code> is the number of observations in the source dataset.</p> + * + * <p>To use this method, one of the constructors that supply an input + * matrix must have been used to create this instance.</p> + * + * @return matrix of correlation standard errors + * @throws NullPointerException if this instance was created with no data + */ + public RealMatrix getCorrelationStandardErrors() { + int nVars = correlationMatrix.getColumnDimension(); + double[][] out = new double[nVars][nVars]; + for (int i = 0; i < nVars; i++) { + for (int j = 0; j < nVars; j++) { + double r = correlationMatrix.getEntry(i, j); + out[i][j] = FastMath.sqrt((1 - r * r) /(nObs - 2)); + } + } + return new BlockRealMatrix(out); + } + + /** + * Returns a matrix of p-values associated with the (two-sided) null + * hypothesis that the corresponding correlation coefficient is zero. + * + * <p><code>getCorrelationPValues().getEntry(i,j)</code> is the probability + * that a random variable distributed as <code>t<sub>n-2</sub></code> takes + * a value with absolute value greater than or equal to <br> + * <code>|r|((n - 2) / (1 - r<sup>2</sup>))<sup>1/2</sup></code></p> + * + * <p>The values in the matrix are sometimes referred to as the + * <i>significance</i> of the corresponding correlation coefficients.</p> + * + * <p>To use this method, one of the constructors that supply an input + * matrix must have been used to create this instance.</p> + * + * @return matrix of p-values + * @throws org.apache.commons.math3.exception.MaxCountExceededException + * if an error occurs estimating probabilities + * @throws NullPointerException if this instance was created with no data + */ + public RealMatrix getCorrelationPValues() { + TDistribution tDistribution = new TDistribution(nObs - 2); + int nVars = correlationMatrix.getColumnDimension(); + double[][] out = new double[nVars][nVars]; + for (int i = 0; i < nVars; i++) { + for (int j = 0; j < nVars; j++) { + if (i == j) { + out[i][j] = 0d; + } else { + double r = correlationMatrix.getEntry(i, j); + double t = FastMath.abs(r * FastMath.sqrt((nObs - 2)/(1 - r * r))); + out[i][j] = 2 * tDistribution.cumulativeProbability(-t); + } + } + } + return new BlockRealMatrix(out); + } + + + /** + * Computes the correlation matrix for the columns of the + * input matrix, using {@link #correlation(double[], double[])}. + * + * Throws MathIllegalArgumentException if the matrix does not have at least + * two columns and two rows. Pairwise correlations are set to NaN if one + * of the correlates has zero variance. + * + * @param matrix matrix with columns representing variables to correlate + * @return correlation matrix + * @throws MathIllegalArgumentException if the matrix does not contain sufficient data + * @see #correlation(double[], double[]) + */ + public RealMatrix computeCorrelationMatrix(RealMatrix matrix) { + checkSufficientData(matrix); + int nVars = matrix.getColumnDimension(); + RealMatrix outMatrix = new BlockRealMatrix(nVars, nVars); + for (int i = 0; i < nVars; i++) { + for (int j = 0; j < i; j++) { + double corr = correlation(matrix.getColumn(i), matrix.getColumn(j)); + outMatrix.setEntry(i, j, corr); + outMatrix.setEntry(j, i, corr); + } + outMatrix.setEntry(i, i, 1d); + } + return outMatrix; + } + + /** + * Computes the correlation matrix for the columns of the + * input rectangular array. The columns of the array represent values + * of variables to be correlated. + * + * Throws MathIllegalArgumentException if the matrix does not have at least + * two columns and two rows or if the array is not rectangular. Pairwise + * correlations are set to NaN if one of the correlates has zero variance. + * + * @param data matrix with columns representing variables to correlate + * @return correlation matrix + * @throws MathIllegalArgumentException if the array does not contain sufficient data + * @see #correlation(double[], double[]) + */ + public RealMatrix computeCorrelationMatrix(double[][] data) { + return computeCorrelationMatrix(new BlockRealMatrix(data)); + } + + /** + * Computes the Pearson's product-moment correlation coefficient between two arrays. + * + * <p>Throws MathIllegalArgumentException if the arrays do not have the same length + * or their common length is less than 2. Returns {@code NaN} if either of the arrays + * has zero variance (i.e., if one of the arrays does not contain at least two distinct + * values).</p> + * + * @param xArray first data array + * @param yArray second data array + * @return Returns Pearson's correlation coefficient for the two arrays + * @throws DimensionMismatchException if the arrays lengths do not match + * @throws MathIllegalArgumentException if there is insufficient data + */ + public double correlation(final double[] xArray, final double[] yArray) { + SimpleRegression regression = new SimpleRegression(); + if (xArray.length != yArray.length) { + throw new DimensionMismatchException(xArray.length, yArray.length); + } else if (xArray.length < 2) { + throw new MathIllegalArgumentException(LocalizedFormats.INSUFFICIENT_DIMENSION, + xArray.length, 2); + } else { + for(int i=0; i<xArray.length; i++) { + regression.addData(xArray[i], yArray[i]); + } + return regression.getR(); + } + } + + /** + * Derives a correlation matrix from a covariance matrix. + * + * <p>Uses the formula <br/> + * <code>r(X,Y) = cov(X,Y)/s(X)s(Y)</code> where + * <code>r(·,·)</code> is the correlation coefficient and + * <code>s(·)</code> means standard deviation.</p> + * + * @param covarianceMatrix the covariance matrix + * @return correlation matrix + */ + public RealMatrix covarianceToCorrelation(RealMatrix covarianceMatrix) { + int nVars = covarianceMatrix.getColumnDimension(); + RealMatrix outMatrix = new BlockRealMatrix(nVars, nVars); + for (int i = 0; i < nVars; i++) { + double sigma = FastMath.sqrt(covarianceMatrix.getEntry(i, i)); + outMatrix.setEntry(i, i, 1d); + for (int j = 0; j < i; j++) { + double entry = covarianceMatrix.getEntry(i, j) / + (sigma * FastMath.sqrt(covarianceMatrix.getEntry(j, j))); + outMatrix.setEntry(i, j, entry); + outMatrix.setEntry(j, i, entry); + } + } + return outMatrix; + } + + /** + * Throws MathIllegalArgumentException if the matrix does not have at least + * two columns and two rows. + * + * @param matrix matrix to check for sufficiency + * @throws MathIllegalArgumentException if there is insufficient data + */ + private void checkSufficientData(final RealMatrix matrix) { + int nRows = matrix.getRowDimension(); + int nCols = matrix.getColumnDimension(); + if (nRows < 2 || nCols < 2) { + throw new MathIllegalArgumentException(LocalizedFormats.INSUFFICIENT_ROWS_AND_COLUMNS, + nRows, nCols); + } + } +} diff --git a/src/main/java/org/apache/commons/math3/stat/correlation/SpearmansCorrelation.java b/src/main/java/org/apache/commons/math3/stat/correlation/SpearmansCorrelation.java new file mode 100644 index 0000000..80c0a54 --- /dev/null +++ b/src/main/java/org/apache/commons/math3/stat/correlation/SpearmansCorrelation.java @@ -0,0 +1,262 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.math3.stat.correlation; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.commons.math3.exception.DimensionMismatchException; +import org.apache.commons.math3.exception.MathIllegalArgumentException; +import org.apache.commons.math3.exception.util.LocalizedFormats; +import org.apache.commons.math3.linear.BlockRealMatrix; +import org.apache.commons.math3.linear.RealMatrix; +import org.apache.commons.math3.stat.ranking.NaNStrategy; +import org.apache.commons.math3.stat.ranking.NaturalRanking; +import org.apache.commons.math3.stat.ranking.RankingAlgorithm; + +/** + * Spearman's rank correlation. This implementation performs a rank + * transformation on the input data and then computes {@link PearsonsCorrelation} + * on the ranked data. + * <p> + * By default, ranks are computed using {@link NaturalRanking} with default + * strategies for handling NaNs and ties in the data (NaNs maximal, ties averaged). + * The ranking algorithm can be set using a constructor argument. + * + * @since 2.0 + */ +public class SpearmansCorrelation { + + /** Input data */ + private final RealMatrix data; + + /** Ranking algorithm */ + private final RankingAlgorithm rankingAlgorithm; + + /** Rank correlation */ + private final PearsonsCorrelation rankCorrelation; + + /** + * Create a SpearmansCorrelation without data. + */ + public SpearmansCorrelation() { + this(new NaturalRanking()); + } + + /** + * Create a SpearmansCorrelation with the given ranking algorithm. + * <p> + * From version 4.0 onwards this constructor will throw an exception + * if the provided {@link NaturalRanking} uses a {@link NaNStrategy#REMOVED} strategy. + * + * @param rankingAlgorithm ranking algorithm + * @since 3.1 + */ + public SpearmansCorrelation(final RankingAlgorithm rankingAlgorithm) { + data = null; + this.rankingAlgorithm = rankingAlgorithm; + rankCorrelation = null; + } + + /** + * Create a SpearmansCorrelation from the given data matrix. + * + * @param dataMatrix matrix of data with columns representing + * variables to correlate + */ + public SpearmansCorrelation(final RealMatrix dataMatrix) { + this(dataMatrix, new NaturalRanking()); + } + + /** + * Create a SpearmansCorrelation with the given input data matrix + * and ranking algorithm. + * <p> + * From version 4.0 onwards this constructor will throw an exception + * if the provided {@link NaturalRanking} uses a {@link NaNStrategy#REMOVED} strategy. + * + * @param dataMatrix matrix of data with columns representing + * variables to correlate + * @param rankingAlgorithm ranking algorithm + */ + public SpearmansCorrelation(final RealMatrix dataMatrix, final RankingAlgorithm rankingAlgorithm) { + this.rankingAlgorithm = rankingAlgorithm; + this.data = rankTransform(dataMatrix); + rankCorrelation = new PearsonsCorrelation(data); + } + + /** + * Calculate the Spearman Rank Correlation Matrix. + * + * @return Spearman Rank Correlation Matrix + * @throws NullPointerException if this instance was created with no data + */ + public RealMatrix getCorrelationMatrix() { + return rankCorrelation.getCorrelationMatrix(); + } + + /** + * Returns a {@link PearsonsCorrelation} instance constructed from the + * ranked input data. That is, + * <code>new SpearmansCorrelation(matrix).getRankCorrelation()</code> + * is equivalent to + * <code>new PearsonsCorrelation(rankTransform(matrix))</code> where + * <code>rankTransform(matrix)</code> is the result of applying the + * configured <code>RankingAlgorithm</code> to each of the columns of + * <code>matrix.</code> + * + * <p>Returns null if this instance was created with no data.</p> + * + * @return PearsonsCorrelation among ranked column data + */ + public PearsonsCorrelation getRankCorrelation() { + return rankCorrelation; + } + + /** + * Computes the Spearman's rank correlation matrix for the columns of the + * input matrix. + * + * @param matrix matrix with columns representing variables to correlate + * @return correlation matrix + */ + public RealMatrix computeCorrelationMatrix(final RealMatrix matrix) { + final RealMatrix matrixCopy = rankTransform(matrix); + return new PearsonsCorrelation().computeCorrelationMatrix(matrixCopy); + } + + /** + * Computes the Spearman's rank correlation matrix for the columns of the + * input rectangular array. The columns of the array represent values + * of variables to be correlated. + * + * @param matrix matrix with columns representing variables to correlate + * @return correlation matrix + */ + public RealMatrix computeCorrelationMatrix(final double[][] matrix) { + return computeCorrelationMatrix(new BlockRealMatrix(matrix)); + } + + /** + * Computes the Spearman's rank correlation coefficient between the two arrays. + * + * @param xArray first data array + * @param yArray second data array + * @return Returns Spearman's rank correlation coefficient for the two arrays + * @throws DimensionMismatchException if the arrays lengths do not match + * @throws MathIllegalArgumentException if the array length is less than 2 + */ + public double correlation(final double[] xArray, final double[] yArray) { + if (xArray.length != yArray.length) { + throw new DimensionMismatchException(xArray.length, yArray.length); + } else if (xArray.length < 2) { + throw new MathIllegalArgumentException(LocalizedFormats.INSUFFICIENT_DIMENSION, + xArray.length, 2); + } else { + double[] x = xArray; + double[] y = yArray; + if (rankingAlgorithm instanceof NaturalRanking && + NaNStrategy.REMOVED == ((NaturalRanking) rankingAlgorithm).getNanStrategy()) { + final Set<Integer> nanPositions = new HashSet<Integer>(); + + nanPositions.addAll(getNaNPositions(xArray)); + nanPositions.addAll(getNaNPositions(yArray)); + + x = removeValues(xArray, nanPositions); + y = removeValues(yArray, nanPositions); + } + return new PearsonsCorrelation().correlation(rankingAlgorithm.rank(x), rankingAlgorithm.rank(y)); + } + } + + /** + * Applies rank transform to each of the columns of <code>matrix</code> + * using the current <code>rankingAlgorithm</code>. + * + * @param matrix matrix to transform + * @return a rank-transformed matrix + */ + private RealMatrix rankTransform(final RealMatrix matrix) { + RealMatrix transformed = null; + + if (rankingAlgorithm instanceof NaturalRanking && + ((NaturalRanking) rankingAlgorithm).getNanStrategy() == NaNStrategy.REMOVED) { + final Set<Integer> nanPositions = new HashSet<Integer>(); + for (int i = 0; i < matrix.getColumnDimension(); i++) { + nanPositions.addAll(getNaNPositions(matrix.getColumn(i))); + } + + // if we have found NaN values, we have to update the matrix size + if (!nanPositions.isEmpty()) { + transformed = new BlockRealMatrix(matrix.getRowDimension() - nanPositions.size(), + matrix.getColumnDimension()); + for (int i = 0; i < transformed.getColumnDimension(); i++) { + transformed.setColumn(i, removeValues(matrix.getColumn(i), nanPositions)); + } + } + } + + if (transformed == null) { + transformed = matrix.copy(); + } + + for (int i = 0; i < transformed.getColumnDimension(); i++) { + transformed.setColumn(i, rankingAlgorithm.rank(transformed.getColumn(i))); + } + + return transformed; + } + + /** + * Returns a list containing the indices of NaN values in the input array. + * + * @param input the input array + * @return a list of NaN positions in the input array + */ + private List<Integer> getNaNPositions(final double[] input) { + final List<Integer> positions = new ArrayList<Integer>(); + for (int i = 0; i < input.length; i++) { + if (Double.isNaN(input[i])) { + positions.add(i); + } + } + return positions; + } + + /** + * Removes all values from the input array at the specified indices. + * + * @param input the input array + * @param indices a set containing the indices to be removed + * @return the input array without the values at the specified indices + */ + private double[] removeValues(final double[] input, final Set<Integer> indices) { + if (indices.isEmpty()) { + return input; + } + final double[] result = new double[input.length - indices.size()]; + for (int i = 0, j = 0; i < input.length; i++) { + if (!indices.contains(i)) { + result[j++] = input[i]; + } + } + return result; + } +} diff --git a/src/main/java/org/apache/commons/math3/stat/correlation/StorelessBivariateCovariance.java b/src/main/java/org/apache/commons/math3/stat/correlation/StorelessBivariateCovariance.java new file mode 100644 index 0000000..1a798d2 --- /dev/null +++ b/src/main/java/org/apache/commons/math3/stat/correlation/StorelessBivariateCovariance.java @@ -0,0 +1,138 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math3.stat.correlation; + +import org.apache.commons.math3.exception.NumberIsTooSmallException; +import org.apache.commons.math3.exception.util.LocalizedFormats; + +/** + * Bivariate Covariance implementation that does not require input data to be + * stored in memory. + * + * <p>This class is based on a paper written by Philippe Pébay: + * <a href="http://prod.sandia.gov/techlib/access-control.cgi/2008/086212.pdf"> + * Formulas for Robust, One-Pass Parallel Computation of Covariances and + * Arbitrary-Order Statistical Moments</a>, 2008, Technical Report SAND2008-6212, + * Sandia National Laboratories. It computes the covariance for a pair of variables. + * Use {@link StorelessCovariance} to estimate an entire covariance matrix.</p> + * + * <p>Note: This class is package private as it is only used internally in + * the {@link StorelessCovariance} class.</p> + * + * @since 3.0 + */ +class StorelessBivariateCovariance { + + /** the mean of variable x */ + private double meanX; + + /** the mean of variable y */ + private double meanY; + + /** number of observations */ + private double n; + + /** the running covariance estimate */ + private double covarianceNumerator; + + /** flag for bias correction */ + private boolean biasCorrected; + + /** + * Create an empty {@link StorelessBivariateCovariance} instance with + * bias correction. + */ + StorelessBivariateCovariance() { + this(true); + } + + /** + * Create an empty {@link StorelessBivariateCovariance} instance. + * + * @param biasCorrection if <code>true</code> the covariance estimate is corrected + * for bias, i.e. n-1 in the denominator, otherwise there is no bias correction, + * i.e. n in the denominator. + */ + StorelessBivariateCovariance(final boolean biasCorrection) { + meanX = meanY = 0.0; + n = 0; + covarianceNumerator = 0.0; + biasCorrected = biasCorrection; + } + + /** + * Update the covariance estimation with a pair of variables (x, y). + * + * @param x the x value + * @param y the y value + */ + public void increment(final double x, final double y) { + n++; + final double deltaX = x - meanX; + final double deltaY = y - meanY; + meanX += deltaX / n; + meanY += deltaY / n; + covarianceNumerator += ((n - 1.0) / n) * deltaX * deltaY; + } + + /** + * Appends another bivariate covariance calculation to this. + * After this operation, statistics returned should be close to what would + * have been obtained by by performing all of the {@link #increment(double, double)} + * operations in {@code cov} directly on this. + * + * @param cov StorelessBivariateCovariance instance to append. + */ + public void append(StorelessBivariateCovariance cov) { + double oldN = n; + n += cov.n; + final double deltaX = cov.meanX - meanX; + final double deltaY = cov.meanY - meanY; + meanX += deltaX * cov.n / n; + meanY += deltaY * cov.n / n; + covarianceNumerator += cov.covarianceNumerator + oldN * cov.n / n * deltaX * deltaY; + } + + /** + * Returns the number of observations. + * + * @return number of observations + */ + public double getN() { + return n; + } + + /** + * Return the current covariance estimate. + * + * @return the current covariance + * @throws NumberIsTooSmallException if the number of observations + * is < 2 + */ + public double getResult() throws NumberIsTooSmallException { + if (n < 2) { + throw new NumberIsTooSmallException(LocalizedFormats.INSUFFICIENT_DIMENSION, + n, 2, true); + } + if (biasCorrected) { + return covarianceNumerator / (n - 1d); + } else { + return covarianceNumerator / n; + } + } +} + diff --git a/src/main/java/org/apache/commons/math3/stat/correlation/StorelessCovariance.java b/src/main/java/org/apache/commons/math3/stat/correlation/StorelessCovariance.java new file mode 100644 index 0000000..7e927ca --- /dev/null +++ b/src/main/java/org/apache/commons/math3/stat/correlation/StorelessCovariance.java @@ -0,0 +1,229 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math3.stat.correlation; + +import org.apache.commons.math3.exception.DimensionMismatchException; +import org.apache.commons.math3.exception.MathUnsupportedOperationException; +import org.apache.commons.math3.exception.NumberIsTooSmallException; +import org.apache.commons.math3.linear.MatrixUtils; +import org.apache.commons.math3.linear.RealMatrix; + +/** + * Covariance implementation that does not require input data to be + * stored in memory. The size of the covariance matrix is specified in the + * constructor. Specific elements of the matrix are incrementally updated with + * calls to incrementRow() or increment Covariance(). + * + * <p>This class is based on a paper written by Philippe Pébay: + * <a href="http://prod.sandia.gov/techlib/access-control.cgi/2008/086212.pdf"> + * Formulas for Robust, One-Pass Parallel Computation of Covariances and + * Arbitrary-Order Statistical Moments</a>, 2008, Technical Report SAND2008-6212, + * Sandia National Laboratories.</p> + * + * <p>Note: the underlying covariance matrix is symmetric, thus only the + * upper triangular part of the matrix is stored and updated each increment.</p> + * + * @since 3.0 + */ +public class StorelessCovariance extends Covariance { + + /** the square covariance matrix (upper triangular part) */ + private StorelessBivariateCovariance[] covMatrix; + + /** dimension of the square covariance matrix */ + private int dimension; + + /** + * Create a bias corrected covariance matrix with a given dimension. + * + * @param dim the dimension of the square covariance matrix + */ + public StorelessCovariance(final int dim) { + this(dim, true); + } + + /** + * Create a covariance matrix with a given number of rows and columns and the + * indicated bias correction. + * + * @param dim the dimension of the covariance matrix + * @param biasCorrected if <code>true</code> the covariance estimate is corrected + * for bias, i.e. n-1 in the denominator, otherwise there is no bias correction, + * i.e. n in the denominator. + */ + public StorelessCovariance(final int dim, final boolean biasCorrected) { + dimension = dim; + covMatrix = new StorelessBivariateCovariance[dimension * (dimension + 1) / 2]; + initializeMatrix(biasCorrected); + } + + /** + * Initialize the internal two-dimensional array of + * {@link StorelessBivariateCovariance} instances. + * + * @param biasCorrected if the covariance estimate shall be corrected for bias + */ + private void initializeMatrix(final boolean biasCorrected) { + for(int i = 0; i < dimension; i++){ + for(int j = 0; j < dimension; j++){ + setElement(i, j, new StorelessBivariateCovariance(biasCorrected)); + } + } + } + + /** + * Returns the index (i, j) translated into the one-dimensional + * array used to store the upper triangular part of the symmetric + * covariance matrix. + * + * @param i the row index + * @param j the column index + * @return the corresponding index in the matrix array + */ + private int indexOf(final int i, final int j) { + return j < i ? i * (i + 1) / 2 + j : j * (j + 1) / 2 + i; + } + + /** + * Gets the element at index (i, j) from the covariance matrix + * @param i the row index + * @param j the column index + * @return the {@link StorelessBivariateCovariance} element at the given index + */ + private StorelessBivariateCovariance getElement(final int i, final int j) { + return covMatrix[indexOf(i, j)]; + } + + /** + * Sets the covariance element at index (i, j) in the covariance matrix + * @param i the row index + * @param j the column index + * @param cov the {@link StorelessBivariateCovariance} element to be set + */ + private void setElement(final int i, final int j, + final StorelessBivariateCovariance cov) { + covMatrix[indexOf(i, j)] = cov; + } + + /** + * Get the covariance for an individual element of the covariance matrix. + * + * @param xIndex row index in the covariance matrix + * @param yIndex column index in the covariance matrix + * @return the covariance of the given element + * @throws NumberIsTooSmallException if the number of observations + * in the cell is < 2 + */ + public double getCovariance(final int xIndex, + final int yIndex) + throws NumberIsTooSmallException { + + return getElement(xIndex, yIndex).getResult(); + + } + + /** + * Increment the covariance matrix with one row of data. + * + * @param data array representing one row of data. + * @throws DimensionMismatchException if the length of <code>rowData</code> + * does not match with the covariance matrix + */ + public void increment(final double[] data) + throws DimensionMismatchException { + + int length = data.length; + if (length != dimension) { + throw new DimensionMismatchException(length, dimension); + } + + // only update the upper triangular part of the covariance matrix + // as only these parts are actually stored + for (int i = 0; i < length; i++){ + for (int j = i; j < length; j++){ + getElement(i, j).increment(data[i], data[j]); + } + } + + } + + /** + * Appends {@code sc} to this, effectively aggregating the computations in {@code sc} + * with this. After invoking this method, covariances returned should be close + * to what would have been obtained by performing all of the {@link #increment(double[])} + * operations in {@code sc} directly on this. + * + * @param sc externally computed StorelessCovariance to add to this + * @throws DimensionMismatchException if the dimension of sc does not match this + * @since 3.3 + */ + public void append(StorelessCovariance sc) throws DimensionMismatchException { + if (sc.dimension != dimension) { + throw new DimensionMismatchException(sc.dimension, dimension); + } + + // only update the upper triangular part of the covariance matrix + // as only these parts are actually stored + for (int i = 0; i < dimension; i++) { + for (int j = i; j < dimension; j++) { + getElement(i, j).append(sc.getElement(i, j)); + } + } + } + + /** + * {@inheritDoc} + * @throws NumberIsTooSmallException if the number of observations + * in a cell is < 2 + */ + @Override + public RealMatrix getCovarianceMatrix() throws NumberIsTooSmallException { + return MatrixUtils.createRealMatrix(getData()); + } + + /** + * Return the covariance matrix as two-dimensional array. + * + * @return a two-dimensional double array of covariance values + * @throws NumberIsTooSmallException if the number of observations + * for a cell is < 2 + */ + public double[][] getData() throws NumberIsTooSmallException { + final double[][] data = new double[dimension][dimension]; + for (int i = 0; i < dimension; i++) { + for (int j = 0; j < dimension; j++) { + data[i][j] = getElement(i, j).getResult(); + } + } + return data; + } + + /** + * This {@link Covariance} method is not supported by a {@link StorelessCovariance}, + * since the number of bivariate observations does not have to be the same for different + * pairs of covariates - i.e., N as defined in {@link Covariance#getN()} is undefined. + * + * @return nothing as this implementation always throws a + * {@link MathUnsupportedOperationException} + * @throws MathUnsupportedOperationException in all cases + */ + @Override + public int getN() + throws MathUnsupportedOperationException { + throw new MathUnsupportedOperationException(); + } +} diff --git a/src/main/java/org/apache/commons/math3/stat/correlation/package-info.java b/src/main/java/org/apache/commons/math3/stat/correlation/package-info.java new file mode 100644 index 0000000..adf285e --- /dev/null +++ b/src/main/java/org/apache/commons/math3/stat/correlation/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * + * Correlations/Covariance computations. + * + */ +package org.apache.commons.math3.stat.correlation; |