diff options
Diffstat (limited to 'src/main/java/org/apache/commons/math3/stat/regression')
10 files changed, 3430 insertions, 0 deletions
diff --git a/src/main/java/org/apache/commons/math3/stat/regression/AbstractMultipleLinearRegression.java b/src/main/java/org/apache/commons/math3/stat/regression/AbstractMultipleLinearRegression.java new file mode 100644 index 0000000..9b7c40a --- /dev/null +++ b/src/main/java/org/apache/commons/math3/stat/regression/AbstractMultipleLinearRegression.java @@ -0,0 +1,383 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math3.stat.regression; + +import org.apache.commons.math3.exception.DimensionMismatchException; +import org.apache.commons.math3.exception.InsufficientDataException; +import org.apache.commons.math3.exception.MathIllegalArgumentException; +import org.apache.commons.math3.exception.NoDataException; +import org.apache.commons.math3.exception.NullArgumentException; +import org.apache.commons.math3.exception.util.LocalizedFormats; +import org.apache.commons.math3.linear.NonSquareMatrixException; +import org.apache.commons.math3.linear.RealMatrix; +import org.apache.commons.math3.linear.Array2DRowRealMatrix; +import org.apache.commons.math3.linear.RealVector; +import org.apache.commons.math3.linear.ArrayRealVector; +import org.apache.commons.math3.stat.descriptive.moment.Variance; +import org.apache.commons.math3.util.FastMath; + +/** + * Abstract base class for implementations of MultipleLinearRegression. + * @since 2.0 + */ +public abstract class AbstractMultipleLinearRegression implements + MultipleLinearRegression { + + /** X sample data. */ + private RealMatrix xMatrix; + + /** Y sample data. */ + private RealVector yVector; + + /** Whether or not the regression model includes an intercept. True means no intercept. */ + private boolean noIntercept = false; + + /** + * @return the X sample data. + */ + protected RealMatrix getX() { + return xMatrix; + } + + /** + * @return the Y sample data. + */ + protected RealVector getY() { + return yVector; + } + + /** + * @return true if the model has no intercept term; false otherwise + * @since 2.2 + */ + public boolean isNoIntercept() { + return noIntercept; + } + + /** + * @param noIntercept true means the model is to be estimated without an intercept term + * @since 2.2 + */ + public void setNoIntercept(boolean noIntercept) { + this.noIntercept = noIntercept; + } + + /** + * <p>Loads model x and y sample data from a flat input array, overriding any previous sample. + * </p> + * <p>Assumes that rows are concatenated with y values first in each row. For example, an input + * <code>data</code> array containing the sequence of values (1, 2, 3, 4, 5, 6, 7, 8, 9) with + * <code>nobs = 3</code> and <code>nvars = 2</code> creates a regression dataset with two + * independent variables, as below: + * <pre> + * y x[0] x[1] + * -------------- + * 1 2 3 + * 4 5 6 + * 7 8 9 + * </pre> + * </p> + * <p>Note that there is no need to add an initial unitary column (column of 1's) when + * specifying a model including an intercept term. If {@link #isNoIntercept()} is <code>true</code>, + * the X matrix will be created without an initial column of "1"s; otherwise this column will + * be added. + * </p> + * <p>Throws IllegalArgumentException if any of the following preconditions fail: + * <ul><li><code>data</code> cannot be null</li> + * <li><code>data.length = nobs * (nvars + 1)</li> + * <li><code>nobs > nvars</code></li></ul> + * </p> + * + * @param data input data array + * @param nobs number of observations (rows) + * @param nvars number of independent variables (columns, not counting y) + * @throws NullArgumentException if the data array is null + * @throws DimensionMismatchException if the length of the data array is not equal + * to <code>nobs * (nvars + 1)</code> + * @throws InsufficientDataException if <code>nobs</code> is less than + * <code>nvars + 1</code> + */ + public void newSampleData(double[] data, int nobs, int nvars) { + if (data == null) { + throw new NullArgumentException(); + } + if (data.length != nobs * (nvars + 1)) { + throw new DimensionMismatchException(data.length, nobs * (nvars + 1)); + } + if (nobs <= nvars) { + throw new InsufficientDataException(LocalizedFormats.INSUFFICIENT_OBSERVED_POINTS_IN_SAMPLE, nobs, nvars + 1); + } + double[] y = new double[nobs]; + final int cols = noIntercept ? nvars: nvars + 1; + double[][] x = new double[nobs][cols]; + int pointer = 0; + for (int i = 0; i < nobs; i++) { + y[i] = data[pointer++]; + if (!noIntercept) { + x[i][0] = 1.0d; + } + for (int j = noIntercept ? 0 : 1; j < cols; j++) { + x[i][j] = data[pointer++]; + } + } + this.xMatrix = new Array2DRowRealMatrix(x); + this.yVector = new ArrayRealVector(y); + } + + /** + * Loads new y sample data, overriding any previous data. + * + * @param y the array representing the y sample + * @throws NullArgumentException if y is null + * @throws NoDataException if y is empty + */ + protected void newYSampleData(double[] y) { + if (y == null) { + throw new NullArgumentException(); + } + if (y.length == 0) { + throw new NoDataException(); + } + this.yVector = new ArrayRealVector(y); + } + + /** + * <p>Loads new x sample data, overriding any previous data. + * </p> + * The input <code>x</code> array should have one row for each sample + * observation, with columns corresponding to independent variables. + * For example, if <pre> + * <code> x = new double[][] {{1, 2}, {3, 4}, {5, 6}} </code></pre> + * then <code>setXSampleData(x) </code> results in a model with two independent + * variables and 3 observations: + * <pre> + * x[0] x[1] + * ---------- + * 1 2 + * 3 4 + * 5 6 + * </pre> + * </p> + * <p>Note that there is no need to add an initial unitary column (column of 1's) when + * specifying a model including an intercept term. + * </p> + * @param x the rectangular array representing the x sample + * @throws NullArgumentException if x is null + * @throws NoDataException if x is empty + * @throws DimensionMismatchException if x is not rectangular + */ + protected void newXSampleData(double[][] x) { + if (x == null) { + throw new NullArgumentException(); + } + if (x.length == 0) { + throw new NoDataException(); + } + if (noIntercept) { + this.xMatrix = new Array2DRowRealMatrix(x, true); + } else { // Augment design matrix with initial unitary column + final int nVars = x[0].length; + final double[][] xAug = new double[x.length][nVars + 1]; + for (int i = 0; i < x.length; i++) { + if (x[i].length != nVars) { + throw new DimensionMismatchException(x[i].length, nVars); + } + xAug[i][0] = 1.0d; + System.arraycopy(x[i], 0, xAug[i], 1, nVars); + } + this.xMatrix = new Array2DRowRealMatrix(xAug, false); + } + } + + /** + * Validates sample data. Checks that + * <ul><li>Neither x nor y is null or empty;</li> + * <li>The length (i.e. number of rows) of x equals the length of y</li> + * <li>x has at least one more row than it has columns (i.e. there is + * sufficient data to estimate regression coefficients for each of the + * columns in x plus an intercept.</li> + * </ul> + * + * @param x the [n,k] array representing the x data + * @param y the [n,1] array representing the y data + * @throws NullArgumentException if {@code x} or {@code y} is null + * @throws DimensionMismatchException if {@code x} and {@code y} do not + * have the same length + * @throws NoDataException if {@code x} or {@code y} are zero-length + * @throws MathIllegalArgumentException if the number of rows of {@code x} + * is not larger than the number of columns + 1 + */ + protected void validateSampleData(double[][] x, double[] y) throws MathIllegalArgumentException { + if ((x == null) || (y == null)) { + throw new NullArgumentException(); + } + if (x.length != y.length) { + throw new DimensionMismatchException(y.length, x.length); + } + if (x.length == 0) { // Must be no y data either + throw new NoDataException(); + } + if (x[0].length + 1 > x.length) { + throw new MathIllegalArgumentException( + LocalizedFormats.NOT_ENOUGH_DATA_FOR_NUMBER_OF_PREDICTORS, + x.length, x[0].length); + } + } + + /** + * Validates that the x data and covariance matrix have the same + * number of rows and that the covariance matrix is square. + * + * @param x the [n,k] array representing the x sample + * @param covariance the [n,n] array representing the covariance matrix + * @throws DimensionMismatchException if the number of rows in x is not equal + * to the number of rows in covariance + * @throws NonSquareMatrixException if the covariance matrix is not square + */ + protected void validateCovarianceData(double[][] x, double[][] covariance) { + if (x.length != covariance.length) { + throw new DimensionMismatchException(x.length, covariance.length); + } + if (covariance.length > 0 && covariance.length != covariance[0].length) { + throw new NonSquareMatrixException(covariance.length, covariance[0].length); + } + } + + /** + * {@inheritDoc} + */ + public double[] estimateRegressionParameters() { + RealVector b = calculateBeta(); + return b.toArray(); + } + + /** + * {@inheritDoc} + */ + public double[] estimateResiduals() { + RealVector b = calculateBeta(); + RealVector e = yVector.subtract(xMatrix.operate(b)); + return e.toArray(); + } + + /** + * {@inheritDoc} + */ + public double[][] estimateRegressionParametersVariance() { + return calculateBetaVariance().getData(); + } + + /** + * {@inheritDoc} + */ + public double[] estimateRegressionParametersStandardErrors() { + double[][] betaVariance = estimateRegressionParametersVariance(); + double sigma = calculateErrorVariance(); + int length = betaVariance[0].length; + double[] result = new double[length]; + for (int i = 0; i < length; i++) { + result[i] = FastMath.sqrt(sigma * betaVariance[i][i]); + } + return result; + } + + /** + * {@inheritDoc} + */ + public double estimateRegressandVariance() { + return calculateYVariance(); + } + + /** + * Estimates the variance of the error. + * + * @return estimate of the error variance + * @since 2.2 + */ + public double estimateErrorVariance() { + return calculateErrorVariance(); + + } + + /** + * Estimates the standard error of the regression. + * + * @return regression standard error + * @since 2.2 + */ + public double estimateRegressionStandardError() { + return FastMath.sqrt(estimateErrorVariance()); + } + + /** + * Calculates the beta of multiple linear regression in matrix notation. + * + * @return beta + */ + protected abstract RealVector calculateBeta(); + + /** + * Calculates the beta variance of multiple linear regression in matrix + * notation. + * + * @return beta variance + */ + protected abstract RealMatrix calculateBetaVariance(); + + + /** + * Calculates the variance of the y values. + * + * @return Y variance + */ + protected double calculateYVariance() { + return new Variance().evaluate(yVector.toArray()); + } + + /** + * <p>Calculates the variance of the error term.</p> + * Uses the formula <pre> + * var(u) = u · u / (n - k) + * </pre> + * where n and k are the row and column dimensions of the design + * matrix X. + * + * @return error variance estimate + * @since 2.2 + */ + protected double calculateErrorVariance() { + RealVector residuals = calculateResiduals(); + return residuals.dotProduct(residuals) / + (xMatrix.getRowDimension() - xMatrix.getColumnDimension()); + } + + /** + * Calculates the residuals of multiple linear regression in matrix + * notation. + * + * <pre> + * u = y - X * b + * </pre> + * + * @return The residuals [n,1] matrix + */ + protected RealVector calculateResiduals() { + RealVector b = calculateBeta(); + return yVector.subtract(xMatrix.operate(b)); + } + +} diff --git a/src/main/java/org/apache/commons/math3/stat/regression/GLSMultipleLinearRegression.java b/src/main/java/org/apache/commons/math3/stat/regression/GLSMultipleLinearRegression.java new file mode 100644 index 0000000..1644e6d --- /dev/null +++ b/src/main/java/org/apache/commons/math3/stat/regression/GLSMultipleLinearRegression.java @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math3.stat.regression; + +import org.apache.commons.math3.linear.LUDecomposition; +import org.apache.commons.math3.linear.RealMatrix; +import org.apache.commons.math3.linear.Array2DRowRealMatrix; +import org.apache.commons.math3.linear.RealVector; + +/** + * The GLS implementation of multiple linear regression. + * + * GLS assumes a general covariance matrix Omega of the error + * <pre> + * u ~ N(0, Omega) + * </pre> + * + * Estimated by GLS, + * <pre> + * b=(X' Omega^-1 X)^-1X'Omega^-1 y + * </pre> + * whose variance is + * <pre> + * Var(b)=(X' Omega^-1 X)^-1 + * </pre> + * @since 2.0 + */ +public class GLSMultipleLinearRegression extends AbstractMultipleLinearRegression { + + /** Covariance matrix. */ + private RealMatrix Omega; + + /** Inverse of covariance matrix. */ + private RealMatrix OmegaInverse; + + /** Replace sample data, overriding any previous sample. + * @param y y values of the sample + * @param x x values of the sample + * @param covariance array representing the covariance matrix + */ + public void newSampleData(double[] y, double[][] x, double[][] covariance) { + validateSampleData(x, y); + newYSampleData(y); + newXSampleData(x); + validateCovarianceData(x, covariance); + newCovarianceData(covariance); + } + + /** + * Add the covariance data. + * + * @param omega the [n,n] array representing the covariance + */ + protected void newCovarianceData(double[][] omega){ + this.Omega = new Array2DRowRealMatrix(omega); + this.OmegaInverse = null; + } + + /** + * Get the inverse of the covariance. + * <p>The inverse of the covariance matrix is lazily evaluated and cached.</p> + * @return inverse of the covariance + */ + protected RealMatrix getOmegaInverse() { + if (OmegaInverse == null) { + OmegaInverse = new LUDecomposition(Omega).getSolver().getInverse(); + } + return OmegaInverse; + } + + /** + * Calculates beta by GLS. + * <pre> + * b=(X' Omega^-1 X)^-1X'Omega^-1 y + * </pre> + * @return beta + */ + @Override + protected RealVector calculateBeta() { + RealMatrix OI = getOmegaInverse(); + RealMatrix XT = getX().transpose(); + RealMatrix XTOIX = XT.multiply(OI).multiply(getX()); + RealMatrix inverse = new LUDecomposition(XTOIX).getSolver().getInverse(); + return inverse.multiply(XT).multiply(OI).operate(getY()); + } + + /** + * Calculates the variance on the beta. + * <pre> + * Var(b)=(X' Omega^-1 X)^-1 + * </pre> + * @return The beta variance matrix + */ + @Override + protected RealMatrix calculateBetaVariance() { + RealMatrix OI = getOmegaInverse(); + RealMatrix XTOIX = getX().transpose().multiply(OI).multiply(getX()); + return new LUDecomposition(XTOIX).getSolver().getInverse(); + } + + + /** + * Calculates the estimated variance of the error term using the formula + * <pre> + * Var(u) = Tr(u' Omega^-1 u)/(n-k) + * </pre> + * where n and k are the row and column dimensions of the design + * matrix X. + * + * @return error variance + * @since 2.2 + */ + @Override + protected double calculateErrorVariance() { + RealVector residuals = calculateResiduals(); + double t = residuals.dotProduct(getOmegaInverse().operate(residuals)); + return t / (getX().getRowDimension() - getX().getColumnDimension()); + + } + +} diff --git a/src/main/java/org/apache/commons/math3/stat/regression/MillerUpdatingRegression.java b/src/main/java/org/apache/commons/math3/stat/regression/MillerUpdatingRegression.java new file mode 100644 index 0000000..3fe3c03 --- /dev/null +++ b/src/main/java/org/apache/commons/math3/stat/regression/MillerUpdatingRegression.java @@ -0,0 +1,1101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math3.stat.regression; + +import java.util.Arrays; +import org.apache.commons.math3.exception.util.LocalizedFormats; +import org.apache.commons.math3.util.FastMath; +import org.apache.commons.math3.util.Precision; +import org.apache.commons.math3.util.MathArrays; + +/** + * This class is a concrete implementation of the {@link UpdatingMultipleLinearRegression} interface. + * + * <p>The algorithm is described in: <pre> + * Algorithm AS 274: Least Squares Routines to Supplement Those of Gentleman + * Author(s): Alan J. Miller + * Source: Journal of the Royal Statistical Society. + * Series C (Applied Statistics), Vol. 41, No. 2 + * (1992), pp. 458-478 + * Published by: Blackwell Publishing for the Royal Statistical Society + * Stable URL: http://www.jstor.org/stable/2347583 </pre></p> + * + * <p>This method for multiple regression forms the solution to the OLS problem + * by updating the QR decomposition as described by Gentleman.</p> + * + * @since 3.0 + */ +public class MillerUpdatingRegression implements UpdatingMultipleLinearRegression { + + /** number of variables in regression */ + private final int nvars; + /** diagonals of cross products matrix */ + private final double[] d; + /** the elements of the R`Y */ + private final double[] rhs; + /** the off diagonal portion of the R matrix */ + private final double[] r; + /** the tolerance for each of the variables */ + private final double[] tol; + /** residual sum of squares for all nested regressions */ + private final double[] rss; + /** order of the regressors */ + private final int[] vorder; + /** scratch space for tolerance calc */ + private final double[] work_tolset; + /** number of observations entered */ + private long nobs = 0; + /** sum of squared errors of largest regression */ + private double sserr = 0.0; + /** has rss been called? */ + private boolean rss_set = false; + /** has the tolerance setting method been called */ + private boolean tol_set = false; + /** flags for variables with linear dependency problems */ + private final boolean[] lindep; + /** singular x values */ + private final double[] x_sing; + /** workspace for singularity method */ + private final double[] work_sing; + /** summation of Y variable */ + private double sumy = 0.0; + /** summation of squared Y values */ + private double sumsqy = 0.0; + /** boolean flag whether a regression constant is added */ + private boolean hasIntercept; + /** zero tolerance */ + private final double epsilon; + /** + * Set the default constructor to private access + * to prevent inadvertent instantiation + */ + @SuppressWarnings("unused") + private MillerUpdatingRegression() { + this(-1, false, Double.NaN); + } + + /** + * This is the augmented constructor for the MillerUpdatingRegression class. + * + * @param numberOfVariables number of regressors to expect, not including constant + * @param includeConstant include a constant automatically + * @param errorTolerance zero tolerance, how machine zero is determined + * @throws ModelSpecificationException if {@code numberOfVariables is less than 1} + */ + public MillerUpdatingRegression(int numberOfVariables, boolean includeConstant, double errorTolerance) + throws ModelSpecificationException { + if (numberOfVariables < 1) { + throw new ModelSpecificationException(LocalizedFormats.NO_REGRESSORS); + } + if (includeConstant) { + this.nvars = numberOfVariables + 1; + } else { + this.nvars = numberOfVariables; + } + this.hasIntercept = includeConstant; + this.nobs = 0; + this.d = new double[this.nvars]; + this.rhs = new double[this.nvars]; + this.r = new double[this.nvars * (this.nvars - 1) / 2]; + this.tol = new double[this.nvars]; + this.rss = new double[this.nvars]; + this.vorder = new int[this.nvars]; + this.x_sing = new double[this.nvars]; + this.work_sing = new double[this.nvars]; + this.work_tolset = new double[this.nvars]; + this.lindep = new boolean[this.nvars]; + for (int i = 0; i < this.nvars; i++) { + vorder[i] = i; + } + if (errorTolerance > 0) { + this.epsilon = errorTolerance; + } else { + this.epsilon = -errorTolerance; + } + } + + /** + * Primary constructor for the MillerUpdatingRegression. + * + * @param numberOfVariables maximum number of potential regressors + * @param includeConstant include a constant automatically + * @throws ModelSpecificationException if {@code numberOfVariables is less than 1} + */ + public MillerUpdatingRegression(int numberOfVariables, boolean includeConstant) + throws ModelSpecificationException { + this(numberOfVariables, includeConstant, Precision.EPSILON); + } + + /** + * A getter method which determines whether a constant is included. + * @return true regression has an intercept, false no intercept + */ + public boolean hasIntercept() { + return this.hasIntercept; + } + + /** + * Gets the number of observations added to the regression model. + * @return number of observations + */ + public long getN() { + return this.nobs; + } + + /** + * Adds an observation to the regression model. + * @param x the array with regressor values + * @param y the value of dependent variable given these regressors + * @exception ModelSpecificationException if the length of {@code x} does not equal + * the number of independent variables in the model + */ + public void addObservation(final double[] x, final double y) + throws ModelSpecificationException { + + if ((!this.hasIntercept && x.length != nvars) || + (this.hasIntercept && x.length + 1 != nvars)) { + throw new ModelSpecificationException(LocalizedFormats.INVALID_REGRESSION_OBSERVATION, + x.length, nvars); + } + if (!this.hasIntercept) { + include(MathArrays.copyOf(x, x.length), 1.0, y); + } else { + final double[] tmp = new double[x.length + 1]; + System.arraycopy(x, 0, tmp, 1, x.length); + tmp[0] = 1.0; + include(tmp, 1.0, y); + } + ++nobs; + + } + + /** + * Adds multiple observations to the model. + * @param x observations on the regressors + * @param y observations on the regressand + * @throws ModelSpecificationException if {@code x} is not rectangular, does not match + * the length of {@code y} or does not contain sufficient data to estimate the model + */ + public void addObservations(double[][] x, double[] y) throws ModelSpecificationException { + if ((x == null) || (y == null) || (x.length != y.length)) { + throw new ModelSpecificationException( + LocalizedFormats.DIMENSIONS_MISMATCH_SIMPLE, + (x == null) ? 0 : x.length, + (y == null) ? 0 : y.length); + } + if (x.length == 0) { // Must be no y data either + throw new ModelSpecificationException( + LocalizedFormats.NO_DATA); + } + if (x[0].length + 1 > x.length) { + throw new ModelSpecificationException( + LocalizedFormats.NOT_ENOUGH_DATA_FOR_NUMBER_OF_PREDICTORS, + x.length, x[0].length); + } + for (int i = 0; i < x.length; i++) { + addObservation(x[i], y[i]); + } + } + + /** + * The include method is where the QR decomposition occurs. This statement forms all + * intermediate data which will be used for all derivative measures. + * According to the miller paper, note that in the original implementation the x vector + * is overwritten. In this implementation, the include method is passed a copy of the + * original data vector so that there is no contamination of the data. Additionally, + * this method differs slightly from Gentleman's method, in that the assumption is + * of dense design matrices, there is some advantage in using the original gentleman algorithm + * on sparse matrices. + * + * @param x observations on the regressors + * @param wi weight of the this observation (-1,1) + * @param yi observation on the regressand + */ + private void include(final double[] x, final double wi, final double yi) { + int nextr = 0; + double w = wi; + double y = yi; + double xi; + double di; + double wxi; + double dpi; + double xk; + double _w; + this.rss_set = false; + sumy = smartAdd(yi, sumy); + sumsqy = smartAdd(sumsqy, yi * yi); + for (int i = 0; i < x.length; i++) { + if (w == 0.0) { + return; + } + xi = x[i]; + + if (xi == 0.0) { + nextr += nvars - i - 1; + continue; + } + di = d[i]; + wxi = w * xi; + _w = w; + if (di != 0.0) { + dpi = smartAdd(di, wxi * xi); + final double tmp = wxi * xi / di; + if (FastMath.abs(tmp) > Precision.EPSILON) { + w = (di * w) / dpi; + } + } else { + dpi = wxi * xi; + w = 0.0; + } + d[i] = dpi; + for (int k = i + 1; k < nvars; k++) { + xk = x[k]; + x[k] = smartAdd(xk, -xi * r[nextr]); + if (di != 0.0) { + r[nextr] = smartAdd(di * r[nextr], (_w * xi) * xk) / dpi; + } else { + r[nextr] = xk / xi; + } + ++nextr; + } + xk = y; + y = smartAdd(xk, -xi * rhs[i]); + if (di != 0.0) { + rhs[i] = smartAdd(di * rhs[i], wxi * xk) / dpi; + } else { + rhs[i] = xk / xi; + } + } + sserr = smartAdd(sserr, w * y * y); + } + + /** + * Adds to number a and b such that the contamination due to + * numerical smallness of one addend does not corrupt the sum. + * @param a - an addend + * @param b - an addend + * @return the sum of the a and b + */ + private double smartAdd(double a, double b) { + final double _a = FastMath.abs(a); + final double _b = FastMath.abs(b); + if (_a > _b) { + final double eps = _a * Precision.EPSILON; + if (_b > eps) { + return a + b; + } + return a; + } else { + final double eps = _b * Precision.EPSILON; + if (_a > eps) { + return a + b; + } + return b; + } + } + + /** + * As the name suggests, clear wipes the internals and reorders everything in the + * canonical order. + */ + public void clear() { + Arrays.fill(this.d, 0.0); + Arrays.fill(this.rhs, 0.0); + Arrays.fill(this.r, 0.0); + Arrays.fill(this.tol, 0.0); + Arrays.fill(this.rss, 0.0); + Arrays.fill(this.work_tolset, 0.0); + Arrays.fill(this.work_sing, 0.0); + Arrays.fill(this.x_sing, 0.0); + Arrays.fill(this.lindep, false); + for (int i = 0; i < nvars; i++) { + this.vorder[i] = i; + } + this.nobs = 0; + this.sserr = 0.0; + this.sumy = 0.0; + this.sumsqy = 0.0; + this.rss_set = false; + this.tol_set = false; + } + + /** + * This sets up tolerances for singularity testing. + */ + private void tolset() { + int pos; + double total; + final double eps = this.epsilon; + for (int i = 0; i < nvars; i++) { + this.work_tolset[i] = FastMath.sqrt(d[i]); + } + tol[0] = eps * this.work_tolset[0]; + for (int col = 1; col < nvars; col++) { + pos = col - 1; + total = work_tolset[col]; + for (int row = 0; row < col; row++) { + total += FastMath.abs(r[pos]) * work_tolset[row]; + pos += nvars - row - 2; + } + tol[col] = eps * total; + } + tol_set = true; + } + + /** + * The regcf method conducts the linear regression and extracts the + * parameter vector. Notice that the algorithm can do subset regression + * with no alteration. + * + * @param nreq how many of the regressors to include (either in canonical + * order, or in the current reordered state) + * @return an array with the estimated slope coefficients + * @throws ModelSpecificationException if {@code nreq} is less than 1 + * or greater than the number of independent variables + */ + private double[] regcf(int nreq) throws ModelSpecificationException { + int nextr; + if (nreq < 1) { + throw new ModelSpecificationException(LocalizedFormats.NO_REGRESSORS); + } + if (nreq > this.nvars) { + throw new ModelSpecificationException( + LocalizedFormats.TOO_MANY_REGRESSORS, nreq, this.nvars); + } + if (!this.tol_set) { + tolset(); + } + final double[] ret = new double[nreq]; + boolean rankProblem = false; + for (int i = nreq - 1; i > -1; i--) { + if (FastMath.sqrt(d[i]) < tol[i]) { + ret[i] = 0.0; + d[i] = 0.0; + rankProblem = true; + } else { + ret[i] = rhs[i]; + nextr = i * (nvars + nvars - i - 1) / 2; + for (int j = i + 1; j < nreq; j++) { + ret[i] = smartAdd(ret[i], -r[nextr] * ret[j]); + ++nextr; + } + } + } + if (rankProblem) { + for (int i = 0; i < nreq; i++) { + if (this.lindep[i]) { + ret[i] = Double.NaN; + } + } + } + return ret; + } + + /** + * The method which checks for singularities and then eliminates the offending + * columns. + */ + private void singcheck() { + int pos; + for (int i = 0; i < nvars; i++) { + work_sing[i] = FastMath.sqrt(d[i]); + } + for (int col = 0; col < nvars; col++) { + // Set elements within R to zero if they are less than tol(col) in + // absolute value after being scaled by the square root of their row + // multiplier + final double temp = tol[col]; + pos = col - 1; + for (int row = 0; row < col - 1; row++) { + if (FastMath.abs(r[pos]) * work_sing[row] < temp) { + r[pos] = 0.0; + } + pos += nvars - row - 2; + } + // If diagonal element is near zero, set it to zero, set appropriate + // element of LINDEP, and use INCLUD to augment the projections in + // the lower rows of the orthogonalization. + lindep[col] = false; + if (work_sing[col] < temp) { + lindep[col] = true; + if (col < nvars - 1) { + Arrays.fill(x_sing, 0.0); + int _pi = col * (nvars + nvars - col - 1) / 2; + for (int _xi = col + 1; _xi < nvars; _xi++, _pi++) { + x_sing[_xi] = r[_pi]; + r[_pi] = 0.0; + } + final double y = rhs[col]; + final double weight = d[col]; + d[col] = 0.0; + rhs[col] = 0.0; + this.include(x_sing, weight, y); + } else { + sserr += d[col] * rhs[col] * rhs[col]; + } + } + } + } + + /** + * Calculates the sum of squared errors for the full regression + * and all subsets in the following manner: <pre> + * rss[] ={ + * ResidualSumOfSquares_allNvars, + * ResidualSumOfSquares_FirstNvars-1, + * ResidualSumOfSquares_FirstNvars-2, + * ..., ResidualSumOfSquares_FirstVariable} </pre> + */ + private void ss() { + double total = sserr; + rss[nvars - 1] = sserr; + for (int i = nvars - 1; i > 0; i--) { + total += d[i] * rhs[i] * rhs[i]; + rss[i - 1] = total; + } + rss_set = true; + } + + /** + * Calculates the cov matrix assuming only the first nreq variables are + * included in the calculation. The returned array contains a symmetric + * matrix stored in lower triangular form. The matrix will have + * ( nreq + 1 ) * nreq / 2 elements. For illustration <pre> + * cov = + * { + * cov_00, + * cov_10, cov_11, + * cov_20, cov_21, cov22, + * ... + * } </pre> + * + * @param nreq how many of the regressors to include (either in canonical + * order, or in the current reordered state) + * @return an array with the variance covariance of the included + * regressors in lower triangular form + */ + private double[] cov(int nreq) { + if (this.nobs <= nreq) { + return null; + } + double rnk = 0.0; + for (int i = 0; i < nreq; i++) { + if (!this.lindep[i]) { + rnk += 1.0; + } + } + final double var = rss[nreq - 1] / (nobs - rnk); + final double[] rinv = new double[nreq * (nreq - 1) / 2]; + inverse(rinv, nreq); + final double[] covmat = new double[nreq * (nreq + 1) / 2]; + Arrays.fill(covmat, Double.NaN); + int pos2; + int pos1; + int start = 0; + double total = 0; + for (int row = 0; row < nreq; row++) { + pos2 = start; + if (!this.lindep[row]) { + for (int col = row; col < nreq; col++) { + if (!this.lindep[col]) { + pos1 = start + col - row; + if (row == col) { + total = 1.0 / d[col]; + } else { + total = rinv[pos1 - 1] / d[col]; + } + for (int k = col + 1; k < nreq; k++) { + if (!this.lindep[k]) { + total += rinv[pos1] * rinv[pos2] / d[k]; + } + ++pos1; + ++pos2; + } + covmat[ (col + 1) * col / 2 + row] = total * var; + } else { + pos2 += nreq - col - 1; + } + } + } + start += nreq - row - 1; + } + return covmat; + } + + /** + * This internal method calculates the inverse of the upper-triangular portion + * of the R matrix. + * @param rinv the storage for the inverse of r + * @param nreq how many of the regressors to include (either in canonical + * order, or in the current reordered state) + */ + private void inverse(double[] rinv, int nreq) { + int pos = nreq * (nreq - 1) / 2 - 1; + int pos1 = -1; + int pos2 = -1; + double total = 0.0; + Arrays.fill(rinv, Double.NaN); + for (int row = nreq - 1; row > 0; --row) { + if (!this.lindep[row]) { + final int start = (row - 1) * (nvars + nvars - row) / 2; + for (int col = nreq; col > row; --col) { + pos1 = start; + pos2 = pos; + total = 0.0; + for (int k = row; k < col - 1; k++) { + pos2 += nreq - k - 1; + if (!this.lindep[k]) { + total += -r[pos1] * rinv[pos2]; + } + ++pos1; + } + rinv[pos] = total - r[pos1]; + --pos; + } + } else { + pos -= nreq - row; + } + } + } + + /** + * In the original algorithm only the partial correlations of the regressors + * is returned to the user. In this implementation, we have <pre> + * corr = + * { + * corrxx - lower triangular + * corrxy - bottom row of the matrix + * } + * Replaces subroutines PCORR and COR of: + * ALGORITHM AS274 APPL. STATIST. (1992) VOL.41, NO. 2 </pre> + * + * <p>Calculate partial correlations after the variables in rows + * 1, 2, ..., IN have been forced into the regression. + * If IN = 1, and the first row of R represents a constant in the + * model, then the usual simple correlations are returned.</p> + * + * <p>If IN = 0, the value returned in array CORMAT for the correlation + * of variables Xi & Xj is: <pre> + * sum ( Xi.Xj ) / Sqrt ( sum (Xi^2) . sum (Xj^2) )</pre></p> + * + * <p>On return, array CORMAT contains the upper triangle of the matrix of + * partial correlations stored by rows, excluding the 1's on the diagonal. + * e.g. if IN = 2, the consecutive elements returned are: + * (3,4) (3,5) ... (3,ncol), (4,5) (4,6) ... (4,ncol), etc. + * Array YCORR stores the partial correlations with the Y-variable + * starting with YCORR(IN+1) = partial correlation with the variable in + * position (IN+1). </p> + * + * @param in how many of the regressors to include (either in canonical + * order, or in the current reordered state) + * @return an array with the partial correlations of the remainder of + * regressors with each other and the regressand, in lower triangular form + */ + public double[] getPartialCorrelations(int in) { + final double[] output = new double[(nvars - in + 1) * (nvars - in) / 2]; + int pos; + int pos1; + int pos2; + final int rms_off = -in; + final int wrk_off = -(in + 1); + final double[] rms = new double[nvars - in]; + final double[] work = new double[nvars - in - 1]; + double sumxx; + double sumxy; + double sumyy; + final int offXX = (nvars - in) * (nvars - in - 1) / 2; + if (in < -1 || in >= nvars) { + return null; + } + final int nvm = nvars - 1; + final int base_pos = r.length - (nvm - in) * (nvm - in + 1) / 2; + if (d[in] > 0.0) { + rms[in + rms_off] = 1.0 / FastMath.sqrt(d[in]); + } + for (int col = in + 1; col < nvars; col++) { + pos = base_pos + col - 1 - in; + sumxx = d[col]; + for (int row = in; row < col; row++) { + sumxx += d[row] * r[pos] * r[pos]; + pos += nvars - row - 2; + } + if (sumxx > 0.0) { + rms[col + rms_off] = 1.0 / FastMath.sqrt(sumxx); + } else { + rms[col + rms_off] = 0.0; + } + } + sumyy = sserr; + for (int row = in; row < nvars; row++) { + sumyy += d[row] * rhs[row] * rhs[row]; + } + if (sumyy > 0.0) { + sumyy = 1.0 / FastMath.sqrt(sumyy); + } + pos = 0; + for (int col1 = in; col1 < nvars; col1++) { + sumxy = 0.0; + Arrays.fill(work, 0.0); + pos1 = base_pos + col1 - in - 1; + for (int row = in; row < col1; row++) { + pos2 = pos1 + 1; + for (int col2 = col1 + 1; col2 < nvars; col2++) { + work[col2 + wrk_off] += d[row] * r[pos1] * r[pos2]; + pos2++; + } + sumxy += d[row] * r[pos1] * rhs[row]; + pos1 += nvars - row - 2; + } + pos2 = pos1 + 1; + for (int col2 = col1 + 1; col2 < nvars; col2++) { + work[col2 + wrk_off] += d[col1] * r[pos2]; + ++pos2; + output[ (col2 - 1 - in) * (col2 - in) / 2 + col1 - in] = + work[col2 + wrk_off] * rms[col1 + rms_off] * rms[col2 + rms_off]; + ++pos; + } + sumxy += d[col1] * rhs[col1]; + output[col1 + rms_off + offXX] = sumxy * rms[col1 + rms_off] * sumyy; + } + + return output; + } + + /** + * ALGORITHM AS274 APPL. STATIST. (1992) VOL.41, NO. 2. + * Move variable from position FROM to position TO in an + * orthogonal reduction produced by AS75.1. + * + * @param from initial position + * @param to destination + */ + private void vmove(int from, int to) { + double d1; + double d2; + double X; + double d1new; + double d2new; + double cbar; + double sbar; + double Y; + int first; + int inc; + int m1; + int m2; + int mp1; + int pos; + boolean bSkipTo40 = false; + if (from == to) { + return; + } + if (!this.rss_set) { + ss(); + } + int count = 0; + if (from < to) { + first = from; + inc = 1; + count = to - from; + } else { + first = from - 1; + inc = -1; + count = from - to; + } + + int m = first; + int idx = 0; + while (idx < count) { + m1 = m * (nvars + nvars - m - 1) / 2; + m2 = m1 + nvars - m - 1; + mp1 = m + 1; + + d1 = d[m]; + d2 = d[mp1]; + // Special cases. + if (d1 > this.epsilon || d2 > this.epsilon) { + X = r[m1]; + if (FastMath.abs(X) * FastMath.sqrt(d1) < tol[mp1]) { + X = 0.0; + } + if (d1 < this.epsilon || FastMath.abs(X) < this.epsilon) { + d[m] = d2; + d[mp1] = d1; + r[m1] = 0.0; + for (int col = m + 2; col < nvars; col++) { + ++m1; + X = r[m1]; + r[m1] = r[m2]; + r[m2] = X; + ++m2; + } + X = rhs[m]; + rhs[m] = rhs[mp1]; + rhs[mp1] = X; + bSkipTo40 = true; + //break; + } else if (d2 < this.epsilon) { + d[m] = d1 * X * X; + r[m1] = 1.0 / X; + for (int _i = m1 + 1; _i < m1 + nvars - m - 1; _i++) { + r[_i] /= X; + } + rhs[m] /= X; + bSkipTo40 = true; + //break; + } + if (!bSkipTo40) { + d1new = d2 + d1 * X * X; + cbar = d2 / d1new; + sbar = X * d1 / d1new; + d2new = d1 * cbar; + d[m] = d1new; + d[mp1] = d2new; + r[m1] = sbar; + for (int col = m + 2; col < nvars; col++) { + ++m1; + Y = r[m1]; + r[m1] = cbar * r[m2] + sbar * Y; + r[m2] = Y - X * r[m2]; + ++m2; + } + Y = rhs[m]; + rhs[m] = cbar * rhs[mp1] + sbar * Y; + rhs[mp1] = Y - X * rhs[mp1]; + } + } + if (m > 0) { + pos = m; + for (int row = 0; row < m; row++) { + X = r[pos]; + r[pos] = r[pos - 1]; + r[pos - 1] = X; + pos += nvars - row - 2; + } + } + // Adjust variable order (VORDER), the tolerances (TOL) and + // the vector of residual sums of squares (RSS). + m1 = vorder[m]; + vorder[m] = vorder[mp1]; + vorder[mp1] = m1; + X = tol[m]; + tol[m] = tol[mp1]; + tol[mp1] = X; + rss[m] = rss[mp1] + d[mp1] * rhs[mp1] * rhs[mp1]; + + m += inc; + ++idx; + } + } + + /** + * ALGORITHM AS274 APPL. STATIST. (1992) VOL.41, NO. 2 + * + * <p> Re-order the variables in an orthogonal reduction produced by + * AS75.1 so that the N variables in LIST start at position POS1, + * though will not necessarily be in the same order as in LIST. + * Any variables in VORDER before position POS1 are not moved. + * Auxiliary routine called: VMOVE. </p> + * + * <p>This internal method reorders the regressors.</p> + * + * @param list the regressors to move + * @param pos1 where the list will be placed + * @return -1 error, 0 everything ok + */ + private int reorderRegressors(int[] list, int pos1) { + int next; + int i; + int l; + if (list.length < 1 || list.length > nvars + 1 - pos1) { + return -1; + } + next = pos1; + i = pos1; + while (i < nvars) { + l = vorder[i]; + for (int j = 0; j < list.length; j++) { + if (l == list[j] && i > next) { + this.vmove(i, next); + ++next; + if (next >= list.length + pos1) { + return 0; + } else { + break; + } + } + } + ++i; + } + return 0; + } + + /** + * Gets the diagonal of the Hat matrix also known as the leverage matrix. + * + * @param row_data returns the diagonal of the hat matrix for this observation + * @return the diagonal element of the hatmatrix + */ + public double getDiagonalOfHatMatrix(double[] row_data) { + double[] wk = new double[this.nvars]; + int pos; + double total; + + if (row_data.length > nvars) { + return Double.NaN; + } + double[] xrow; + if (this.hasIntercept) { + xrow = new double[row_data.length + 1]; + xrow[0] = 1.0; + System.arraycopy(row_data, 0, xrow, 1, row_data.length); + } else { + xrow = row_data; + } + double hii = 0.0; + for (int col = 0; col < xrow.length; col++) { + if (FastMath.sqrt(d[col]) < tol[col]) { + wk[col] = 0.0; + } else { + pos = col - 1; + total = xrow[col]; + for (int row = 0; row < col; row++) { + total = smartAdd(total, -wk[row] * r[pos]); + pos += nvars - row - 2; + } + wk[col] = total; + hii = smartAdd(hii, (total * total) / d[col]); + } + } + return hii; + } + + /** + * Gets the order of the regressors, useful if some type of reordering + * has been called. Calling regress with int[]{} args will trigger + * a reordering. + * + * @return int[] with the current order of the regressors + */ + public int[] getOrderOfRegressors(){ + return MathArrays.copyOf(vorder); + } + + /** + * Conducts a regression on the data in the model, using all regressors. + * + * @return RegressionResults the structure holding all regression results + * @exception ModelSpecificationException - thrown if number of observations is + * less than the number of variables + */ + public RegressionResults regress() throws ModelSpecificationException { + return regress(this.nvars); + } + + /** + * Conducts a regression on the data in the model, using a subset of regressors. + * + * @param numberOfRegressors many of the regressors to include (either in canonical + * order, or in the current reordered state) + * @return RegressionResults the structure holding all regression results + * @exception ModelSpecificationException - thrown if number of observations is + * less than the number of variables or number of regressors requested + * is greater than the regressors in the model + */ + public RegressionResults regress(int numberOfRegressors) throws ModelSpecificationException { + if (this.nobs <= numberOfRegressors) { + throw new ModelSpecificationException( + LocalizedFormats.NOT_ENOUGH_DATA_FOR_NUMBER_OF_PREDICTORS, + this.nobs, numberOfRegressors); + } + if( numberOfRegressors > this.nvars ){ + throw new ModelSpecificationException( + LocalizedFormats.TOO_MANY_REGRESSORS, numberOfRegressors, this.nvars); + } + + tolset(); + singcheck(); + + double[] beta = this.regcf(numberOfRegressors); + + ss(); + + double[] cov = this.cov(numberOfRegressors); + + int rnk = 0; + for (int i = 0; i < this.lindep.length; i++) { + if (!this.lindep[i]) { + ++rnk; + } + } + + boolean needsReorder = false; + for (int i = 0; i < numberOfRegressors; i++) { + if (this.vorder[i] != i) { + needsReorder = true; + break; + } + } + if (!needsReorder) { + return new RegressionResults( + beta, new double[][]{cov}, true, this.nobs, rnk, + this.sumy, this.sumsqy, this.sserr, this.hasIntercept, false); + } else { + double[] betaNew = new double[beta.length]; + double[] covNew = new double[cov.length]; + + int[] newIndices = new int[beta.length]; + for (int i = 0; i < nvars; i++) { + for (int j = 0; j < numberOfRegressors; j++) { + if (this.vorder[j] == i) { + betaNew[i] = beta[ j]; + newIndices[i] = j; + } + } + } + + int idx1 = 0; + int idx2; + int _i; + int _j; + for (int i = 0; i < beta.length; i++) { + _i = newIndices[i]; + for (int j = 0; j <= i; j++, idx1++) { + _j = newIndices[j]; + if (_i > _j) { + idx2 = _i * (_i + 1) / 2 + _j; + } else { + idx2 = _j * (_j + 1) / 2 + _i; + } + covNew[idx1] = cov[idx2]; + } + } + return new RegressionResults( + betaNew, new double[][]{covNew}, true, this.nobs, rnk, + this.sumy, this.sumsqy, this.sserr, this.hasIntercept, false); + } + } + + /** + * Conducts a regression on the data in the model, using regressors in array + * Calling this method will change the internal order of the regressors + * and care is required in interpreting the hatmatrix. + * + * @param variablesToInclude array of variables to include in regression + * @return RegressionResults the structure holding all regression results + * @exception ModelSpecificationException - thrown if number of observations is + * less than the number of variables, the number of regressors requested + * is greater than the regressors in the model or a regressor index in + * regressor array does not exist + */ + public RegressionResults regress(int[] variablesToInclude) throws ModelSpecificationException { + if (variablesToInclude.length > this.nvars) { + throw new ModelSpecificationException( + LocalizedFormats.TOO_MANY_REGRESSORS, variablesToInclude.length, this.nvars); + } + if (this.nobs <= this.nvars) { + throw new ModelSpecificationException( + LocalizedFormats.NOT_ENOUGH_DATA_FOR_NUMBER_OF_PREDICTORS, + this.nobs, this.nvars); + } + Arrays.sort(variablesToInclude); + int iExclude = 0; + for (int i = 0; i < variablesToInclude.length; i++) { + if (i >= this.nvars) { + throw new ModelSpecificationException( + LocalizedFormats.INDEX_LARGER_THAN_MAX, i, this.nvars); + } + if (i > 0 && variablesToInclude[i] == variablesToInclude[i - 1]) { + variablesToInclude[i] = -1; + ++iExclude; + } + } + int[] series; + if (iExclude > 0) { + int j = 0; + series = new int[variablesToInclude.length - iExclude]; + for (int i = 0; i < variablesToInclude.length; i++) { + if (variablesToInclude[i] > -1) { + series[j] = variablesToInclude[i]; + ++j; + } + } + } else { + series = variablesToInclude; + } + + reorderRegressors(series, 0); + tolset(); + singcheck(); + + double[] beta = this.regcf(series.length); + + ss(); + + double[] cov = this.cov(series.length); + + int rnk = 0; + for (int i = 0; i < this.lindep.length; i++) { + if (!this.lindep[i]) { + ++rnk; + } + } + + boolean needsReorder = false; + for (int i = 0; i < this.nvars; i++) { + if (this.vorder[i] != series[i]) { + needsReorder = true; + break; + } + } + if (!needsReorder) { + return new RegressionResults( + beta, new double[][]{cov}, true, this.nobs, rnk, + this.sumy, this.sumsqy, this.sserr, this.hasIntercept, false); + } else { + double[] betaNew = new double[beta.length]; + int[] newIndices = new int[beta.length]; + for (int i = 0; i < series.length; i++) { + for (int j = 0; j < this.vorder.length; j++) { + if (this.vorder[j] == series[i]) { + betaNew[i] = beta[ j]; + newIndices[i] = j; + } + } + } + double[] covNew = new double[cov.length]; + int idx1 = 0; + int idx2; + int _i; + int _j; + for (int i = 0; i < beta.length; i++) { + _i = newIndices[i]; + for (int j = 0; j <= i; j++, idx1++) { + _j = newIndices[j]; + if (_i > _j) { + idx2 = _i * (_i + 1) / 2 + _j; + } else { + idx2 = _j * (_j + 1) / 2 + _i; + } + covNew[idx1] = cov[idx2]; + } + } + return new RegressionResults( + betaNew, new double[][]{covNew}, true, this.nobs, rnk, + this.sumy, this.sumsqy, this.sserr, this.hasIntercept, false); + } + } +} diff --git a/src/main/java/org/apache/commons/math3/stat/regression/ModelSpecificationException.java b/src/main/java/org/apache/commons/math3/stat/regression/ModelSpecificationException.java new file mode 100644 index 0000000..f3804db --- /dev/null +++ b/src/main/java/org/apache/commons/math3/stat/regression/ModelSpecificationException.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math3.stat.regression; + +import org.apache.commons.math3.exception.MathIllegalArgumentException; +import org.apache.commons.math3.exception.util.Localizable; + +/** + * Exception thrown when a regression model is not correctly specified. + * + * @since 3.0 + */ +public class ModelSpecificationException extends MathIllegalArgumentException { + /** Serializable version Id. */ + private static final long serialVersionUID = 4206514456095401070L; + + /** + * @param pattern message pattern describing the specification error. + * + * @param args arguments. + */ + public ModelSpecificationException(Localizable pattern, + Object ... args) { + super(pattern, args); + } +} diff --git a/src/main/java/org/apache/commons/math3/stat/regression/MultipleLinearRegression.java b/src/main/java/org/apache/commons/math3/stat/regression/MultipleLinearRegression.java new file mode 100644 index 0000000..866214f --- /dev/null +++ b/src/main/java/org/apache/commons/math3/stat/regression/MultipleLinearRegression.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math3.stat.regression; + +/** + * The multiple linear regression can be represented in matrix-notation. + * <pre> + * y=X*b+u + * </pre> + * where y is an <code>n-vector</code> <b>regressand</b>, X is a <code>[n,k]</code> matrix whose <code>k</code> columns are called + * <b>regressors</b>, b is <code>k-vector</code> of <b>regression parameters</b> and <code>u</code> is an <code>n-vector</code> + * of <b>error terms</b> or <b>residuals</b>. + * + * The notation is quite standard in literature, + * cf eg <a href="http://www.econ.queensu.ca/ETM">Davidson and MacKinnon, Econometrics Theory and Methods, 2004</a>. + * @since 2.0 + */ +public interface MultipleLinearRegression { + + /** + * Estimates the regression parameters b. + * + * @return The [k,1] array representing b + */ + double[] estimateRegressionParameters(); + + /** + * Estimates the variance of the regression parameters, ie Var(b). + * + * @return The [k,k] array representing the variance of b + */ + double[][] estimateRegressionParametersVariance(); + + /** + * Estimates the residuals, ie u = y - X*b. + * + * @return The [n,1] array representing the residuals + */ + double[] estimateResiduals(); + + /** + * Returns the variance of the regressand, ie Var(y). + * + * @return The double representing the variance of y + */ + double estimateRegressandVariance(); + + /** + * Returns the standard errors of the regression parameters. + * + * @return standard errors of estimated regression parameters + */ + double[] estimateRegressionParametersStandardErrors(); + +} diff --git a/src/main/java/org/apache/commons/math3/stat/regression/OLSMultipleLinearRegression.java b/src/main/java/org/apache/commons/math3/stat/regression/OLSMultipleLinearRegression.java new file mode 100644 index 0000000..7fff940 --- /dev/null +++ b/src/main/java/org/apache/commons/math3/stat/regression/OLSMultipleLinearRegression.java @@ -0,0 +1,285 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math3.stat.regression; + +import org.apache.commons.math3.exception.MathIllegalArgumentException; +import org.apache.commons.math3.linear.Array2DRowRealMatrix; +import org.apache.commons.math3.linear.LUDecomposition; +import org.apache.commons.math3.linear.QRDecomposition; +import org.apache.commons.math3.linear.RealMatrix; +import org.apache.commons.math3.linear.RealVector; +import org.apache.commons.math3.stat.StatUtils; +import org.apache.commons.math3.stat.descriptive.moment.SecondMoment; + +/** + * <p>Implements ordinary least squares (OLS) to estimate the parameters of a + * multiple linear regression model.</p> + * + * <p>The regression coefficients, <code>b</code>, satisfy the normal equations: + * <pre><code> X<sup>T</sup> X b = X<sup>T</sup> y </code></pre></p> + * + * <p>To solve the normal equations, this implementation uses QR decomposition + * of the <code>X</code> matrix. (See {@link QRDecomposition} for details on the + * decomposition algorithm.) The <code>X</code> matrix, also known as the <i>design matrix,</i> + * has rows corresponding to sample observations and columns corresponding to independent + * variables. When the model is estimated using an intercept term (i.e. when + * {@link #isNoIntercept() isNoIntercept} is false as it is by default), the <code>X</code> + * matrix includes an initial column identically equal to 1. We solve the normal equations + * as follows: + * <pre><code> X<sup>T</sup>X b = X<sup>T</sup> y + * (QR)<sup>T</sup> (QR) b = (QR)<sup>T</sup>y + * R<sup>T</sup> (Q<sup>T</sup>Q) R b = R<sup>T</sup> Q<sup>T</sup> y + * R<sup>T</sup> R b = R<sup>T</sup> Q<sup>T</sup> y + * (R<sup>T</sup>)<sup>-1</sup> R<sup>T</sup> R b = (R<sup>T</sup>)<sup>-1</sup> R<sup>T</sup> Q<sup>T</sup> y + * R b = Q<sup>T</sup> y </code></pre></p> + * + * <p>Given <code>Q</code> and <code>R</code>, the last equation is solved by back-substitution.</p> + * + * @since 2.0 + */ +public class OLSMultipleLinearRegression extends AbstractMultipleLinearRegression { + + /** Cached QR decomposition of X matrix */ + private QRDecomposition qr = null; + + /** Singularity threshold for QR decomposition */ + private final double threshold; + + /** + * Create an empty OLSMultipleLinearRegression instance. + */ + public OLSMultipleLinearRegression() { + this(0d); + } + + /** + * Create an empty OLSMultipleLinearRegression instance, using the given + * singularity threshold for the QR decomposition. + * + * @param threshold the singularity threshold + * @since 3.3 + */ + public OLSMultipleLinearRegression(final double threshold) { + this.threshold = threshold; + } + + /** + * Loads model x and y sample data, overriding any previous sample. + * + * Computes and caches QR decomposition of the X matrix. + * @param y the [n,1] array representing the y sample + * @param x the [n,k] array representing the x sample + * @throws MathIllegalArgumentException if the x and y array data are not + * compatible for the regression + */ + public void newSampleData(double[] y, double[][] x) throws MathIllegalArgumentException { + validateSampleData(x, y); + newYSampleData(y); + newXSampleData(x); + } + + /** + * {@inheritDoc} + * <p>This implementation computes and caches the QR decomposition of the X matrix.</p> + */ + @Override + public void newSampleData(double[] data, int nobs, int nvars) { + super.newSampleData(data, nobs, nvars); + qr = new QRDecomposition(getX(), threshold); + } + + /** + * <p>Compute the "hat" matrix. + * </p> + * <p>The hat matrix is defined in terms of the design matrix X + * by X(X<sup>T</sup>X)<sup>-1</sup>X<sup>T</sup> + * </p> + * <p>The implementation here uses the QR decomposition to compute the + * hat matrix as Q I<sub>p</sub>Q<sup>T</sup> where I<sub>p</sub> is the + * p-dimensional identity matrix augmented by 0's. This computational + * formula is from "The Hat Matrix in Regression and ANOVA", + * David C. Hoaglin and Roy E. Welsch, + * <i>The American Statistician</i>, Vol. 32, No. 1 (Feb., 1978), pp. 17-22. + * </p> + * <p>Data for the model must have been successfully loaded using one of + * the {@code newSampleData} methods before invoking this method; otherwise + * a {@code NullPointerException} will be thrown.</p> + * + * @return the hat matrix + * @throws NullPointerException unless method {@code newSampleData} has been + * called beforehand. + */ + public RealMatrix calculateHat() { + // Create augmented identity matrix + RealMatrix Q = qr.getQ(); + final int p = qr.getR().getColumnDimension(); + final int n = Q.getColumnDimension(); + // No try-catch or advertised NotStrictlyPositiveException - NPE above if n < 3 + Array2DRowRealMatrix augI = new Array2DRowRealMatrix(n, n); + double[][] augIData = augI.getDataRef(); + for (int i = 0; i < n; i++) { + for (int j =0; j < n; j++) { + if (i == j && i < p) { + augIData[i][j] = 1d; + } else { + augIData[i][j] = 0d; + } + } + } + + // Compute and return Hat matrix + // No DME advertised - args valid if we get here + return Q.multiply(augI).multiply(Q.transpose()); + } + + /** + * <p>Returns the sum of squared deviations of Y from its mean.</p> + * + * <p>If the model has no intercept term, <code>0</code> is used for the + * mean of Y - i.e., what is returned is the sum of the squared Y values.</p> + * + * <p>The value returned by this method is the SSTO value used in + * the {@link #calculateRSquared() R-squared} computation.</p> + * + * @return SSTO - the total sum of squares + * @throws NullPointerException if the sample has not been set + * @see #isNoIntercept() + * @since 2.2 + */ + public double calculateTotalSumOfSquares() { + if (isNoIntercept()) { + return StatUtils.sumSq(getY().toArray()); + } else { + return new SecondMoment().evaluate(getY().toArray()); + } + } + + /** + * Returns the sum of squared residuals. + * + * @return residual sum of squares + * @since 2.2 + * @throws org.apache.commons.math3.linear.SingularMatrixException if the design matrix is singular + * @throws NullPointerException if the data for the model have not been loaded + */ + public double calculateResidualSumOfSquares() { + final RealVector residuals = calculateResiduals(); + // No advertised DME, args are valid + return residuals.dotProduct(residuals); + } + + /** + * Returns the R-Squared statistic, defined by the formula <pre> + * R<sup>2</sup> = 1 - SSR / SSTO + * </pre> + * where SSR is the {@link #calculateResidualSumOfSquares() sum of squared residuals} + * and SSTO is the {@link #calculateTotalSumOfSquares() total sum of squares} + * + * <p>If there is no variance in y, i.e., SSTO = 0, NaN is returned.</p> + * + * @return R-square statistic + * @throws NullPointerException if the sample has not been set + * @throws org.apache.commons.math3.linear.SingularMatrixException if the design matrix is singular + * @since 2.2 + */ + public double calculateRSquared() { + return 1 - calculateResidualSumOfSquares() / calculateTotalSumOfSquares(); + } + + /** + * <p>Returns the adjusted R-squared statistic, defined by the formula <pre> + * R<sup>2</sup><sub>adj</sub> = 1 - [SSR (n - 1)] / [SSTO (n - p)] + * </pre> + * where SSR is the {@link #calculateResidualSumOfSquares() sum of squared residuals}, + * SSTO is the {@link #calculateTotalSumOfSquares() total sum of squares}, n is the number + * of observations and p is the number of parameters estimated (including the intercept).</p> + * + * <p>If the regression is estimated without an intercept term, what is returned is <pre> + * <code> 1 - (1 - {@link #calculateRSquared()}) * (n / (n - p)) </code> + * </pre></p> + * + * <p>If there is no variance in y, i.e., SSTO = 0, NaN is returned.</p> + * + * @return adjusted R-Squared statistic + * @throws NullPointerException if the sample has not been set + * @throws org.apache.commons.math3.linear.SingularMatrixException if the design matrix is singular + * @see #isNoIntercept() + * @since 2.2 + */ + public double calculateAdjustedRSquared() { + final double n = getX().getRowDimension(); + if (isNoIntercept()) { + return 1 - (1 - calculateRSquared()) * (n / (n - getX().getColumnDimension())); + } else { + return 1 - (calculateResidualSumOfSquares() * (n - 1)) / + (calculateTotalSumOfSquares() * (n - getX().getColumnDimension())); + } + } + + /** + * {@inheritDoc} + * <p>This implementation computes and caches the QR decomposition of the X matrix + * once it is successfully loaded.</p> + */ + @Override + protected void newXSampleData(double[][] x) { + super.newXSampleData(x); + qr = new QRDecomposition(getX(), threshold); + } + + /** + * Calculates the regression coefficients using OLS. + * + * <p>Data for the model must have been successfully loaded using one of + * the {@code newSampleData} methods before invoking this method; otherwise + * a {@code NullPointerException} will be thrown.</p> + * + * @return beta + * @throws org.apache.commons.math3.linear.SingularMatrixException if the design matrix is singular + * @throws NullPointerException if the data for the model have not been loaded + */ + @Override + protected RealVector calculateBeta() { + return qr.getSolver().solve(getY()); + } + + /** + * <p>Calculates the variance-covariance matrix of the regression parameters. + * </p> + * <p>Var(b) = (X<sup>T</sup>X)<sup>-1</sup> + * </p> + * <p>Uses QR decomposition to reduce (X<sup>T</sup>X)<sup>-1</sup> + * to (R<sup>T</sup>R)<sup>-1</sup>, with only the top p rows of + * R included, where p = the length of the beta vector.</p> + * + * <p>Data for the model must have been successfully loaded using one of + * the {@code newSampleData} methods before invoking this method; otherwise + * a {@code NullPointerException} will be thrown.</p> + * + * @return The beta variance-covariance matrix + * @throws org.apache.commons.math3.linear.SingularMatrixException if the design matrix is singular + * @throws NullPointerException if the data for the model have not been loaded + */ + @Override + protected RealMatrix calculateBetaVariance() { + int p = getX().getColumnDimension(); + RealMatrix Raug = qr.getR().getSubMatrix(0, p - 1 , 0, p - 1); + RealMatrix Rinv = new LUDecomposition(Raug).getSolver().getInverse(); + return Rinv.multiply(Rinv.transpose()); + } + +} diff --git a/src/main/java/org/apache/commons/math3/stat/regression/RegressionResults.java b/src/main/java/org/apache/commons/math3/stat/regression/RegressionResults.java new file mode 100644 index 0000000..70faeac --- /dev/null +++ b/src/main/java/org/apache/commons/math3/stat/regression/RegressionResults.java @@ -0,0 +1,421 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math3.stat.regression; + +import java.io.Serializable; +import java.util.Arrays; +import org.apache.commons.math3.util.FastMath; +import org.apache.commons.math3.util.MathArrays; +import org.apache.commons.math3.exception.OutOfRangeException; + +/** + * Results of a Multiple Linear Regression model fit. + * + * @since 3.0 + */ +public class RegressionResults implements Serializable { + + /** INDEX of Sum of Squared Errors */ + private static final int SSE_IDX = 0; + /** INDEX of Sum of Squares of Model */ + private static final int SST_IDX = 1; + /** INDEX of R-Squared of regression */ + private static final int RSQ_IDX = 2; + /** INDEX of Mean Squared Error */ + private static final int MSE_IDX = 3; + /** INDEX of Adjusted R Squared */ + private static final int ADJRSQ_IDX = 4; + /** UID */ + private static final long serialVersionUID = 1l; + /** regression slope parameters */ + private final double[] parameters; + /** variance covariance matrix of parameters */ + private final double[][] varCovData; + /** boolean flag for variance covariance matrix in symm compressed storage */ + private final boolean isSymmetricVCD; + /** rank of the solution */ + @SuppressWarnings("unused") + private final int rank; + /** number of observations on which results are based */ + private final long nobs; + /** boolean flag indicator of whether a constant was included*/ + private final boolean containsConstant; + /** array storing global results, SSE, MSE, RSQ, adjRSQ */ + private final double[] globalFitInfo; + + /** + * Set the default constructor to private access + * to prevent inadvertent instantiation + */ + @SuppressWarnings("unused") + private RegressionResults() { + this.parameters = null; + this.varCovData = null; + this.rank = -1; + this.nobs = -1; + this.containsConstant = false; + this.isSymmetricVCD = false; + this.globalFitInfo = null; + } + + /** + * Constructor for Regression Results. + * + * @param parameters a double array with the regression slope estimates + * @param varcov the variance covariance matrix, stored either in a square matrix + * or as a compressed + * @param isSymmetricCompressed a flag which denotes that the variance covariance + * matrix is in symmetric compressed format + * @param nobs the number of observations of the regression estimation + * @param rank the number of independent variables in the regression + * @param sumy the sum of the independent variable + * @param sumysq the sum of the squared independent variable + * @param sse sum of squared errors + * @param containsConstant true model has constant, false model does not have constant + * @param copyData if true a deep copy of all input data is made, if false only references + * are copied and the RegressionResults become mutable + */ + public RegressionResults( + final double[] parameters, final double[][] varcov, + final boolean isSymmetricCompressed, + final long nobs, final int rank, + final double sumy, final double sumysq, final double sse, + final boolean containsConstant, + final boolean copyData) { + if (copyData) { + this.parameters = MathArrays.copyOf(parameters); + this.varCovData = new double[varcov.length][]; + for (int i = 0; i < varcov.length; i++) { + this.varCovData[i] = MathArrays.copyOf(varcov[i]); + } + } else { + this.parameters = parameters; + this.varCovData = varcov; + } + this.isSymmetricVCD = isSymmetricCompressed; + this.nobs = nobs; + this.rank = rank; + this.containsConstant = containsConstant; + this.globalFitInfo = new double[5]; + Arrays.fill(this.globalFitInfo, Double.NaN); + + if (rank > 0) { + this.globalFitInfo[SST_IDX] = containsConstant ? + (sumysq - sumy * sumy / nobs) : sumysq; + } + + this.globalFitInfo[SSE_IDX] = sse; + this.globalFitInfo[MSE_IDX] = this.globalFitInfo[SSE_IDX] / + (nobs - rank); + this.globalFitInfo[RSQ_IDX] = 1.0 - + this.globalFitInfo[SSE_IDX] / + this.globalFitInfo[SST_IDX]; + + if (!containsConstant) { + this.globalFitInfo[ADJRSQ_IDX] = 1.0- + (1.0 - this.globalFitInfo[RSQ_IDX]) * + ( (double) nobs / ( (double) (nobs - rank))); + } else { + this.globalFitInfo[ADJRSQ_IDX] = 1.0 - (sse * (nobs - 1.0)) / + (globalFitInfo[SST_IDX] * (nobs - rank)); + } + } + + /** + * <p>Returns the parameter estimate for the regressor at the given index.</p> + * + * <p>A redundant regressor will have its redundancy flag set, as well as + * a parameters estimated equal to {@code Double.NaN}</p> + * + * @param index Index. + * @return the parameters estimated for regressor at index. + * @throws OutOfRangeException if {@code index} is not in the interval + * {@code [0, number of parameters)}. + */ + public double getParameterEstimate(int index) throws OutOfRangeException { + if (parameters == null) { + return Double.NaN; + } + if (index < 0 || index >= this.parameters.length) { + throw new OutOfRangeException(index, 0, this.parameters.length - 1); + } + return this.parameters[index]; + } + + /** + * <p>Returns a copy of the regression parameters estimates.</p> + * + * <p>The parameter estimates are returned in the natural order of the data.</p> + * + * <p>A redundant regressor will have its redundancy flag set, as will + * a parameter estimate equal to {@code Double.NaN}.</p> + * + * @return array of parameter estimates, null if no estimation occurred + */ + public double[] getParameterEstimates() { + if (this.parameters == null) { + return null; + } + return MathArrays.copyOf(parameters); + } + + /** + * Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard + * error of the parameter estimate at index</a>, + * usually denoted s(b<sub>index</sub>). + * + * @param index Index. + * @return the standard errors associated with parameters estimated at index. + * @throws OutOfRangeException if {@code index} is not in the interval + * {@code [0, number of parameters)}. + */ + public double getStdErrorOfEstimate(int index) throws OutOfRangeException { + if (parameters == null) { + return Double.NaN; + } + if (index < 0 || index >= this.parameters.length) { + throw new OutOfRangeException(index, 0, this.parameters.length - 1); + } + double var = this.getVcvElement(index, index); + if (!Double.isNaN(var) && var > Double.MIN_VALUE) { + return FastMath.sqrt(var); + } + return Double.NaN; + } + + /** + * <p>Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard + * error of the parameter estimates</a>, + * usually denoted s(b<sub>i</sub>).</p> + * + * <p>If there are problems with an ill conditioned design matrix then the regressor + * which is redundant will be assigned <code>Double.NaN</code>. </p> + * + * @return an array standard errors associated with parameters estimates, + * null if no estimation occurred + */ + public double[] getStdErrorOfEstimates() { + if (parameters == null) { + return null; + } + double[] se = new double[this.parameters.length]; + for (int i = 0; i < this.parameters.length; i++) { + double var = this.getVcvElement(i, i); + if (!Double.isNaN(var) && var > Double.MIN_VALUE) { + se[i] = FastMath.sqrt(var); + continue; + } + se[i] = Double.NaN; + } + return se; + } + + /** + * <p>Returns the covariance between regression parameters i and j.</p> + * + * <p>If there are problems with an ill conditioned design matrix then the covariance + * which involves redundant columns will be assigned {@code Double.NaN}. </p> + * + * @param i {@code i}th regression parameter. + * @param j {@code j}th regression parameter. + * @return the covariance of the parameter estimates. + * @throws OutOfRangeException if {@code i} or {@code j} is not in the + * interval {@code [0, number of parameters)}. + */ + public double getCovarianceOfParameters(int i, int j) throws OutOfRangeException { + if (parameters == null) { + return Double.NaN; + } + if (i < 0 || i >= this.parameters.length) { + throw new OutOfRangeException(i, 0, this.parameters.length - 1); + } + if (j < 0 || j >= this.parameters.length) { + throw new OutOfRangeException(j, 0, this.parameters.length - 1); + } + return this.getVcvElement(i, j); + } + + /** + * <p>Returns the number of parameters estimated in the model.</p> + * + * <p>This is the maximum number of regressors, some techniques may drop + * redundant parameters</p> + * + * @return number of regressors, -1 if not estimated + */ + public int getNumberOfParameters() { + if (this.parameters == null) { + return -1; + } + return this.parameters.length; + } + + /** + * Returns the number of observations added to the regression model. + * + * @return Number of observations, -1 if an error condition prevents estimation + */ + public long getN() { + return this.nobs; + } + + /** + * <p>Returns the sum of squared deviations of the y values about their mean.</p> + * + * <p>This is defined as SSTO + * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.</p> + * + * <p>If {@code n < 2}, this returns {@code Double.NaN}.</p> + * + * @return sum of squared deviations of y values + */ + public double getTotalSumSquares() { + return this.globalFitInfo[SST_IDX]; + } + + /** + * <p>Returns the sum of squared deviations of the predicted y values about + * their mean (which equals the mean of y).</p> + * + * <p>This is usually abbreviated SSR or SSM. It is defined as SSM + * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a></p> + * + * <p><strong>Preconditions</strong>: <ul> + * <li>At least two observations (with at least two different x values) + * must have been added before invoking this method. If this method is + * invoked before a model can be estimated, <code>Double.NaN</code> is + * returned. + * </li></ul></p> + * + * @return sum of squared deviations of predicted y values + */ + public double getRegressionSumSquares() { + return this.globalFitInfo[SST_IDX] - this.globalFitInfo[SSE_IDX]; + } + + /** + * <p>Returns the <a href="http://www.xycoon.com/SumOfSquares.htm"> + * sum of squared errors</a> (SSE) associated with the regression + * model.</p> + * + * <p>The return value is constrained to be non-negative - i.e., if due to + * rounding errors the computational formula returns a negative result, + * 0 is returned.</p> + * + * <p><strong>Preconditions</strong>: <ul> + * <li>numberOfParameters data pairs + * must have been added before invoking this method. If this method is + * invoked before a model can be estimated, <code>Double,NaN</code> is + * returned. + * </li></ul></p> + * + * @return sum of squared errors associated with the regression model + */ + public double getErrorSumSquares() { + return this.globalFitInfo[ SSE_IDX]; + } + + /** + * <p>Returns the sum of squared errors divided by the degrees of freedom, + * usually abbreviated MSE.</p> + * + * <p>If there are fewer than <strong>numberOfParameters + 1</strong> data pairs in the model, + * or if there is no variation in <code>x</code>, this returns + * <code>Double.NaN</code>.</p> + * + * @return sum of squared deviations of y values + */ + public double getMeanSquareError() { + return this.globalFitInfo[ MSE_IDX]; + } + + /** + * <p>Returns the <a href="http://www.xycoon.com/coefficient1.htm"> + * coefficient of multiple determination</a>, + * usually denoted r-square.</p> + * + * <p><strong>Preconditions</strong>: <ul> + * <li>At least numberOfParameters observations (with at least numberOfParameters different x values) + * must have been added before invoking this method. If this method is + * invoked before a model can be estimated, {@code Double,NaN} is + * returned. + * </li></ul></p> + * + * @return r-square, a double in the interval [0, 1] + */ + public double getRSquared() { + return this.globalFitInfo[ RSQ_IDX]; + } + + /** + * <p>Returns the adjusted R-squared statistic, defined by the formula <pre> + * R<sup>2</sup><sub>adj</sub> = 1 - [SSR (n - 1)] / [SSTO (n - p)] + * </pre> + * where SSR is the sum of squared residuals}, + * SSTO is the total sum of squares}, n is the number + * of observations and p is the number of parameters estimated (including the intercept).</p> + * + * <p>If the regression is estimated without an intercept term, what is returned is <pre> + * <code> 1 - (1 - {@link #getRSquared()} ) * (n / (n - p)) </code> + * </pre></p> + * + * @return adjusted R-Squared statistic + */ + public double getAdjustedRSquared() { + return this.globalFitInfo[ ADJRSQ_IDX]; + } + + /** + * Returns true if the regression model has been computed including an intercept. + * In this case, the coefficient of the intercept is the first element of the + * {@link #getParameterEstimates() parameter estimates}. + * @return true if the model has an intercept term + */ + public boolean hasIntercept() { + return this.containsConstant; + } + + /** + * Gets the i-jth element of the variance-covariance matrix. + * + * @param i first variable index + * @param j second variable index + * @return the requested variance-covariance matrix entry + */ + private double getVcvElement(int i, int j) { + if (this.isSymmetricVCD) { + if (this.varCovData.length > 1) { + //could be stored in upper or lower triangular + if (i == j) { + return varCovData[i][i]; + } else if (i >= varCovData[j].length) { + return varCovData[i][j]; + } else { + return varCovData[j][i]; + } + } else {//could be in single array + if (i > j) { + return varCovData[0][(i + 1) * i / 2 + j]; + } else { + return varCovData[0][(j + 1) * j / 2 + i]; + } + } + } else { + return this.varCovData[i][j]; + } + } +} diff --git a/src/main/java/org/apache/commons/math3/stat/regression/SimpleRegression.java b/src/main/java/org/apache/commons/math3/stat/regression/SimpleRegression.java new file mode 100644 index 0000000..02bf8f4 --- /dev/null +++ b/src/main/java/org/apache/commons/math3/stat/regression/SimpleRegression.java @@ -0,0 +1,881 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.math3.stat.regression; +import java.io.Serializable; + +import org.apache.commons.math3.distribution.TDistribution; +import org.apache.commons.math3.exception.MathIllegalArgumentException; +import org.apache.commons.math3.exception.NoDataException; +import org.apache.commons.math3.exception.OutOfRangeException; +import org.apache.commons.math3.exception.util.LocalizedFormats; +import org.apache.commons.math3.util.FastMath; +import org.apache.commons.math3.util.Precision; + +/** + * Estimates an ordinary least squares regression model + * with one independent variable. + * <p> + * <code> y = intercept + slope * x </code></p> + * <p> + * Standard errors for <code>intercept</code> and <code>slope</code> are + * available as well as ANOVA, r-square and Pearson's r statistics.</p> + * <p> + * Observations (x,y pairs) can be added to the model one at a time or they + * can be provided in a 2-dimensional array. The observations are not stored + * in memory, so there is no limit to the number of observations that can be + * added to the model.</p> + * <p> + * <strong>Usage Notes</strong>: <ul> + * <li> When there are fewer than two observations in the model, or when + * there is no variation in the x values (i.e. all x values are the same) + * all statistics return <code>NaN</code>. At least two observations with + * different x coordinates are required to estimate a bivariate regression + * model. + * </li> + * <li> Getters for the statistics always compute values based on the current + * set of observations -- i.e., you can get statistics, then add more data + * and get updated statistics without using a new instance. There is no + * "compute" method that updates all statistics. Each of the getters performs + * the necessary computations to return the requested statistic. + * </li> + * <li> The intercept term may be suppressed by passing {@code false} to + * the {@link #SimpleRegression(boolean)} constructor. When the + * {@code hasIntercept} property is false, the model is estimated without a + * constant term and {@link #getIntercept()} returns {@code 0}.</li> + * </ul></p> + * + */ +public class SimpleRegression implements Serializable, UpdatingMultipleLinearRegression { + + /** Serializable version identifier */ + private static final long serialVersionUID = -3004689053607543335L; + + /** sum of x values */ + private double sumX = 0d; + + /** total variation in x (sum of squared deviations from xbar) */ + private double sumXX = 0d; + + /** sum of y values */ + private double sumY = 0d; + + /** total variation in y (sum of squared deviations from ybar) */ + private double sumYY = 0d; + + /** sum of products */ + private double sumXY = 0d; + + /** number of observations */ + private long n = 0; + + /** mean of accumulated x values, used in updating formulas */ + private double xbar = 0; + + /** mean of accumulated y values, used in updating formulas */ + private double ybar = 0; + + /** include an intercept or not */ + private final boolean hasIntercept; + // ---------------------Public methods-------------------------------------- + + /** + * Create an empty SimpleRegression instance + */ + public SimpleRegression() { + this(true); + } + /** + * Create a SimpleRegression instance, specifying whether or not to estimate + * an intercept. + * + * <p>Use {@code false} to estimate a model with no intercept. When the + * {@code hasIntercept} property is false, the model is estimated without a + * constant term and {@link #getIntercept()} returns {@code 0}.</p> + * + * @param includeIntercept whether or not to include an intercept term in + * the regression model + */ + public SimpleRegression(boolean includeIntercept) { + super(); + hasIntercept = includeIntercept; + } + + /** + * Adds the observation (x,y) to the regression data set. + * <p> + * Uses updating formulas for means and sums of squares defined in + * "Algorithms for Computing the Sample Variance: Analysis and + * Recommendations", Chan, T.F., Golub, G.H., and LeVeque, R.J. + * 1983, American Statistician, vol. 37, pp. 242-247, referenced in + * Weisberg, S. "Applied Linear Regression". 2nd Ed. 1985.</p> + * + * + * @param x independent variable value + * @param y dependent variable value + */ + public void addData(final double x,final double y) { + if (n == 0) { + xbar = x; + ybar = y; + } else { + if( hasIntercept ){ + final double fact1 = 1.0 + n; + final double fact2 = n / (1.0 + n); + final double dx = x - xbar; + final double dy = y - ybar; + sumXX += dx * dx * fact2; + sumYY += dy * dy * fact2; + sumXY += dx * dy * fact2; + xbar += dx / fact1; + ybar += dy / fact1; + } + } + if( !hasIntercept ){ + sumXX += x * x ; + sumYY += y * y ; + sumXY += x * y ; + } + sumX += x; + sumY += y; + n++; + } + + /** + * Appends data from another regression calculation to this one. + * + * <p>The mean update formulae are based on a paper written by Philippe + * Pébay: + * <a + * href="http://prod.sandia.gov/techlib/access-control.cgi/2008/086212.pdf"> + * Formulas for Robust, One-Pass Parallel Computation of Covariances and + * Arbitrary-Order Statistical Moments</a>, 2008, Technical Report + * SAND2008-6212, Sandia National Laboratories.</p> + * + * @param reg model to append data from + * @since 3.3 + */ + public void append(SimpleRegression reg) { + if (n == 0) { + xbar = reg.xbar; + ybar = reg.ybar; + sumXX = reg.sumXX; + sumYY = reg.sumYY; + sumXY = reg.sumXY; + } else { + if (hasIntercept) { + final double fact1 = reg.n / (double) (reg.n + n); + final double fact2 = n * reg.n / (double) (reg.n + n); + final double dx = reg.xbar - xbar; + final double dy = reg.ybar - ybar; + sumXX += reg.sumXX + dx * dx * fact2; + sumYY += reg.sumYY + dy * dy * fact2; + sumXY += reg.sumXY + dx * dy * fact2; + xbar += dx * fact1; + ybar += dy * fact1; + }else{ + sumXX += reg.sumXX; + sumYY += reg.sumYY; + sumXY += reg.sumXY; + } + } + sumX += reg.sumX; + sumY += reg.sumY; + n += reg.n; + } + + /** + * Removes the observation (x,y) from the regression data set. + * <p> + * Mirrors the addData method. This method permits the use of + * SimpleRegression instances in streaming mode where the regression + * is applied to a sliding "window" of observations, however the caller is + * responsible for maintaining the set of observations in the window.</p> + * + * The method has no effect if there are no points of data (i.e. n=0) + * + * @param x independent variable value + * @param y dependent variable value + */ + public void removeData(final double x,final double y) { + if (n > 0) { + if (hasIntercept) { + final double fact1 = n - 1.0; + final double fact2 = n / (n - 1.0); + final double dx = x - xbar; + final double dy = y - ybar; + sumXX -= dx * dx * fact2; + sumYY -= dy * dy * fact2; + sumXY -= dx * dy * fact2; + xbar -= dx / fact1; + ybar -= dy / fact1; + } else { + final double fact1 = n - 1.0; + sumXX -= x * x; + sumYY -= y * y; + sumXY -= x * y; + xbar -= x / fact1; + ybar -= y / fact1; + } + sumX -= x; + sumY -= y; + n--; + } + } + + /** + * Adds the observations represented by the elements in + * <code>data</code>. + * <p> + * <code>(data[0][0],data[0][1])</code> will be the first observation, then + * <code>(data[1][0],data[1][1])</code>, etc.</p> + * <p> + * This method does not replace data that has already been added. The + * observations represented by <code>data</code> are added to the existing + * dataset.</p> + * <p> + * To replace all data, use <code>clear()</code> before adding the new + * data.</p> + * + * @param data array of observations to be added + * @throws ModelSpecificationException if the length of {@code data[i]} is not + * greater than or equal to 2 + */ + public void addData(final double[][] data) throws ModelSpecificationException { + for (int i = 0; i < data.length; i++) { + if( data[i].length < 2 ){ + throw new ModelSpecificationException(LocalizedFormats.INVALID_REGRESSION_OBSERVATION, + data[i].length, 2); + } + addData(data[i][0], data[i][1]); + } + } + + /** + * Adds one observation to the regression model. + * + * @param x the independent variables which form the design matrix + * @param y the dependent or response variable + * @throws ModelSpecificationException if the length of {@code x} does not equal + * the number of independent variables in the model + */ + public void addObservation(final double[] x,final double y) + throws ModelSpecificationException { + if( x == null || x.length == 0 ){ + throw new ModelSpecificationException(LocalizedFormats.INVALID_REGRESSION_OBSERVATION,x!=null?x.length:0, 1); + } + addData( x[0], y ); + } + + /** + * Adds a series of observations to the regression model. The lengths of + * x and y must be the same and x must be rectangular. + * + * @param x a series of observations on the independent variables + * @param y a series of observations on the dependent variable + * The length of x and y must be the same + * @throws ModelSpecificationException if {@code x} is not rectangular, does not match + * the length of {@code y} or does not contain sufficient data to estimate the model + */ + public void addObservations(final double[][] x,final double[] y) throws ModelSpecificationException { + if ((x == null) || (y == null) || (x.length != y.length)) { + throw new ModelSpecificationException( + LocalizedFormats.DIMENSIONS_MISMATCH_SIMPLE, + (x == null) ? 0 : x.length, + (y == null) ? 0 : y.length); + } + boolean obsOk=true; + for( int i = 0 ; i < x.length; i++){ + if( x[i] == null || x[i].length == 0 ){ + obsOk = false; + } + } + if( !obsOk ){ + throw new ModelSpecificationException( + LocalizedFormats.NOT_ENOUGH_DATA_FOR_NUMBER_OF_PREDICTORS, + 0, 1); + } + for( int i = 0 ; i < x.length ; i++){ + addData( x[i][0], y[i] ); + } + } + + /** + * Removes observations represented by the elements in <code>data</code>. + * <p> + * If the array is larger than the current n, only the first n elements are + * processed. This method permits the use of SimpleRegression instances in + * streaming mode where the regression is applied to a sliding "window" of + * observations, however the caller is responsible for maintaining the set + * of observations in the window.</p> + * <p> + * To remove all data, use <code>clear()</code>.</p> + * + * @param data array of observations to be removed + */ + public void removeData(double[][] data) { + for (int i = 0; i < data.length && n > 0; i++) { + removeData(data[i][0], data[i][1]); + } + } + + /** + * Clears all data from the model. + */ + public void clear() { + sumX = 0d; + sumXX = 0d; + sumY = 0d; + sumYY = 0d; + sumXY = 0d; + n = 0; + } + + /** + * Returns the number of observations that have been added to the model. + * + * @return n number of observations that have been added. + */ + public long getN() { + return n; + } + + /** + * Returns the "predicted" <code>y</code> value associated with the + * supplied <code>x</code> value, based on the data that has been + * added to the model when this method is activated. + * <p> + * <code> predict(x) = intercept + slope * x </code></p> + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>At least two observations (with at least two different x values) + * must have been added before invoking this method. If this method is + * invoked before a model can be estimated, <code>Double,NaN</code> is + * returned. + * </li></ul></p> + * + * @param x input <code>x</code> value + * @return predicted <code>y</code> value + */ + public double predict(final double x) { + final double b1 = getSlope(); + if (hasIntercept) { + return getIntercept(b1) + b1 * x; + } + return b1 * x; + } + + /** + * Returns the intercept of the estimated regression line, if + * {@link #hasIntercept()} is true; otherwise 0. + * <p> + * The least squares estimate of the intercept is computed using the + * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>. + * The intercept is sometimes denoted b0.</p> + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>At least two observations (with at least two different x values) + * must have been added before invoking this method. If this method is + * invoked before a model can be estimated, <code>Double,NaN</code> is + * returned. + * </li></ul></p> + * + * @return the intercept of the regression line if the model includes an + * intercept; 0 otherwise + * @see #SimpleRegression(boolean) + */ + public double getIntercept() { + return hasIntercept ? getIntercept(getSlope()) : 0.0; + } + + /** + * Returns true if the model includes an intercept term. + * + * @return true if the regression includes an intercept; false otherwise + * @see #SimpleRegression(boolean) + */ + public boolean hasIntercept() { + return hasIntercept; + } + + /** + * Returns the slope of the estimated regression line. + * <p> + * The least squares estimate of the slope is computed using the + * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>. + * The slope is sometimes denoted b1.</p> + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>At least two observations (with at least two different x values) + * must have been added before invoking this method. If this method is + * invoked before a model can be estimated, <code>Double.NaN</code> is + * returned. + * </li></ul></p> + * + * @return the slope of the regression line + */ + public double getSlope() { + if (n < 2) { + return Double.NaN; //not enough data + } + if (FastMath.abs(sumXX) < 10 * Double.MIN_VALUE) { + return Double.NaN; //not enough variation in x + } + return sumXY / sumXX; + } + + /** + * Returns the <a href="http://www.xycoon.com/SumOfSquares.htm"> + * sum of squared errors</a> (SSE) associated with the regression + * model. + * <p> + * The sum is computed using the computational formula</p> + * <p> + * <code>SSE = SYY - (SXY * SXY / SXX)</code></p> + * <p> + * where <code>SYY</code> is the sum of the squared deviations of the y + * values about their mean, <code>SXX</code> is similarly defined and + * <code>SXY</code> is the sum of the products of x and y mean deviations. + * </p><p> + * The sums are accumulated using the updating algorithm referenced in + * {@link #addData}.</p> + * <p> + * The return value is constrained to be non-negative - i.e., if due to + * rounding errors the computational formula returns a negative result, + * 0 is returned.</p> + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>At least two observations (with at least two different x values) + * must have been added before invoking this method. If this method is + * invoked before a model can be estimated, <code>Double,NaN</code> is + * returned. + * </li></ul></p> + * + * @return sum of squared errors associated with the regression model + */ + public double getSumSquaredErrors() { + return FastMath.max(0d, sumYY - sumXY * sumXY / sumXX); + } + + /** + * Returns the sum of squared deviations of the y values about their mean. + * <p> + * This is defined as SSTO + * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.</p> + * <p> + * If <code>n < 2</code>, this returns <code>Double.NaN</code>.</p> + * + * @return sum of squared deviations of y values + */ + public double getTotalSumSquares() { + if (n < 2) { + return Double.NaN; + } + return sumYY; + } + + /** + * Returns the sum of squared deviations of the x values about their mean. + * + * If <code>n < 2</code>, this returns <code>Double.NaN</code>.</p> + * + * @return sum of squared deviations of x values + */ + public double getXSumSquares() { + if (n < 2) { + return Double.NaN; + } + return sumXX; + } + + /** + * Returns the sum of crossproducts, x<sub>i</sub>*y<sub>i</sub>. + * + * @return sum of cross products + */ + public double getSumOfCrossProducts() { + return sumXY; + } + + /** + * Returns the sum of squared deviations of the predicted y values about + * their mean (which equals the mean of y). + * <p> + * This is usually abbreviated SSR or SSM. It is defined as SSM + * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a></p> + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>At least two observations (with at least two different x values) + * must have been added before invoking this method. If this method is + * invoked before a model can be estimated, <code>Double.NaN</code> is + * returned. + * </li></ul></p> + * + * @return sum of squared deviations of predicted y values + */ + public double getRegressionSumSquares() { + return getRegressionSumSquares(getSlope()); + } + + /** + * Returns the sum of squared errors divided by the degrees of freedom, + * usually abbreviated MSE. + * <p> + * If there are fewer than <strong>three</strong> data pairs in the model, + * or if there is no variation in <code>x</code>, this returns + * <code>Double.NaN</code>.</p> + * + * @return sum of squared deviations of y values + */ + public double getMeanSquareError() { + if (n < 3) { + return Double.NaN; + } + return hasIntercept ? (getSumSquaredErrors() / (n - 2)) : (getSumSquaredErrors() / (n - 1)); + } + + /** + * Returns <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html"> + * Pearson's product moment correlation coefficient</a>, + * usually denoted r. + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>At least two observations (with at least two different x values) + * must have been added before invoking this method. If this method is + * invoked before a model can be estimated, <code>Double,NaN</code> is + * returned. + * </li></ul></p> + * + * @return Pearson's r + */ + public double getR() { + double b1 = getSlope(); + double result = FastMath.sqrt(getRSquare()); + if (b1 < 0) { + result = -result; + } + return result; + } + + /** + * Returns the <a href="http://www.xycoon.com/coefficient1.htm"> + * coefficient of determination</a>, + * usually denoted r-square. + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>At least two observations (with at least two different x values) + * must have been added before invoking this method. If this method is + * invoked before a model can be estimated, <code>Double,NaN</code> is + * returned. + * </li></ul></p> + * + * @return r-square + */ + public double getRSquare() { + double ssto = getTotalSumSquares(); + return (ssto - getSumSquaredErrors()) / ssto; + } + + /** + * Returns the <a href="http://www.xycoon.com/standarderrorb0.htm"> + * standard error of the intercept estimate</a>, + * usually denoted s(b0). + * <p> + * If there are fewer that <strong>three</strong> observations in the + * model, or if there is no variation in x, this returns + * <code>Double.NaN</code>.</p> Additionally, a <code>Double.NaN</code> is + * returned when the intercept is constrained to be zero + * + * @return standard error associated with intercept estimate + */ + public double getInterceptStdErr() { + if( !hasIntercept ){ + return Double.NaN; + } + return FastMath.sqrt( + getMeanSquareError() * ((1d / n) + (xbar * xbar) / sumXX)); + } + + /** + * Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard + * error of the slope estimate</a>, + * usually denoted s(b1). + * <p> + * If there are fewer that <strong>three</strong> data pairs in the model, + * or if there is no variation in x, this returns <code>Double.NaN</code>. + * </p> + * + * @return standard error associated with slope estimate + */ + public double getSlopeStdErr() { + return FastMath.sqrt(getMeanSquareError() / sumXX); + } + + /** + * Returns the half-width of a 95% confidence interval for the slope + * estimate. + * <p> + * The 95% confidence interval is</p> + * <p> + * <code>(getSlope() - getSlopeConfidenceInterval(), + * getSlope() + getSlopeConfidenceInterval())</code></p> + * <p> + * If there are fewer that <strong>three</strong> observations in the + * model, or if there is no variation in x, this returns + * <code>Double.NaN</code>.</p> + * <p> + * <strong>Usage Note</strong>:<br> + * The validity of this statistic depends on the assumption that the + * observations included in the model are drawn from a + * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html"> + * Bivariate Normal Distribution</a>.</p> + * + * @return half-width of 95% confidence interval for the slope estimate + * @throws OutOfRangeException if the confidence interval can not be computed. + */ + public double getSlopeConfidenceInterval() throws OutOfRangeException { + return getSlopeConfidenceInterval(0.05d); + } + + /** + * Returns the half-width of a (100-100*alpha)% confidence interval for + * the slope estimate. + * <p> + * The (100-100*alpha)% confidence interval is </p> + * <p> + * <code>(getSlope() - getSlopeConfidenceInterval(), + * getSlope() + getSlopeConfidenceInterval())</code></p> + * <p> + * To request, for example, a 99% confidence interval, use + * <code>alpha = .01</code></p> + * <p> + * <strong>Usage Note</strong>:<br> + * The validity of this statistic depends on the assumption that the + * observations included in the model are drawn from a + * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html"> + * Bivariate Normal Distribution</a>.</p> + * <p> + * <strong> Preconditions:</strong><ul> + * <li>If there are fewer that <strong>three</strong> observations in the + * model, or if there is no variation in x, this returns + * <code>Double.NaN</code>. + * </li> + * <li><code>(0 < alpha < 1)</code>; otherwise an + * <code>OutOfRangeException</code> is thrown. + * </li></ul></p> + * + * @param alpha the desired significance level + * @return half-width of 95% confidence interval for the slope estimate + * @throws OutOfRangeException if the confidence interval can not be computed. + */ + public double getSlopeConfidenceInterval(final double alpha) + throws OutOfRangeException { + if (n < 3) { + return Double.NaN; + } + if (alpha >= 1 || alpha <= 0) { + throw new OutOfRangeException(LocalizedFormats.SIGNIFICANCE_LEVEL, + alpha, 0, 1); + } + // No advertised NotStrictlyPositiveException here - will return NaN above + TDistribution distribution = new TDistribution(n - 2); + return getSlopeStdErr() * + distribution.inverseCumulativeProbability(1d - alpha / 2d); + } + + /** + * Returns the significance level of the slope (equiv) correlation. + * <p> + * Specifically, the returned value is the smallest <code>alpha</code> + * such that the slope confidence interval with significance level + * equal to <code>alpha</code> does not include <code>0</code>. + * On regression output, this is often denoted <code>Prob(|t| > 0)</code> + * </p><p> + * <strong>Usage Note</strong>:<br> + * The validity of this statistic depends on the assumption that the + * observations included in the model are drawn from a + * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html"> + * Bivariate Normal Distribution</a>.</p> + * <p> + * If there are fewer that <strong>three</strong> observations in the + * model, or if there is no variation in x, this returns + * <code>Double.NaN</code>.</p> + * + * @return significance level for slope/correlation + * @throws org.apache.commons.math3.exception.MaxCountExceededException + * if the significance level can not be computed. + */ + public double getSignificance() { + if (n < 3) { + return Double.NaN; + } + // No advertised NotStrictlyPositiveException here - will return NaN above + TDistribution distribution = new TDistribution(n - 2); + return 2d * (1.0 - distribution.cumulativeProbability( + FastMath.abs(getSlope()) / getSlopeStdErr())); + } + + // ---------------------Private methods----------------------------------- + + /** + * Returns the intercept of the estimated regression line, given the slope. + * <p> + * Will return <code>NaN</code> if slope is <code>NaN</code>.</p> + * + * @param slope current slope + * @return the intercept of the regression line + */ + private double getIntercept(final double slope) { + if( hasIntercept){ + return (sumY - slope * sumX) / n; + } + return 0.0; + } + + /** + * Computes SSR from b1. + * + * @param slope regression slope estimate + * @return sum of squared deviations of predicted y values + */ + private double getRegressionSumSquares(final double slope) { + return slope * slope * sumXX; + } + + /** + * Performs a regression on data present in buffers and outputs a RegressionResults object. + * + * <p>If there are fewer than 3 observations in the model and {@code hasIntercept} is true + * a {@code NoDataException} is thrown. If there is no intercept term, the model must + * contain at least 2 observations.</p> + * + * @return RegressionResults acts as a container of regression output + * @throws ModelSpecificationException if the model is not correctly specified + * @throws NoDataException if there is not sufficient data in the model to + * estimate the regression parameters + */ + public RegressionResults regress() throws ModelSpecificationException, NoDataException { + if (hasIntercept) { + if (n < 3) { + throw new NoDataException(LocalizedFormats.NOT_ENOUGH_DATA_REGRESSION); + } + if (FastMath.abs(sumXX) > Precision.SAFE_MIN) { + final double[] params = new double[] { getIntercept(), getSlope() }; + final double mse = getMeanSquareError(); + final double _syy = sumYY + sumY * sumY / n; + final double[] vcv = new double[] { mse * (xbar * xbar / sumXX + 1.0 / n), -xbar * mse / sumXX, mse / sumXX }; + return new RegressionResults(params, new double[][] { vcv }, true, n, 2, sumY, _syy, getSumSquaredErrors(), true, + false); + } else { + final double[] params = new double[] { sumY / n, Double.NaN }; + // final double mse = getMeanSquareError(); + final double[] vcv = new double[] { ybar / (n - 1.0), Double.NaN, Double.NaN }; + return new RegressionResults(params, new double[][] { vcv }, true, n, 1, sumY, sumYY, getSumSquaredErrors(), true, + false); + } + } else { + if (n < 2) { + throw new NoDataException(LocalizedFormats.NOT_ENOUGH_DATA_REGRESSION); + } + if (!Double.isNaN(sumXX)) { + final double[] vcv = new double[] { getMeanSquareError() / sumXX }; + final double[] params = new double[] { sumXY / sumXX }; + return new RegressionResults(params, new double[][] { vcv }, true, n, 1, sumY, sumYY, getSumSquaredErrors(), false, + false); + } else { + final double[] vcv = new double[] { Double.NaN }; + final double[] params = new double[] { Double.NaN }; + return new RegressionResults(params, new double[][] { vcv }, true, n, 1, Double.NaN, Double.NaN, Double.NaN, false, + false); + } + } + } + + /** + * Performs a regression on data present in buffers including only regressors + * indexed in variablesToInclude and outputs a RegressionResults object + * @param variablesToInclude an array of indices of regressors to include + * @return RegressionResults acts as a container of regression output + * @throws MathIllegalArgumentException if the variablesToInclude array is null or zero length + * @throws OutOfRangeException if a requested variable is not present in model + */ + public RegressionResults regress(int[] variablesToInclude) throws MathIllegalArgumentException{ + if( variablesToInclude == null || variablesToInclude.length == 0){ + throw new MathIllegalArgumentException(LocalizedFormats.ARRAY_ZERO_LENGTH_OR_NULL_NOT_ALLOWED); + } + if( variablesToInclude.length > 2 || (variablesToInclude.length > 1 && !hasIntercept) ){ + throw new ModelSpecificationException( + LocalizedFormats.ARRAY_SIZE_EXCEEDS_MAX_VARIABLES, + (variablesToInclude.length > 1 && !hasIntercept) ? 1 : 2); + } + + if( hasIntercept ){ + if( variablesToInclude.length == 2 ){ + if( variablesToInclude[0] == 1 ){ + throw new ModelSpecificationException(LocalizedFormats.NOT_INCREASING_SEQUENCE); + }else if( variablesToInclude[0] != 0 ){ + throw new OutOfRangeException( variablesToInclude[0], 0,1 ); + } + if( variablesToInclude[1] != 1){ + throw new OutOfRangeException( variablesToInclude[0], 0,1 ); + } + return regress(); + }else{ + if( variablesToInclude[0] != 1 && variablesToInclude[0] != 0 ){ + throw new OutOfRangeException( variablesToInclude[0],0,1 ); + } + final double _mean = sumY * sumY / n; + final double _syy = sumYY + _mean; + if( variablesToInclude[0] == 0 ){ + //just the mean + final double[] vcv = new double[]{ sumYY/(((n-1)*n)) }; + final double[] params = new double[]{ ybar }; + return new RegressionResults( + params, new double[][]{vcv}, true, n, 1, + sumY, _syy+_mean, sumYY,true,false); + + }else if( variablesToInclude[0] == 1){ + //final double _syy = sumYY + sumY * sumY / ((double) n); + final double _sxx = sumXX + sumX * sumX / n; + final double _sxy = sumXY + sumX * sumY / n; + final double _sse = FastMath.max(0d, _syy - _sxy * _sxy / _sxx); + final double _mse = _sse/((n-1)); + if( !Double.isNaN(_sxx) ){ + final double[] vcv = new double[]{ _mse / _sxx }; + final double[] params = new double[]{ _sxy/_sxx }; + return new RegressionResults( + params, new double[][]{vcv}, true, n, 1, + sumY, _syy, _sse,false,false); + }else{ + final double[] vcv = new double[]{Double.NaN }; + final double[] params = new double[]{ Double.NaN }; + return new RegressionResults( + params, new double[][]{vcv}, true, n, 1, + Double.NaN, Double.NaN, Double.NaN,false,false); + } + } + } + }else{ + if( variablesToInclude[0] != 0 ){ + throw new OutOfRangeException(variablesToInclude[0],0,0); + } + return regress(); + } + + return null; + } +} diff --git a/src/main/java/org/apache/commons/math3/stat/regression/UpdatingMultipleLinearRegression.java b/src/main/java/org/apache/commons/math3/stat/regression/UpdatingMultipleLinearRegression.java new file mode 100644 index 0000000..ebefc31 --- /dev/null +++ b/src/main/java/org/apache/commons/math3/stat/regression/UpdatingMultipleLinearRegression.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math3.stat.regression; + +import org.apache.commons.math3.exception.MathIllegalArgumentException; +import org.apache.commons.math3.exception.NoDataException; + +/** + * An interface for regression models allowing for dynamic updating of the data. + * That is, the entire data set need not be loaded into memory. As observations + * become available, they can be added to the regression model and an updated + * estimate regression statistics can be calculated. + * + * @since 3.0 + */ +public interface UpdatingMultipleLinearRegression { + + /** + * Returns true if a constant has been included false otherwise. + * + * @return true if constant exists, false otherwise + */ + boolean hasIntercept(); + + /** + * Returns the number of observations added to the regression model. + * + * @return Number of observations + */ + long getN(); + + /** + * Adds one observation to the regression model. + * + * @param x the independent variables which form the design matrix + * @param y the dependent or response variable + * @throws ModelSpecificationException if the length of {@code x} does not equal + * the number of independent variables in the model + */ + void addObservation(double[] x, double y) throws ModelSpecificationException; + + /** + * Adds a series of observations to the regression model. The lengths of + * x and y must be the same and x must be rectangular. + * + * @param x a series of observations on the independent variables + * @param y a series of observations on the dependent variable + * The length of x and y must be the same + * @throws ModelSpecificationException if {@code x} is not rectangular, does not match + * the length of {@code y} or does not contain sufficient data to estimate the model + */ + void addObservations(double[][] x, double[] y) throws ModelSpecificationException; + + /** + * Clears internal buffers and resets the regression model. This means all + * data and derived values are initialized + */ + void clear(); + + + /** + * Performs a regression on data present in buffers and outputs a RegressionResults object + * @return RegressionResults acts as a container of regression output + * @throws ModelSpecificationException if the model is not correctly specified + * @throws NoDataException if there is not sufficient data in the model to + * estimate the regression parameters + */ + RegressionResults regress() throws ModelSpecificationException, NoDataException; + + /** + * Performs a regression on data present in buffers including only regressors + * indexed in variablesToInclude and outputs a RegressionResults object + * @param variablesToInclude an array of indices of regressors to include + * @return RegressionResults acts as a container of regression output + * @throws ModelSpecificationException if the model is not correctly specified + * @throws MathIllegalArgumentException if the variablesToInclude array is null or zero length + */ + RegressionResults regress(int[] variablesToInclude) throws ModelSpecificationException, MathIllegalArgumentException; +} diff --git a/src/main/java/org/apache/commons/math3/stat/regression/package-info.java b/src/main/java/org/apache/commons/math3/stat/regression/package-info.java new file mode 100644 index 0000000..fbc0e12 --- /dev/null +++ b/src/main/java/org/apache/commons/math3/stat/regression/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * + * Statistical routines involving multivariate data. + * + */ +package org.apache.commons.math3.stat.regression; |