/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.commons.math3.stat.inference; import java.util.ArrayList; import java.util.Collection; import org.apache.commons.math3.distribution.FDistribution; import org.apache.commons.math3.exception.ConvergenceException; import org.apache.commons.math3.exception.DimensionMismatchException; import org.apache.commons.math3.exception.MaxCountExceededException; import org.apache.commons.math3.exception.NullArgumentException; import org.apache.commons.math3.exception.OutOfRangeException; import org.apache.commons.math3.exception.util.LocalizedFormats; import org.apache.commons.math3.stat.descriptive.SummaryStatistics; import org.apache.commons.math3.util.MathUtils; /** * Implements one-way ANOVA (analysis of variance) statistics. * *
Tests for differences between two or more categories of univariate data * (for example, the body mass index of accountants, lawyers, doctors and * computer programmers). When two categories are given, this is equivalent to * the {@link org.apache.commons.math3.stat.inference.TTest}. *
* Uses the {@link org.apache.commons.math3.distribution.FDistribution * commons-math F Distribution implementation} to estimate exact p-values.
*This implementation is based on a description at * http://faculty.vassar.edu/lowry/ch13pt1.html
** Abbreviations: bg = between groups, * wg = within groups, * ss = sum squared deviations ** * @since 1.2 */ public class OneWayAnova { /** * Default constructor. */ public OneWayAnova() { } /** * Computes the ANOVA F-value for a collection of
double[]
* arrays.
*
* Preconditions:
Collection
must contain
* double[]
arrays.double[]
arrays in the
* categoryData
collection and each of these arrays must
* contain at least two values.* This implementation computes the F statistic using the definitional * formula
* F = msbg/mswg* where
* msbg = between group mean square * mswg = within group mean square* are as defined * here * * @param categoryData
Collection
of double[]
* arrays each containing data for one category
* @return Fvalue
* @throws NullArgumentException if categoryData
is null
* @throws DimensionMismatchException if the size of the categoryData
* collection is less than 2 or a contained double[]
array does not have
* at least two values
*/
public double anovaFValue(final Collectiondouble[]
* arrays.
*
* Preconditions:
Collection
must contain
* double[]
arrays.double[]
arrays in the
* categoryData
collection and each of these arrays must
* contain at least two values.* This implementation uses the * {@link org.apache.commons.math3.distribution.FDistribution * commons-math F Distribution implementation} to estimate the exact * p-value, using the formula
* p = 1 - cumulativeProbability(F)* where
F
is the F value and cumulativeProbability
* is the commons-math implementation of the F distribution.
*
* @param categoryData Collection
of double[]
* arrays each containing data for one category
* @return Pvalue
* @throws NullArgumentException if categoryData
is null
* @throws DimensionMismatchException if the size of the categoryData
* collection is less than 2 or a contained double[]
array does not have
* at least two values
* @throws ConvergenceException if the p-value cannot be computed due to a convergence error
* @throws MaxCountExceededException if the maximum number of iterations is exceeded
*/
public double anovaPValue(final CollectionPreconditions:
Collection
must contain
* {@link SummaryStatistics}.categoryData
collection and each of these statistics must
* contain at least two values.* This implementation uses the * {@link org.apache.commons.math3.distribution.FDistribution * commons-math F Distribution implementation} to estimate the exact * p-value, using the formula
* p = 1 - cumulativeProbability(F)* where
F
is the F value and cumulativeProbability
* is the commons-math implementation of the F distribution.
*
* @param categoryData Collection
of {@link SummaryStatistics}
* each containing data for one category
* @param allowOneElementData if true, allow computation for one category
* only or for one data element per category
* @return Pvalue
* @throws NullArgumentException if categoryData
is null
* @throws DimensionMismatchException if the length of the categoryData
* array is less than 2 or a contained {@link SummaryStatistics} does not have
* at least two values
* @throws ConvergenceException if the p-value cannot be computed due to a convergence error
* @throws MaxCountExceededException if the maximum number of iterations is exceeded
* @since 3.2
*/
public double anovaPValue(final CollectionCollection
of double[]
arrays each
* containing data for one category
* @return computed AnovaStats
* @throws NullArgumentException
* if categoryData
is null
* @throws DimensionMismatchException
* if the length of the categoryData
array is less
* than 2 or a contained double[]
array does not
* contain at least two values
*/
private AnovaStats anovaStats(final CollectionPreconditions:
Collection
must contain
* double[]
arrays.double[]
arrays in the
* categoryData
collection and each of these arrays must
* contain at least two values.* This implementation uses the * {@link org.apache.commons.math3.distribution.FDistribution * commons-math F Distribution implementation} to estimate the exact * p-value, using the formula
* p = 1 - cumulativeProbability(F)* where
F
is the F value and cumulativeProbability
* is the commons-math implementation of the F distribution.
* True is returned iff the estimated p-value is less than alpha.
* * @param categoryData Collection
of double[]
* arrays each containing data for one category
* @param alpha significance level of the test
* @return true if the null hypothesis can be rejected with
* confidence 1 - alpha
* @throws NullArgumentException if categoryData
is null
* @throws DimensionMismatchException if the size of the categoryData
* collection is less than 2 or a contained double[]
array does not have
* at least two values
* @throws OutOfRangeException if alpha
is not in the range (0, 0.5]
* @throws ConvergenceException if the p-value cannot be computed due to a convergence error
* @throws MaxCountExceededException if the maximum number of iterations is exceeded
*/
public boolean anovaTest(final CollectionCollection
of double[]
* arrays each containing data for one category
* @param allowOneElementData if true, allow computation for one category
* only or for one data element per category
* @return computed AnovaStats
* @throws NullArgumentException if categoryData
is null
* @throws DimensionMismatchException if allowOneElementData
is false and the number of
* categories is less than 2 or a contained SummaryStatistics does not contain
* at least two values
*/
private AnovaStats anovaStats(final Collection