diff options
author | Raymond <siuchow@google.com> | 2015-04-02 10:43:13 -0700 |
---|---|---|
committer | Raymond <siuchow@google.com> | 2015-04-02 10:43:13 -0700 |
commit | dee0849a9704d532af0b550146cbafbaa6ee1d19 (patch) | |
tree | 8ccce3a046c214fb609977b7fc53c40cef7f9ea5 /src/main/java/org/apache/commons/math/stat | |
parent | 55b0a5efc929efa9615babd3e760547f94e3518e (diff) | |
download | apache-commons-math-dee0849a9704d532af0b550146cbafbaa6ee1d19.tar.gz |
third party library: apache-commons-mathandroid-cts-6.0_r9android-cts-6.0_r8android-cts-6.0_r7android-cts-6.0_r6android-cts-6.0_r5android-cts-6.0_r4android-cts-6.0_r32android-cts-6.0_r31android-cts-6.0_r30android-cts-6.0_r3android-cts-6.0_r29android-cts-6.0_r28android-cts-6.0_r27android-cts-6.0_r26android-cts-6.0_r25android-cts-6.0_r24android-cts-6.0_r23android-cts-6.0_r22android-cts-6.0_r21android-cts-6.0_r20android-cts-6.0_r2android-cts-6.0_r19android-cts-6.0_r18android-cts-6.0_r17android-cts-6.0_r16android-cts-6.0_r15android-cts-6.0_r14android-cts-6.0_r13android-cts-6.0_r12android-cts-6.0_r1android-6.0.1_r9android-6.0.1_r81android-6.0.1_r80android-6.0.1_r8android-6.0.1_r79android-6.0.1_r78android-6.0.1_r77android-6.0.1_r74android-6.0.1_r73android-6.0.1_r72android-6.0.1_r70android-6.0.1_r7android-6.0.1_r69android-6.0.1_r68android-6.0.1_r67android-6.0.1_r66android-6.0.1_r65android-6.0.1_r63android-6.0.1_r62android-6.0.1_r61android-6.0.1_r60android-6.0.1_r59android-6.0.1_r58android-6.0.1_r57android-6.0.1_r56android-6.0.1_r55android-6.0.1_r54android-6.0.1_r53android-6.0.1_r52android-6.0.1_r51android-6.0.1_r50android-6.0.1_r5android-6.0.1_r49android-6.0.1_r48android-6.0.1_r47android-6.0.1_r46android-6.0.1_r45android-6.0.1_r43android-6.0.1_r42android-6.0.1_r41android-6.0.1_r40android-6.0.1_r4android-6.0.1_r33android-6.0.1_r32android-6.0.1_r31android-6.0.1_r30android-6.0.1_r3android-6.0.1_r28android-6.0.1_r27android-6.0.1_r26android-6.0.1_r25android-6.0.1_r24android-6.0.1_r22android-6.0.1_r21android-6.0.1_r20android-6.0.1_r18android-6.0.1_r17android-6.0.1_r16android-6.0.1_r13android-6.0.1_r12android-6.0.1_r11android-6.0.1_r10android-6.0.1_r1android-6.0.0_r7android-6.0.0_r6android-6.0.0_r5android-6.0.0_r41android-6.0.0_r4android-6.0.0_r3android-6.0.0_r26android-6.0.0_r25android-6.0.0_r24android-6.0.0_r23android-6.0.0_r2android-6.0.0_r13android-6.0.0_r12android-6.0.0_r11android-6.0.0_r1marshmallow-releasemarshmallow-mr3-releasemarshmallow-mr2-releasemarshmallow-mr1-releas
emarshmallow-mr1-devmarshmallow-dr1.6-releasemarshmallow-dr1.5-releasemarshmallow-dr1.5-devmarshmallow-dr-releasemarshmallow-dr-dragon-releasemarshmallow-dr-devmarshmallow-devmarshmallow-cts-release
Change-Id: I52a325624a7f0dd652b362a9840626d6d9f3c42b
Diffstat (limited to 'src/main/java/org/apache/commons/math/stat')
72 files changed, 17134 insertions, 0 deletions
diff --git a/src/main/java/org/apache/commons/math/stat/Frequency.java b/src/main/java/org/apache/commons/math/stat/Frequency.java new file mode 100644 index 0000000..434819e --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/Frequency.java @@ -0,0 +1,603 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.math.stat;

import java.io.Serializable;
import java.text.NumberFormat;
import java.util.Comparator;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;

import org.apache.commons.math.MathRuntimeException;
import org.apache.commons.math.exception.util.LocalizedFormats;

/**
 * Maintains a frequency distribution.
 * <p>
 * Accepts int, long, char or Comparable values.  New values added must be
 * comparable to those that have been added, otherwise the add method will
 * throw an IllegalArgumentException.</p>
 * <p>
 * Integer values (int, long, Integer, Long) are not distinguished by type --
 * i.e. <code>addValue(Long.valueOf(2)), addValue(2), addValue(2l)</code> all have
 * the same effect (similarly for arguments to <code>getCount,</code> etc.).</p>
 * <p>
 * char values are converted by <code>addValue</code> to Character instances.
 * As such, these values are not comparable to integral values, so attempts
 * to combine integral types with chars in a frequency distribution will fail.
 * </p>
 * <p>
 * The values are ordered using the default (natural order), unless a
 * <code>Comparator</code> is supplied in the constructor.</p>
 *
 * @version $Revision: 1054186 $ $Date: 2011-01-01 03:28:46 +0100 (sam. 01 janv. 2011) $
 */
public class Frequency implements Serializable {

    /** Serializable version identifier */
    private static final long serialVersionUID = -3845586908418844111L;

    /** underlying collection: maps each distinct value to its count */
    private final TreeMap<Comparable<?>, Long> freqTable;

    /**
     * Default constructor; values are kept in their natural order.
     */
    public Frequency() {
        freqTable = new TreeMap<Comparable<?>, Long>();
    }

    /**
     * Constructor allowing values Comparator to be specified.
     *
     * @param comparator Comparator used to order values
     */
    @SuppressWarnings("unchecked") // TODO is the cast OK?
    public Frequency(Comparator<?> comparator) {
        freqTable = new TreeMap<Comparable<?>, Long>((Comparator<? super Comparable<?>>) comparator);
    }

    /**
     * Return a string representation of this frequency
     * distribution.
     * <p>
     * One header line is followed by one line per distinct value holding the
     * value, its count, its percentage and its cumulative percentage,
     * separated by tabs.</p>
     *
     * @return a string representation.
     */
    @Override
    public String toString() {
        final NumberFormat nf = NumberFormat.getPercentInstance();
        final StringBuilder outBuffer = new StringBuilder();
        outBuffer.append("Value \t Freq. \t Pct. \t Cum Pct. \n");
        // The total is computed once and the cumulative count is carried along
        // the (sorted) entries, so the whole table is rendered in a single pass.
        final long sumFreq = getSumFreq();
        long cumFreq = 0;
        for (Map.Entry<Comparable<?>, Long> entry : freqTable.entrySet()) {
            final long count = entry.getValue().longValue();
            cumFreq += count;
            outBuffer.append(entry.getKey());
            outBuffer.append('\t');
            outBuffer.append(count);
            outBuffer.append('\t');
            outBuffer.append(nf.format((double) count / (double) sumFreq));
            outBuffer.append('\t');
            outBuffer.append(nf.format((double) cumFreq / (double) sumFreq));
            outBuffer.append('\n');
        }
        return outBuffer.toString();
    }

    /**
     * Adds 1 to the frequency count for v.
     * <p>
     * If other objects have already been added to this Frequency, v must
     * be comparable to those that have already been added.
     * </p>
     *
     * @param v the value to add.
     * @throws IllegalArgumentException if <code>v</code> is not Comparable,
     *         or is not comparable with previous entries
     * @deprecated use {@link #addValue(Comparable)} instead
     */
    @Deprecated
    public void addValue(Object v) {
        if (v instanceof Comparable<?>) {
            addValue((Comparable<?>) v);
        } else {
            throw MathRuntimeException.createIllegalArgumentException(
                  LocalizedFormats.CLASS_DOESNT_IMPLEMENT_COMPARABLE,
                  v.getClass().getName());
        }
    }

    /**
     * Adds 1 to the frequency count for v.
     * <p>
     * If other objects have already been added to this Frequency, v must
     * be comparable to those that have already been added.
     * </p>
     * <p>
     * An <code>Integer</code> argument is stored as the equal <code>Long</code>.</p>
     *
     * @param v the value to add.
     * @throws IllegalArgumentException if <code>v</code> is not comparable with previous entries
     */
    public void addValue(Comparable<?> v) {
        Comparable<?> obj = v;
        if (v instanceof Integer) {
            obj = Long.valueOf(((Integer) v).longValue());
        }
        try {
            final Long count = freqTable.get(obj);
            if (count == null) {
                freqTable.put(obj, Long.valueOf(1));
            } else {
                freqTable.put(obj, Long.valueOf(count.longValue() + 1));
            }
        } catch (ClassCastException ex) {
            // TreeMap will throw ClassCastException if v is not comparable
            throw MathRuntimeException.createIllegalArgumentException(
                  LocalizedFormats.INSTANCES_NOT_COMPARABLE_TO_EXISTING_VALUES,
                  v.getClass().getName());
        }
    }

    /**
     * Adds 1 to the frequency count for v.
     *
     * @param v the value to add.
     */
    public void addValue(int v) {
        addValue(Long.valueOf(v));
    }

    /**
     * Adds 1 to the frequency count for v.
     *
     * @param v the value to add.
     * @deprecated to be removed in math 3.0
     */
    @Deprecated
    public void addValue(Integer v) {
        addValue(Long.valueOf(v.longValue()));
    }

    /**
     * Adds 1 to the frequency count for v.
     *
     * @param v the value to add.
     */
    public void addValue(long v) {
        addValue(Long.valueOf(v));
    }

    /**
     * Adds 1 to the frequency count for v.
     *
     * @param v the value to add.
     */
    public void addValue(char v) {
        addValue(Character.valueOf(v));
    }

    /** Clears the frequency table */
    public void clear() {
        freqTable.clear();
    }

    /**
     * Returns an Iterator over the set of values that have been added.
     * <p>
     * If added values are integral (i.e., integers, longs, Integers, or Longs),
     * they are converted to Longs when they are added, so the objects returned
     * by the Iterator will in this case be Longs.</p>
     *
     * @return values Iterator
     */
    public Iterator<Comparable<?>> valuesIterator() {
        return freqTable.keySet().iterator();
    }

    //-------------------------------------------------------------------------

    /**
     * Returns the sum of all frequencies.
     *
     * @return the total frequency count.
     */
    public long getSumFreq() {
        long result = 0;
        for (Long count : freqTable.values()) {
            result += count.longValue();
        }
        return result;
    }

    /**
     * Returns the number of values = v.
     * Returns 0 if the value is not comparable.
     *
     * @param v the value to lookup.
     * @return the frequency of v.
     * @deprecated replaced by {@link #getCount(Comparable)} as of 2.0
     */
    @Deprecated
    public long getCount(Object v) {
        if (isNotComparable(v)) {
            return 0;
        }
        return getCount((Comparable<?>) v);
    }

    /**
     * Returns the number of values = v.
     * Returns 0 if the value is not comparable.
     *
     * @param v the value to lookup.
     * @return the frequency of v.
     */
    public long getCount(Comparable<?> v) {
        if (v instanceof Integer) {
            return getCount(((Integer) v).longValue());
        }
        long result = 0;
        try {
            final Long count = freqTable.get(v);
            if (count != null) {
                result = count.longValue();
            }
        } catch (ClassCastException ex) {
            // ignore and return 0 -- ClassCastException will be thrown if value is not comparable
        }
        return result;
    }

    /**
     * Returns the number of values = v.
     *
     * @param v the value to lookup.
     * @return the frequency of v.
     */
    public long getCount(int v) {
        return getCount(Long.valueOf(v));
    }

    /**
     * Returns the number of values = v.
     *
     * @param v the value to lookup.
     * @return the frequency of v.
     */
    public long getCount(long v) {
        return getCount(Long.valueOf(v));
    }

    /**
     * Returns the number of values = v.
     *
     * @param v the value to lookup.
     * @return the frequency of v.
     */
    public long getCount(char v) {
        return getCount(Character.valueOf(v));
    }

    /**
     * Returns the number of values in the frequency table.
     *
     * @return the number of unique values that have been added to the frequency table.
     * @see #valuesIterator()
     */
    public int getUniqueCount() {
        return freqTable.keySet().size();
    }

    //-------------------------------------------------------------

    /**
     * Returns the percentage of values that are equal to v
     * (as a proportion between 0 and 1).
     * <p>
     * Returns <code>Double.NaN</code> if no values have been added.
     * Returns 0 if at least one value has been added, but v is not
     * Comparable.</p>
     *
     * @param v the value to lookup
     * @return the proportion of values equal to v
     * @deprecated replaced by {@link #getPct(Comparable)} as of 2.0
     */
    @Deprecated
    public double getPct(Object v) {
        if (isNotComparable(v)) {
            return zeroUnlessEmpty();
        }
        return getPct((Comparable<?>) v);
    }

    /**
     * Returns the percentage of values that are equal to v
     * (as a proportion between 0 and 1).
     * <p>
     * Returns <code>Double.NaN</code> if no values have been added.</p>
     *
     * @param v the value to lookup
     * @return the proportion of values equal to v
     */
    public double getPct(Comparable<?> v) {
        final long sumFreq = getSumFreq();
        if (sumFreq == 0) {
            return Double.NaN;
        }
        return (double) getCount(v) / (double) sumFreq;
    }

    /**
     * Returns the percentage of values that are equal to v
     * (as a proportion between 0 and 1).
     *
     * @param v the value to lookup
     * @return the proportion of values equal to v
     */
    public double getPct(int v) {
        return getPct(Long.valueOf(v));
    }

    /**
     * Returns the percentage of values that are equal to v
     * (as a proportion between 0 and 1).
     *
     * @param v the value to lookup
     * @return the proportion of values equal to v
     */
    public double getPct(long v) {
        return getPct(Long.valueOf(v));
    }

    /**
     * Returns the percentage of values that are equal to v
     * (as a proportion between 0 and 1).
     *
     * @param v the value to lookup
     * @return the proportion of values equal to v
     */
    public double getPct(char v) {
        return getPct(Character.valueOf(v));
    }

    //-----------------------------------------------------------------------------------------

    /**
     * Returns the cumulative frequency of values less than or equal to v.
     * <p>
     * Returns 0 if v is not comparable to the values set.</p>
     *
     * @param v the value to lookup.
     * @return the number of values less than or equal to v
     * @deprecated replaced by {@link #getCumFreq(Comparable)} as of 2.0
     */
    @Deprecated
    public long getCumFreq(Object v) {
        if (isNotComparable(v)) {
            return 0;
        }
        return getCumFreq((Comparable<?>) v);
    }

    /**
     * Returns the cumulative frequency of values less than or equal to v.
     * <p>
     * Returns 0 if v is not comparable to the values set.</p>
     *
     * @param v the value to lookup.
     * @return the number of values less than or equal to v
     */
    public long getCumFreq(Comparable<?> v) {
        if (getSumFreq() == 0) {
            return 0;
        }
        if (v instanceof Integer) {
            return getCumFreq(((Integer) v).longValue());
        }
        final Comparator<Comparable<?>> c = orderingComparator();
        long result = 0;

        try {
            // start from the count of v itself (0 when v is absent)
            final Long value = freqTable.get(v);
            if (value != null) {
                result = value.longValue();
            }
        } catch (ClassCastException ex) {
            return result;   // v is not comparable
        }

        if (c.compare(v, freqTable.firstKey()) < 0) {
            return 0;  // v is comparable, but less than first value
        }

        if (c.compare(v, freqTable.lastKey()) >= 0) {
            return getSumFreq();    // v is comparable, but not less than the last value
        }

        // add the counts of every key strictly below v; entries are sorted,
        // so the walk stops at the first key that is not below v
        for (Map.Entry<Comparable<?>, Long> entry : freqTable.entrySet()) {
            if (c.compare(v, entry.getKey()) > 0) {
                result += entry.getValue().longValue();
            } else {
                break;
            }
        }
        return result;
    }

    /**
     * Returns the cumulative frequency of values less than or equal to v.
     * <p>
     * Returns 0 if v is not comparable to the values set.</p>
     *
     * @param v the value to lookup
     * @return the number of values less than or equal to v
     */
    public long getCumFreq(int v) {
        return getCumFreq(Long.valueOf(v));
    }

    /**
     * Returns the cumulative frequency of values less than or equal to v.
     * <p>
     * Returns 0 if v is not comparable to the values set.</p>
     *
     * @param v the value to lookup
     * @return the number of values less than or equal to v
     */
    public long getCumFreq(long v) {
        return getCumFreq(Long.valueOf(v));
    }

    /**
     * Returns the cumulative frequency of values less than or equal to v.
     * <p>
     * Returns 0 if v is not comparable to the values set.</p>
     *
     * @param v the value to lookup
     * @return the number of values less than or equal to v
     */
    public long getCumFreq(char v) {
        return getCumFreq(Character.valueOf(v));
    }

    //----------------------------------------------------------------------------------------------

    /**
     * Returns the cumulative percentage of values less than or equal to v
     * (as a proportion between 0 and 1).
     * <p>
     * Returns <code>Double.NaN</code> if no values have been added.
     * Returns 0 if at least one value has been added, but v is not comparable
     * to the values set.</p>
     *
     * @param v the value to lookup
     * @return the proportion of values less than or equal to v
     * @deprecated replaced by {@link #getCumPct(Comparable)} as of 2.0
     */
    @Deprecated
    public double getCumPct(Object v) {
        if (isNotComparable(v)) {
            return zeroUnlessEmpty();
        }
        return getCumPct((Comparable<?>) v);
    }

    /**
     * Returns the cumulative percentage of values less than or equal to v
     * (as a proportion between 0 and 1).
     * <p>
     * Returns <code>Double.NaN</code> if no values have been added.
     * Returns 0 if at least one value has been added, but v is not comparable
     * to the values set.</p>
     *
     * @param v the value to lookup
     * @return the proportion of values less than or equal to v
     */
    public double getCumPct(Comparable<?> v) {
        final long sumFreq = getSumFreq();
        if (sumFreq == 0) {
            return Double.NaN;
        }
        return (double) getCumFreq(v) / (double) sumFreq;
    }

    /**
     * Returns the cumulative percentage of values less than or equal to v
     * (as a proportion between 0 and 1).
     * <p>
     * Returns 0 if v is not comparable to the values set.</p>
     *
     * @param v the value to lookup
     * @return the proportion of values less than or equal to v
     */
    public double getCumPct(int v) {
        return getCumPct(Long.valueOf(v));
    }

    /**
     * Returns the cumulative percentage of values less than or equal to v
     * (as a proportion between 0 and 1).
     * <p>
     * Returns 0 if v is not comparable to the values set.</p>
     *
     * @param v the value to lookup
     * @return the proportion of values less than or equal to v
     */
    public double getCumPct(long v) {
        return getCumPct(Long.valueOf(v));
    }

    /**
     * Returns the cumulative percentage of values less than or equal to v
     * (as a proportion between 0 and 1).
     * <p>
     * Returns 0 if v is not comparable to the values set.</p>
     *
     * @param v the value to lookup
     * @return the proportion of values less than or equal to v
     */
    public double getCumPct(char v) {
        return getCumPct(Character.valueOf(v));
    }

    /**
     * Tells whether a lookup argument is a non-null object that does not
     * implement Comparable. <code>null</code> is deliberately not reported,
     * so it is still forwarded to the Comparable-based lookups.
     *
     * @param v the lookup argument
     * @return true if v is non-null and not a Comparable
     */
    private static boolean isNotComparable(Object v) {
        return v != null && !(v instanceof Comparable<?>);
    }

    /**
     * Result of a proportion lookup for a value that cannot be in the table.
     *
     * @return <code>Double.NaN</code> if no values have been added, 0 otherwise
     */
    private double zeroUnlessEmpty() {
        return getSumFreq() == 0 ? Double.NaN : 0d;
    }

    /**
     * Returns the comparator that orders the table: the one supplied at
     * construction, or a natural-order comparator when none was supplied.
     *
     * @return the comparator used to order the values
     */
    @SuppressWarnings({"unchecked", "rawtypes"}) // OK, freqTable is Comparable<?>
    private Comparator<Comparable<?>> orderingComparator() {
        Comparator<Comparable<?>> c = (Comparator<Comparable<?>>) freqTable.comparator();
        if (c == null) {
            c = new NaturalComparator();
        }
        return c;
    }

    /**
     * A Comparator that compares comparable objects using the
     * natural order.  Copied from Commons Collections ComparableComparator.
     */
    private static class NaturalComparator<T extends Comparable<T>> implements Comparator<Comparable<T>>, Serializable {

        /** Serializable version identifier */
        private static final long serialVersionUID = -3852193713161395148L;

        /**
         * Compare the two {@link Comparable Comparable} arguments.
         * This method is equivalent to:
         * <pre>(({@link Comparable Comparable})o1).{@link Comparable#compareTo compareTo}(o2)</pre>
         *
         * @param  o1 the first object
         * @param  o2 the second object
         * @return  result of comparison
         * @throws NullPointerException when <i>o1</i> is <code>null</code>,
         *         or when <code>((Comparable)o1).compareTo(o2)</code> does
         * @throws ClassCastException when <i>o1</i> is not a {@link Comparable Comparable},
         *         or when <code>((Comparable)o1).compareTo(o2)</code> does
         */
        @SuppressWarnings("unchecked") // cast to (T) may throw ClassCastException, see Javadoc
        public int compare(Comparable<T> o1, Comparable<T> o2) {
            return o1.compareTo((T) o2);
        }
    }

    /** {@inheritDoc} */
    @Override
    public int hashCode() {
        // freqTable is final and assigned by every constructor, so it is never null here
        final int prime = 31;
        return prime + freqTable.hashCode();
    }

    /** {@inheritDoc} */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof Frequency)) {
            return false;
        }
        // two distributions are equal when their value/count tables are equal
        return freqTable.equals(((Frequency) obj).freqTable);
    }

}
diff --git a/src/main/java/org/apache/commons/math/stat/StatUtils.java b/src/main/java/org/apache/commons/math/stat/StatUtils.java new file mode 100644 index 0000000..7ae1e17 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/StatUtils.java @@ -0,0 +1,663 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.math.stat; + +import org.apache.commons.math.MathRuntimeException; +import org.apache.commons.math.exception.util.LocalizedFormats; +import org.apache.commons.math.stat.descriptive.DescriptiveStatistics; +import org.apache.commons.math.stat.descriptive.UnivariateStatistic; +import org.apache.commons.math.stat.descriptive.moment.GeometricMean; +import org.apache.commons.math.stat.descriptive.moment.Mean; +import org.apache.commons.math.stat.descriptive.moment.Variance; +import org.apache.commons.math.stat.descriptive.rank.Max; +import org.apache.commons.math.stat.descriptive.rank.Min; +import org.apache.commons.math.stat.descriptive.rank.Percentile; +import org.apache.commons.math.stat.descriptive.summary.Product; +import org.apache.commons.math.stat.descriptive.summary.Sum; +import org.apache.commons.math.stat.descriptive.summary.SumOfLogs; +import org.apache.commons.math.stat.descriptive.summary.SumOfSquares; + +/** + * StatUtils provides static methods for computing statistics based on data + * stored in double[] arrays. + * + * @version $Revision: 1073276 $ $Date: 2011-02-22 10:34:52 +0100 (mar. 22 févr. 
2011) $ + */ +public final class StatUtils { + + /** sum */ + private static final UnivariateStatistic SUM = new Sum(); + + /** sumSq */ + private static final UnivariateStatistic SUM_OF_SQUARES = new SumOfSquares(); + + /** prod */ + private static final UnivariateStatistic PRODUCT = new Product(); + + /** sumLog */ + private static final UnivariateStatistic SUM_OF_LOGS = new SumOfLogs(); + + /** min */ + private static final UnivariateStatistic MIN = new Min(); + + /** max */ + private static final UnivariateStatistic MAX = new Max(); + + /** mean */ + private static final UnivariateStatistic MEAN = new Mean(); + + /** variance */ + private static final Variance VARIANCE = new Variance(); + + /** percentile */ + private static final Percentile PERCENTILE = new Percentile(); + + /** geometric mean */ + private static final GeometricMean GEOMETRIC_MEAN = new GeometricMean(); + + /** + * Private Constructor + */ + private StatUtils() { + } + + /** + * Returns the sum of the values in the input array, or + * <code>Double.NaN</code> if the array is empty. + * <p> + * Throws <code>IllegalArgumentException</code> if the input array + * is null.</p> + * + * @param values array of values to sum + * @return the sum of the values or <code>Double.NaN</code> if the array + * is empty + * @throws IllegalArgumentException if the array is null + */ + public static double sum(final double[] values) { + return SUM.evaluate(values); + } + + /** + * Returns the sum of the entries in the specified portion of + * the input array, or <code>Double.NaN</code> if the designated subarray + * is empty. 
+ * <p> + * Throws <code>IllegalArgumentException</code> if the array is null.</p> + * + * @param values the input array + * @param begin index of the first array element to include + * @param length the number of elements to include + * @return the sum of the values or Double.NaN if length = 0 + * @throws IllegalArgumentException if the array is null or the array index + * parameters are not valid + */ + public static double sum(final double[] values, final int begin, + final int length) { + return SUM.evaluate(values, begin, length); + } + + /** + * Returns the sum of the squares of the entries in the input array, or + * <code>Double.NaN</code> if the array is empty. + * <p> + * Throws <code>IllegalArgumentException</code> if the array is null.</p> + * + * @param values input array + * @return the sum of the squared values or <code>Double.NaN</code> if the + * array is empty + * @throws IllegalArgumentException if the array is null + */ + public static double sumSq(final double[] values) { + return SUM_OF_SQUARES.evaluate(values); + } + + /** + * Returns the sum of the squares of the entries in the specified portion of + * the input array, or <code>Double.NaN</code> if the designated subarray + * is empty. + * <p> + * Throws <code>IllegalArgumentException</code> if the array is null.</p> + * + * @param values the input array + * @param begin index of the first array element to include + * @param length the number of elements to include + * @return the sum of the squares of the values or Double.NaN if length = 0 + * @throws IllegalArgumentException if the array is null or the array index + * parameters are not valid + */ + public static double sumSq(final double[] values, final int begin, + final int length) { + return SUM_OF_SQUARES.evaluate(values, begin, length); + } + + /** + * Returns the product of the entries in the input array, or + * <code>Double.NaN</code> if the array is empty. 
+ * <p> + * Throws <code>IllegalArgumentException</code> if the array is null.</p> + * + * @param values the input array + * @return the product of the values or Double.NaN if the array is empty + * @throws IllegalArgumentException if the array is null + */ + public static double product(final double[] values) { + return PRODUCT.evaluate(values); + } + + /** + * Returns the product of the entries in the specified portion of + * the input array, or <code>Double.NaN</code> if the designated subarray + * is empty. + * <p> + * Throws <code>IllegalArgumentException</code> if the array is null.</p> + * + * @param values the input array + * @param begin index of the first array element to include + * @param length the number of elements to include + * @return the product of the values or Double.NaN if length = 0 + * @throws IllegalArgumentException if the array is null or the array index + * parameters are not valid + */ + public static double product(final double[] values, final int begin, + final int length) { + return PRODUCT.evaluate(values, begin, length); + } + + /** + * Returns the sum of the natural logs of the entries in the input array, or + * <code>Double.NaN</code> if the array is empty. + * <p> + * Throws <code>IllegalArgumentException</code> if the array is null.</p> + * <p> + * See {@link org.apache.commons.math.stat.descriptive.summary.SumOfLogs}. + * </p> + * + * @param values the input array + * @return the sum of the natural logs of the values or Double.NaN if + * the array is empty + * @throws IllegalArgumentException if the array is null + */ + public static double sumLog(final double[] values) { + return SUM_OF_LOGS.evaluate(values); + } + + /** + * Returns the sum of the natural logs of the entries in the specified portion of + * the input array, or <code>Double.NaN</code> if the designated subarray + * is empty. 
+ * <p> + * Throws <code>IllegalArgumentException</code> if the array is null.</p> + * <p> + * See {@link org.apache.commons.math.stat.descriptive.summary.SumOfLogs}. + * </p> + * + * @param values the input array + * @param begin index of the first array element to include + * @param length the number of elements to include + * @return the sum of the natural logs of the values or Double.NaN if + * length = 0 + * @throws IllegalArgumentException if the array is null or the array index + * parameters are not valid + */ + public static double sumLog(final double[] values, final int begin, + final int length) { + return SUM_OF_LOGS.evaluate(values, begin, length); + } + + /** + * Returns the arithmetic mean of the entries in the input array, or + * <code>Double.NaN</code> if the array is empty. + * <p> + * Throws <code>IllegalArgumentException</code> if the array is null.</p> + * <p> + * See {@link org.apache.commons.math.stat.descriptive.moment.Mean} for + * details on the computing algorithm.</p> + * + * @param values the input array + * @return the mean of the values or Double.NaN if the array is empty + * @throws IllegalArgumentException if the array is null + */ + public static double mean(final double[] values) { + return MEAN.evaluate(values); + } + + /** + * Returns the arithmetic mean of the entries in the specified portion of + * the input array, or <code>Double.NaN</code> if the designated subarray + * is empty. 
     * <p>
     * Throws <code>IllegalArgumentException</code> if the array is null.</p>
     * <p>
     * See {@link org.apache.commons.math.stat.descriptive.moment.Mean} for
     * details on the computing algorithm.</p>
     *
     * @param values the input array
     * @param begin index of the first array element to include
     * @param length the number of elements to include
     * @return the mean of the values or <code>Double.NaN</code> if length = 0
     * @throws IllegalArgumentException if the array is null or the array index
     *  parameters are not valid
     */
    public static double mean(final double[] values, final int begin,
            final int length) {
        // Pure delegation: the MEAN statistic does the validation and the work.
        return MEAN.evaluate(values, begin, length);
    }

    /**
     * Returns the geometric mean of the entries in the input array, or
     * <code>Double.NaN</code> if the array is empty.
     * <p>
     * Throws <code>IllegalArgumentException</code> if the array is null.</p>
     * <p>
     * See {@link org.apache.commons.math.stat.descriptive.moment.GeometricMean}
     * for details on the computing algorithm.</p>
     *
     * @param values the input array
     * @return the geometric mean of the values or <code>Double.NaN</code> if
     *  the array is empty
     * @throws IllegalArgumentException if the array is null
     */
    public static double geometricMean(final double[] values) {
        // Pure delegation to the GEOMETRIC_MEAN statistic.
        return GEOMETRIC_MEAN.evaluate(values);
    }

    /**
     * Returns the geometric mean of the entries in the specified portion of
     * the input array, or <code>Double.NaN</code> if the designated subarray
     * is empty.
     * <p>
     * Throws <code>IllegalArgumentException</code> if the array is null.</p>
     * <p>
     * See {@link org.apache.commons.math.stat.descriptive.moment.GeometricMean}
     * for details on the computing algorithm.</p>
     *
     * @param values the input array
     * @param begin index of the first array element to include
     * @param length the number of elements to include
     * @return the geometric mean of the values or <code>Double.NaN</code> if
     *  length = 0
     * @throws IllegalArgumentException if the array is null or the array index
     *  parameters are not valid
     */
    public static double geometricMean(final double[] values, final int begin,
            final int length) {
        // Pure delegation to the GEOMETRIC_MEAN statistic.
        return GEOMETRIC_MEAN.evaluate(values, begin, length);
    }


    /**
     * Returns the variance of the entries in the input array, or
     * <code>Double.NaN</code> if the array is empty.
     * <p>
     * See {@link org.apache.commons.math.stat.descriptive.moment.Variance} for
     * details on the computing algorithm.</p>
     * <p>
     * Returns 0 for a single-value (i.e. length = 1) sample.</p>
     * <p>
     * Throws <code>IllegalArgumentException</code> if the array is null.</p>
     *
     * @param values the input array
     * @return the variance of the values or <code>Double.NaN</code> if the
     *  array is empty
     * @throws IllegalArgumentException if the array is null
     */
    public static double variance(final double[] values) {
        // Pure delegation to the VARIANCE statistic.
        return VARIANCE.evaluate(values);
    }

    /**
     * Returns the variance of the entries in the specified portion of
     * the input array, or <code>Double.NaN</code> if the designated subarray
     * is empty.
     * <p>
     * See {@link org.apache.commons.math.stat.descriptive.moment.Variance} for
     * details on the computing algorithm.</p>
     * <p>
     * Returns 0 for a single-value (i.e. length = 1) sample.</p>
     * <p>
     * Throws <code>IllegalArgumentException</code> if the array is null or the
     * array index parameters are not valid.</p>
     *
     * @param values the input array
     * @param begin index of the first array element to include
     * @param length the number of elements to include
     * @return the variance of the values or <code>Double.NaN</code> if
     *  length = 0
     * @throws IllegalArgumentException if the array is null or the array index
     *  parameters are not valid
     */
    public static double variance(final double[] values, final int begin,
            final int length) {
        // Pure delegation to the VARIANCE statistic.
        return VARIANCE.evaluate(values, begin, length);
    }

    /**
     * Returns the variance of the entries in the specified portion of
     * the input array, using the precomputed mean value.  Returns
     * <code>Double.NaN</code> if the designated subarray is empty.
     * <p>
     * See {@link org.apache.commons.math.stat.descriptive.moment.Variance} for
     * details on the computing algorithm.</p>
     * <p>
     * The formula used assumes that the supplied mean value is the arithmetic
     * mean of the sample data, not a known population parameter.  This method
     * is supplied only to save computation when the mean has already been
     * computed.</p>
     * <p>
     * Returns 0 for a single-value (i.e. length = 1) sample.</p>
     * <p>
     * Throws <code>IllegalArgumentException</code> if the array is null or the
     * array index parameters are not valid.</p>
     *
     * @param values the input array
     * @param mean the precomputed mean value
     * @param begin index of the first array element to include
     * @param length the number of elements to include
     * @return the variance of the values or <code>Double.NaN</code> if
     *  length = 0
     * @throws IllegalArgumentException if the array is null or the array index
     *  parameters are not valid
     */
    public static double variance(final double[] values, final double mean,
            final int begin, final int length) {
        // The supplied mean is passed through unchanged to the VARIANCE statistic.
        return VARIANCE.evaluate(values, mean, begin, length);
    }

    /**
     * Returns the variance of the entries in the input array, using the
     * precomputed mean value.  Returns <code>Double.NaN</code> if the array
     * is empty.
     * <p>
     * See {@link org.apache.commons.math.stat.descriptive.moment.Variance} for
     * details on the computing algorithm.</p>
     * <p>
     * The formula used assumes that the supplied mean value is the arithmetic
     * mean of the sample data, not a known population parameter.  This method
     * is supplied only to save computation when the mean has already been
     * computed.</p>
     * <p>
     * Returns 0 for a single-value (i.e. length = 1) sample.</p>
     * <p>
     * Throws <code>IllegalArgumentException</code> if the array is null.</p>
     *
     * @param values the input array
     * @param mean the precomputed mean value
     * @return the variance of the values or <code>Double.NaN</code> if the
     *  array is empty
     * @throws IllegalArgumentException if the array is null
     */
    public static double variance(final double[] values, final double mean) {
        // The supplied mean is passed through unchanged to the VARIANCE statistic.
        return VARIANCE.evaluate(values, mean);
    }

    /**
     * Returns the maximum of the entries in the input array, or
     * <code>Double.NaN</code> if the array is empty.
     * <p>
     * Throws <code>IllegalArgumentException</code> if the array is null.</p>
     * <ul>
     * <li>The result is <code>NaN</code> iff all values are <code>NaN</code>
     * (i.e. <code>NaN</code> values have no impact on the value of the statistic).</li>
     * <li>If any of the values equals <code>Double.POSITIVE_INFINITY</code>,
     * the result is <code>Double.POSITIVE_INFINITY</code>.</li>
     * </ul>
     *
     * @param values the input array
     * @return the maximum of the values or <code>Double.NaN</code> if the
     *  array is empty
     * @throws IllegalArgumentException if the array is null
     */
    public static double max(final double[] values) {
        return MAX.evaluate(values);
    }

    /**
     * Returns the maximum of the entries in the specified portion of
     * the input array, or <code>Double.NaN</code> if the designated subarray
     * is empty.
     * <p>
     * Throws <code>IllegalArgumentException</code> if the array is null or
     * the array index parameters are not valid.</p>
     * <ul>
     * <li>The result is <code>NaN</code> iff all values are <code>NaN</code>
     * (i.e. <code>NaN</code> values have no impact on the value of the statistic).</li>
     * <li>If any of the values equals <code>Double.POSITIVE_INFINITY</code>,
     * the result is <code>Double.POSITIVE_INFINITY</code>.</li>
     * </ul>
     *
     * @param values the input array
     * @param begin index of the first array element to include
     * @param length the number of elements to include
     * @return the maximum of the values or <code>Double.NaN</code> if
     *  length = 0
     * @throws IllegalArgumentException if the array is null or the array index
     *  parameters are not valid
     */
    public static double max(final double[] values, final int begin,
            final int length) {
        return MAX.evaluate(values, begin, length);
    }

    /**
     * Returns the minimum of the entries in the input array, or
     * <code>Double.NaN</code> if the array is empty.
     * <p>
     * Throws <code>IllegalArgumentException</code> if the array is null.</p>
     * <ul>
     * <li>The result is <code>NaN</code> iff all values are <code>NaN</code>
     * (i.e. <code>NaN</code> values have no impact on the value of the statistic).</li>
     * <li>If any of the values equals <code>Double.NEGATIVE_INFINITY</code>,
     * the result is <code>Double.NEGATIVE_INFINITY</code>.</li>
     * </ul>
     *
     * @param values the input array
     * @return the minimum of the values or <code>Double.NaN</code> if the
     *  array is empty
     * @throws IllegalArgumentException if the array is null
     */
    public static double min(final double[] values) {
        return MIN.evaluate(values);
    }

    /**
     * Returns the minimum of the entries in the specified portion of
     * the input array, or <code>Double.NaN</code> if the designated subarray
     * is empty.
     * <p>
     * Throws <code>IllegalArgumentException</code> if the array is null or
     * the array index parameters are not valid.</p>
     * <ul>
     * <li>The result is <code>NaN</code> iff all values are <code>NaN</code>
     * (i.e. <code>NaN</code> values have no impact on the value of the statistic).</li>
     * <li>If any of the values equals <code>Double.NEGATIVE_INFINITY</code>,
     * the result is <code>Double.NEGATIVE_INFINITY</code>.</li>
     * </ul>
     *
     * @param values the input array
     * @param begin index of the first array element to include
     * @param length the number of elements to include
     * @return the minimum of the values or <code>Double.NaN</code> if
     *  length = 0
     * @throws IllegalArgumentException if the array is null or the array index
     *  parameters are not valid
     */
    public static double min(final double[] values, final int begin,
            final int length) {
        return MIN.evaluate(values, begin, length);
    }

    /**
     * Returns an estimate of the <code>p</code>th percentile of the values
     * in the <code>values</code> array.
     * <ul>
     * <li>Returns <code>Double.NaN</code> if <code>values</code> has length
     * <code>0</code></li>
     * <li>Returns (for any value of <code>p</code>) <code>values[0]</code>
     *  if <code>values</code> has length <code>1</code></li>
     * <li>Throws <code>IllegalArgumentException</code> if <code>values</code>
     * is null or p is not a valid quantile value (p must be greater than 0
     * and less than or equal to 100)</li>
     * </ul>
     * <p>
     * See {@link org.apache.commons.math.stat.descriptive.rank.Percentile} for
     * a description of the percentile estimation algorithm used.</p>
     *
     * @param values input array of values
     * @param p the percentile value to compute
     * @return the percentile value or <code>Double.NaN</code> if the array is
     *  empty
     * @throws IllegalArgumentException if <code>values</code> is null
     *  or p is invalid
     */
    public static double percentile(final double[] values, final double p) {
            return PERCENTILE.evaluate(values,p);
    }

    /**
     * Returns an estimate of the <code>p</code>th percentile of the values
     * in the <code>values</code> array, starting with the element in (0-based)
     * position <code>begin</code> in the array and including <code>length</code>
     * values.
     * <ul>
     * <li>Returns <code>Double.NaN</code> if <code>length = 0</code></li>
     * <li>Returns (for any value of <code>p</code>) <code>values[begin]</code>
     *  if <code>length = 1</code></li>
     * <li>Throws <code>IllegalArgumentException</code> if <code>values</code>
     *  is null, <code>begin</code> or <code>length</code> is invalid, or
     * <code>p</code> is not a valid quantile value (p must be greater than 0
     * and less than or equal to 100)</li>
     * </ul>
     * <p>
     * See {@link org.apache.commons.math.stat.descriptive.rank.Percentile} for
     * a description of the percentile estimation algorithm used.</p>
     *
     * @param values array of input values
     * @param begin the first (0-based) element to include in the computation
     * @param length the number of array elements to include
     * @param p the percentile to compute
     * @return the percentile value or <code>Double.NaN</code> if
     *  <code>length = 0</code>
     * @throws IllegalArgumentException if the parameters are not valid or the
     *  input array is null
     */
    public static double percentile(final double[] values, final int begin,
            final int length, final double p) {
        return PERCENTILE.evaluate(values, begin, length, p);
    }

    /**
     * Returns the sum of the (signed) differences between corresponding elements of the
     * input arrays -- i.e., sum(sample1[i] - sample2[i]).
+ * + * @param sample1 the first array + * @param sample2 the second array + * @return sum of paired differences + * @throws IllegalArgumentException if the arrays do not have the same + * (positive) length + */ + public static double sumDifference(final double[] sample1, final double[] sample2) + throws IllegalArgumentException { + int n = sample1.length; + if (n != sample2.length) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.DIMENSIONS_MISMATCH_SIMPLE, n, sample2.length); + } + if (n < 1) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.INSUFFICIENT_DIMENSION, sample2.length, 1); + } + double result = 0; + for (int i = 0; i < n; i++) { + result += sample1[i] - sample2[i]; + } + return result; + } + + /** + * Returns the mean of the (signed) differences between corresponding elements of the + * input arrays -- i.e., sum(sample1[i] - sample2[i]) / sample1.length. + * + * @param sample1 the first array + * @param sample2 the second array + * @return mean of paired differences + * @throws IllegalArgumentException if the arrays do not have the same + * (positive) length + */ + public static double meanDifference(final double[] sample1, final double[] sample2) + throws IllegalArgumentException { + return sumDifference(sample1, sample2) / sample1.length; + } + + /** + * Returns the variance of the (signed) differences between corresponding elements of the + * input arrays -- i.e., var(sample1[i] - sample2[i]). + * + * @param sample1 the first array + * @param sample2 the second array + * @param meanDifference the mean difference between corresponding entries + * @see #meanDifference(double[],double[]) + * @return variance of paired differences + * @throws IllegalArgumentException if the arrays do not have the same + * length or their common length is less than 2. 
+ */ + public static double varianceDifference(final double[] sample1, final double[] sample2, + double meanDifference) throws IllegalArgumentException { + double sum1 = 0d; + double sum2 = 0d; + double diff = 0d; + int n = sample1.length; + if (n != sample2.length) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.DIMENSIONS_MISMATCH_SIMPLE, n, sample2.length); + } + if (n < 2) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.INSUFFICIENT_DIMENSION, n, 2); + } + for (int i = 0; i < n; i++) { + diff = sample1[i] - sample2[i]; + sum1 += (diff - meanDifference) *(diff - meanDifference); + sum2 += diff - meanDifference; + } + return (sum1 - (sum2 * sum2 / n)) / (n - 1); + } + + + /** + * Normalize (standardize) the series, so in the end it is having a mean of 0 and a standard deviation of 1. + * + * @param sample sample to normalize + * @return normalized (standardized) sample + * @since 2.2 + */ + public static double[] normalize(final double[] sample) { + DescriptiveStatistics stats = new DescriptiveStatistics(); + + // Add the data from the series to stats + for (int i = 0; i < sample.length; i++) { + stats.addValue(sample[i]); + } + + // Compute mean and standard deviation + double mean = stats.getMean(); + double standardDeviation = stats.getStandardDeviation(); + + // initialize the standardizedSample, which has the same length as the sample + double[] standardizedSample = new double[sample.length]; + + for (int i = 0; i < sample.length; i++) { + // z = (x- mean)/standardDeviation + standardizedSample[i] = (sample[i] - mean) / standardDeviation; + } + return standardizedSample; + } + +} diff --git a/src/main/java/org/apache/commons/math/stat/clustering/Cluster.java b/src/main/java/org/apache/commons/math/stat/clustering/Cluster.java new file mode 100644 index 0000000..f4913d3 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/clustering/Cluster.java @@ -0,0 +1,74 @@ +/* + * Licensed to 
the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.math.stat.clustering;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;

/**
 * Cluster holding a set of {@link Clusterable} points.
 * <p>
 * The center is fixed at construction time; points are added afterwards with
 * {@link #addPoint(Clusterable)}.</p>
 * @param <T> the type of points that can be clustered
 * @version $Revision: 771076 $ $Date: 2009-05-03 18:28:48 +0200 (dim. 03 mai 2009) $
 * @since 2.0
 */
public class Cluster<T extends Clusterable<T>> implements Serializable {

    /** Serializable version identifier. */
    private static final long serialVersionUID = -3442297081515880464L;

    /** The points contained in this cluster. */
    private final List<T> points;

    /** Center of the cluster. */
    private final T center;

    /**
     * Build a cluster centered at a specified point.
     * <p>The new cluster contains no points; the center itself is not added
     * to the point list.</p>
     * @param center the point which is to be the center of this cluster
     */
    public Cluster(final T center) {
        this.center = center;
        points = new ArrayList<T>();
    }

    /**
     * Add a point to this cluster.
     * @param point point to add
     */
    public void addPoint(final T point) {
        points.add(point);
    }

    /**
     * Get the points contained in the cluster.
     * <p>The returned list is the cluster's internal list, not a copy:
     * changes made through it are changes to the cluster.</p>
     * @return points contained in the cluster
     */
    public List<T> getPoints() {
        return points;
    }

    /**
     * Get the point chosen to be the center of this cluster.
     * @return chosen cluster center
     */
    public T getCenter() {
        return center;
    }

}
diff --git a/src/main/java/org/apache/commons/math/stat/clustering/Clusterable.java b/src/main/java/org/apache/commons/math/stat/clustering/Clusterable.java
new file mode 100644
index 0000000..65132e6
--- /dev/null
+++ b/src/main/java/org/apache/commons/math/stat/clustering/Clusterable.java
@@ -0,0 +1,46 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.math.stat.clustering;

import java.util.Collection;

/**
 * Interface for points that can be clustered together.
 * @param <T> the type of point that can be clustered
 * @version $Revision: 811685 $ $Date: 2009-09-05 19:36:48 +0200 (sam. 05 sept. 2009) $
 * @since 2.0
 */
public interface Clusterable<T> {

    /**
     * Returns the distance from the given point.
     *
     * @param p the point to compute the distance from
     * @return the distance from the given point
     */
    double distanceFrom(T p);

    /**
     * Returns the centroid of the given Collection of points.
+ * + * @param p the Collection of points to compute the centroid of + * @return the centroid of the given Collection of Points + */ + T centroidOf(Collection<T> p); + +} diff --git a/src/main/java/org/apache/commons/math/stat/clustering/EuclideanIntegerPoint.java b/src/main/java/org/apache/commons/math/stat/clustering/EuclideanIntegerPoint.java new file mode 100644 index 0000000..7fec0ff --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/clustering/EuclideanIntegerPoint.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.math.stat.clustering; + +import java.io.Serializable; +import java.util.Collection; + +import org.apache.commons.math.util.MathUtils; + +/** + * A simple implementation of {@link Clusterable} for points with integer coordinates. + * @version $Revision: 1042376 $ $Date: 2010-12-05 16:54:55 +0100 (dim. 05 déc. 2010) $ + * @since 2.0 + */ +public class EuclideanIntegerPoint implements Clusterable<EuclideanIntegerPoint>, Serializable { + + /** Serializable version identifier. */ + private static final long serialVersionUID = 3946024775784901369L; + + /** Point coordinates. 
*/ + private final int[] point; + + /** + * Build an instance wrapping an integer array. + * <p>The wrapped array is referenced, it is <em>not</em> copied.</p> + * @param point the n-dimensional point in integer space + */ + public EuclideanIntegerPoint(final int[] point) { + this.point = point; + } + + /** + * Get the n-dimensional point in integer space. + * @return a reference (not a copy!) to the wrapped array + */ + public int[] getPoint() { + return point; + } + + /** {@inheritDoc} */ + public double distanceFrom(final EuclideanIntegerPoint p) { + return MathUtils.distance(point, p.getPoint()); + } + + /** {@inheritDoc} */ + public EuclideanIntegerPoint centroidOf(final Collection<EuclideanIntegerPoint> points) { + int[] centroid = new int[getPoint().length]; + for (EuclideanIntegerPoint p : points) { + for (int i = 0; i < centroid.length; i++) { + centroid[i] += p.getPoint()[i]; + } + } + for (int i = 0; i < centroid.length; i++) { + centroid[i] /= points.size(); + } + return new EuclideanIntegerPoint(centroid); + } + + /** {@inheritDoc} */ + @Override + public boolean equals(final Object other) { + if (!(other instanceof EuclideanIntegerPoint)) { + return false; + } + final int[] otherPoint = ((EuclideanIntegerPoint) other).getPoint(); + if (point.length != otherPoint.length) { + return false; + } + for (int i = 0; i < point.length; i++) { + if (point[i] != otherPoint[i]) { + return false; + } + } + return true; + } + + /** {@inheritDoc} */ + @Override + public int hashCode() { + int hashCode = 0; + for (Integer i : point) { + hashCode += i.hashCode() * 13 + 7; + } + return hashCode; + } + + /** + * {@inheritDoc} + * @since 2.1 + */ + @Override + public String toString() { + final StringBuilder buff = new StringBuilder("("); + final int[] coordinates = getPoint(); + for (int i = 0; i < coordinates.length; i++) { + buff.append(coordinates[i]); + if (i < coordinates.length - 1) { + buff.append(","); + } + } + buff.append(")"); + return buff.toString(); + } + 
+} diff --git a/src/main/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClusterer.java b/src/main/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClusterer.java new file mode 100644 index 0000000..eb61866 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClusterer.java @@ -0,0 +1,333 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.math.stat.clustering; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.Random; + +import org.apache.commons.math.exception.ConvergenceException; +import org.apache.commons.math.exception.util.LocalizedFormats; +import org.apache.commons.math.stat.descriptive.moment.Variance; + +/** + * Clustering algorithm based on David Arthur and Sergei Vassilvitski k-means++ algorithm. + * @param <T> type of the points to cluster + * @see <a href="http://en.wikipedia.org/wiki/K-means%2B%2B">K-means++ (wikipedia)</a> + * @version $Revision: 1054333 $ $Date: 2011-01-02 01:34:58 +0100 (dim. 02 janv. 2011) $ + * @since 2.0 + */ +public class KMeansPlusPlusClusterer<T extends Clusterable<T>> { + + /** Strategies to use for replacing an empty cluster. 
*/ + public static enum EmptyClusterStrategy { + + /** Split the cluster with largest distance variance. */ + LARGEST_VARIANCE, + + /** Split the cluster with largest number of points. */ + LARGEST_POINTS_NUMBER, + + /** Create a cluster around the point farthest from its centroid. */ + FARTHEST_POINT, + + /** Generate an error. */ + ERROR + + } + + /** Random generator for choosing initial centers. */ + private final Random random; + + /** Selected strategy for empty clusters. */ + private final EmptyClusterStrategy emptyStrategy; + + /** Build a clusterer. + * <p> + * The default strategy for handling empty clusters that may appear during + * algorithm iterations is to split the cluster with largest distance variance. + * </p> + * @param random random generator to use for choosing initial centers + */ + public KMeansPlusPlusClusterer(final Random random) { + this(random, EmptyClusterStrategy.LARGEST_VARIANCE); + } + + /** Build a clusterer. + * @param random random generator to use for choosing initial centers + * @param emptyStrategy strategy to use for handling empty clusters that + * may appear during algorithm iterations + * @since 2.2 + */ + public KMeansPlusPlusClusterer(final Random random, final EmptyClusterStrategy emptyStrategy) { + this.random = random; + this.emptyStrategy = emptyStrategy; + } + + /** + * Runs the K-means++ clustering algorithm. + * + * @param points the points to cluster + * @param k the number of clusters to split the data into + * @param maxIterations the maximum number of iterations to run the algorithm + * for. 
If negative, no maximum will be used + * @return a list of clusters containing the points + */ + public List<Cluster<T>> cluster(final Collection<T> points, + final int k, final int maxIterations) { + // create the initial clusters + List<Cluster<T>> clusters = chooseInitialCenters(points, k, random); + assignPointsToClusters(clusters, points); + + // iterate through updating the centers until we're done + final int max = (maxIterations < 0) ? Integer.MAX_VALUE : maxIterations; + for (int count = 0; count < max; count++) { + boolean clusteringChanged = false; + List<Cluster<T>> newClusters = new ArrayList<Cluster<T>>(); + for (final Cluster<T> cluster : clusters) { + final T newCenter; + if (cluster.getPoints().isEmpty()) { + switch (emptyStrategy) { + case LARGEST_VARIANCE : + newCenter = getPointFromLargestVarianceCluster(clusters); + break; + case LARGEST_POINTS_NUMBER : + newCenter = getPointFromLargestNumberCluster(clusters); + break; + case FARTHEST_POINT : + newCenter = getFarthestPoint(clusters); + break; + default : + throw new ConvergenceException(LocalizedFormats.EMPTY_CLUSTER_IN_K_MEANS); + } + clusteringChanged = true; + } else { + newCenter = cluster.getCenter().centroidOf(cluster.getPoints()); + if (!newCenter.equals(cluster.getCenter())) { + clusteringChanged = true; + } + } + newClusters.add(new Cluster<T>(newCenter)); + } + if (!clusteringChanged) { + return clusters; + } + assignPointsToClusters(newClusters, points); + clusters = newClusters; + } + return clusters; + } + + /** + * Adds the given points to the closest {@link Cluster}. 
+ * + * @param <T> type of the points to cluster + * @param clusters the {@link Cluster}s to add the points to + * @param points the points to add to the given {@link Cluster}s + */ + private static <T extends Clusterable<T>> void + assignPointsToClusters(final Collection<Cluster<T>> clusters, final Collection<T> points) { + for (final T p : points) { + Cluster<T> cluster = getNearestCluster(clusters, p); + cluster.addPoint(p); + } + } + + /** + * Use K-means++ to choose the initial centers. + * + * @param <T> type of the points to cluster + * @param points the points to choose the initial centers from + * @param k the number of centers to choose + * @param random random generator to use + * @return the initial centers + */ + private static <T extends Clusterable<T>> List<Cluster<T>> + chooseInitialCenters(final Collection<T> points, final int k, final Random random) { + + final List<T> pointSet = new ArrayList<T>(points); + final List<Cluster<T>> resultSet = new ArrayList<Cluster<T>>(); + + // Choose one center uniformly at random from among the data points. + final T firstPoint = pointSet.remove(random.nextInt(pointSet.size())); + resultSet.add(new Cluster<T>(firstPoint)); + + final double[] dx2 = new double[pointSet.size()]; + while (resultSet.size() < k) { + // For each data point x, compute D(x), the distance between x and + // the nearest center that has already been chosen. + int sum = 0; + for (int i = 0; i < pointSet.size(); i++) { + final T p = pointSet.get(i); + final Cluster<T> nearest = getNearestCluster(resultSet, p); + final double d = p.distanceFrom(nearest.getCenter()); + sum += d * d; + dx2[i] = sum; + } + + // Add one new data point as a center. 
Each point x is chosen with + // probability proportional to D(x)2 + final double r = random.nextDouble() * sum; + for (int i = 0 ; i < dx2.length; i++) { + if (dx2[i] >= r) { + final T p = pointSet.remove(i); + resultSet.add(new Cluster<T>(p)); + break; + } + } + } + + return resultSet; + + } + + /** + * Get a random point from the {@link Cluster} with the largest distance variance. + * + * @param clusters the {@link Cluster}s to search + * @return a random point from the selected cluster + */ + private T getPointFromLargestVarianceCluster(final Collection<Cluster<T>> clusters) { + + double maxVariance = Double.NEGATIVE_INFINITY; + Cluster<T> selected = null; + for (final Cluster<T> cluster : clusters) { + if (!cluster.getPoints().isEmpty()) { + + // compute the distance variance of the current cluster + final T center = cluster.getCenter(); + final Variance stat = new Variance(); + for (final T point : cluster.getPoints()) { + stat.increment(point.distanceFrom(center)); + } + final double variance = stat.getResult(); + + // select the cluster with the largest variance + if (variance > maxVariance) { + maxVariance = variance; + selected = cluster; + } + + } + } + + // did we find at least one non-empty cluster ? 
+ if (selected == null) { + throw new ConvergenceException(LocalizedFormats.EMPTY_CLUSTER_IN_K_MEANS); + } + + // extract a random point from the cluster + final List<T> selectedPoints = selected.getPoints(); + return selectedPoints.remove(random.nextInt(selectedPoints.size())); + + } + + /** + * Get a random point from the {@link Cluster} with the largest number of points + * + * @param clusters the {@link Cluster}s to search + * @return a random point from the selected cluster + */ + private T getPointFromLargestNumberCluster(final Collection<Cluster<T>> clusters) { + + int maxNumber = 0; + Cluster<T> selected = null; + for (final Cluster<T> cluster : clusters) { + + // get the number of points of the current cluster + final int number = cluster.getPoints().size(); + + // select the cluster with the largest number of points + if (number > maxNumber) { + maxNumber = number; + selected = cluster; + } + + } + + // did we find at least one non-empty cluster ? + if (selected == null) { + throw new ConvergenceException(LocalizedFormats.EMPTY_CLUSTER_IN_K_MEANS); + } + + // extract a random point from the cluster + final List<T> selectedPoints = selected.getPoints(); + return selectedPoints.remove(random.nextInt(selectedPoints.size())); + + } + + /** + * Get the point farthest to its cluster center + * + * @param clusters the {@link Cluster}s to search + * @return point farthest to its cluster center + */ + private T getFarthestPoint(final Collection<Cluster<T>> clusters) { + + double maxDistance = Double.NEGATIVE_INFINITY; + Cluster<T> selectedCluster = null; + int selectedPoint = -1; + for (final Cluster<T> cluster : clusters) { + + // get the farthest point + final T center = cluster.getCenter(); + final List<T> points = cluster.getPoints(); + for (int i = 0; i < points.size(); ++i) { + final double distance = points.get(i).distanceFrom(center); + if (distance > maxDistance) { + maxDistance = distance; + selectedCluster = cluster; + selectedPoint = i; + } + } + + } + 
+ // did we find at least one non-empty cluster ? + if (selectedCluster == null) { + throw new ConvergenceException(LocalizedFormats.EMPTY_CLUSTER_IN_K_MEANS); + } + + return selectedCluster.getPoints().remove(selectedPoint); + + } + + /** + * Returns the nearest {@link Cluster} to the given point + * + * @param <T> type of the points to cluster + * @param clusters the {@link Cluster}s to search + * @param point the point to find the nearest {@link Cluster} for + * @return the nearest {@link Cluster} to the given point + */ + private static <T extends Clusterable<T>> Cluster<T> + getNearestCluster(final Collection<Cluster<T>> clusters, final T point) { + double minDistance = Double.MAX_VALUE; + Cluster<T> minCluster = null; + for (final Cluster<T> c : clusters) { + final double distance = point.distanceFrom(c.getCenter()); + if (distance < minDistance) { + minDistance = distance; + minCluster = c; + } + } + return minCluster; + } + +} diff --git a/src/main/java/org/apache/commons/math/stat/clustering/package.html b/src/main/java/org/apache/commons/math/stat/clustering/package.html new file mode 100644 index 0000000..21e9079 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/clustering/package.html @@ -0,0 +1,20 @@ +<html> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. + --> + <!-- $Revision: 770979 $ $Date: 2009-05-02 21:34:51 +0200 (sam. 02 mai 2009) $ --> + <body>Clustering algorithms</body> +</html> diff --git a/src/main/java/org/apache/commons/math/stat/correlation/Covariance.java b/src/main/java/org/apache/commons/math/stat/correlation/Covariance.java new file mode 100644 index 0000000..393a02d --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/correlation/Covariance.java @@ -0,0 +1,274 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.correlation; + +import org.apache.commons.math.MathRuntimeException; +import org.apache.commons.math.exception.util.LocalizedFormats; +import org.apache.commons.math.linear.RealMatrix; +import org.apache.commons.math.linear.BlockRealMatrix; +import org.apache.commons.math.stat.descriptive.moment.Mean; +import org.apache.commons.math.stat.descriptive.moment.Variance; + +/** + * Computes covariances for pairs of arrays or columns of a matrix. + * + * <p>The constructors that take <code>RealMatrix</code> or + * <code>double[][]</code> arguments generate covariance matrices. 
The + * columns of the input matrices are assumed to represent variable values.</p> + * + * <p>The constructor argument <code>biasCorrected</code> determines whether or + * not computed covariances are bias-corrected.</p> + * + * <p>Unbiased covariances are given by the formula</p> + * <code>cov(X, Y) = Σ[(x<sub>i</sub> - E(X))(y<sub>i</sub> - E(Y))] / (n - 1)</code> + * where <code>E(X)</code> is the mean of <code>X</code> and <code>E(Y)</code> + * is the mean of the <code>Y</code> values. + * + * <p>Non-bias-corrected estimates use <code>n</code> in place of <code>n - 1</code> + * + * @version $Revision: 983921 $ $Date: 2010-08-10 12:46:06 +0200 (mar. 10 août 2010) $ + * @since 2.0 + */ +public class Covariance { + + /** covariance matrix */ + private final RealMatrix covarianceMatrix; + + /** + * Create an empty covariance matrix. + */ + /** Number of observations (length of covariate vectors) */ + private final int n; + + /** + * Create a Covariance with no data + */ + public Covariance() { + super(); + covarianceMatrix = null; + n = 0; + } + + /** + * Create a Covariance matrix from a rectangular array + * whose columns represent covariates. + * + * <p>The <code>biasCorrected</code> parameter determines whether or not + * covariance estimates are bias-corrected.</p> + * + * <p>The input array must be rectangular with at least two columns + * and two rows.</p> + * + * @param data rectangular array with columns representing covariates + * @param biasCorrected true means covariances are bias-corrected + * @throws IllegalArgumentException if the input data array is not + * rectangular with at least two rows and two columns. + */ + public Covariance(double[][] data, boolean biasCorrected) { + this(new BlockRealMatrix(data), biasCorrected); + } + + /** + * Create a Covariance matrix from a rectangular array + * whose columns represent covariates. 
+ * + * <p>The input array must be rectangular with at least two columns + * and two rows</p> + * + * @param data rectangular array with columns representing covariates + * @throws IllegalArgumentException if the input data array is not + * rectangular with at least two rows and two columns. + */ + public Covariance(double[][] data) { + this(data, true); + } + + /** + * Create a covariance matrix from a matrix whose columns + * represent covariates. + * + * <p>The <code>biasCorrected</code> parameter determines whether or not + * covariance estimates are bias-corrected.</p> + * + * <p>The matrix must have at least two columns and two rows</p> + * + * @param matrix matrix with columns representing covariates + * @param biasCorrected true means covariances are bias-corrected + * @throws IllegalArgumentException if the input matrix does not have + * at least two rows and two columns + */ + public Covariance(RealMatrix matrix, boolean biasCorrected) { + checkSufficientData(matrix); + n = matrix.getRowDimension(); + covarianceMatrix = computeCovarianceMatrix(matrix, biasCorrected); + } + + /** + * Create a covariance matrix from a matrix whose columns + * represent covariates. + * + * <p>The matrix must have at least two columns and two rows</p> + * + * @param matrix matrix with columns representing covariates + * @throws IllegalArgumentException if the input matrix does not have + * at least two rows and two columns + */ + public Covariance(RealMatrix matrix) { + this(matrix, true); + } + + /** + * Returns the covariance matrix + * + * @return covariance matrix + */ + public RealMatrix getCovarianceMatrix() { + return covarianceMatrix; + } + + /** + * Returns the number of observations (length of covariate vectors) + * + * @return number of observations + */ + + public int getN() { + return n; + } + + /** + * Compute a covariance matrix from a matrix whose columns represent + * covariates. 
+ * @param matrix input matrix (must have at least two columns and two rows) + * @param biasCorrected determines whether or not covariance estimates are bias-corrected + * @return covariance matrix + */ + protected RealMatrix computeCovarianceMatrix(RealMatrix matrix, boolean biasCorrected) { + int dimension = matrix.getColumnDimension(); + Variance variance = new Variance(biasCorrected); + RealMatrix outMatrix = new BlockRealMatrix(dimension, dimension); + for (int i = 0; i < dimension; i++) { + for (int j = 0; j < i; j++) { + double cov = covariance(matrix.getColumn(i), matrix.getColumn(j), biasCorrected); + outMatrix.setEntry(i, j, cov); + outMatrix.setEntry(j, i, cov); + } + outMatrix.setEntry(i, i, variance.evaluate(matrix.getColumn(i))); + } + return outMatrix; + } + + /** + * Create a covariance matrix from a matrix whose columns represent + * covariates. Covariances are computed using the bias-corrected formula. + * @param matrix input matrix (must have at least two columns and two rows) + * @return covariance matrix + * @see #Covariance + */ + protected RealMatrix computeCovarianceMatrix(RealMatrix matrix) { + return computeCovarianceMatrix(matrix, true); + } + + /** + * Compute a covariance matrix from a rectangular array whose columns represent + * covariates. + * @param data input array (must have at least two columns and two rows) + * @param biasCorrected determines whether or not covariance estimates are bias-corrected + * @return covariance matrix + */ + protected RealMatrix computeCovarianceMatrix(double[][] data, boolean biasCorrected) { + return computeCovarianceMatrix(new BlockRealMatrix(data), biasCorrected); + } + + /** + * Create a covariance matrix from a rectangual array whose columns represent + * covariates. Covariances are computed using the bias-corrected formula. 
+ * @param data input array (must have at least two columns and two rows) + * @return covariance matrix + * @see #Covariance + */ + protected RealMatrix computeCovarianceMatrix(double[][] data) { + return computeCovarianceMatrix(data, true); + } + + /** + * Computes the covariance between the two arrays. + * + * <p>Array lengths must match and the common length must be at least 2.</p> + * + * @param xArray first data array + * @param yArray second data array + * @param biasCorrected if true, returned value will be bias-corrected + * @return returns the covariance for the two arrays + * @throws IllegalArgumentException if the arrays lengths do not match or + * there is insufficient data + */ + public double covariance(final double[] xArray, final double[] yArray, boolean biasCorrected) + throws IllegalArgumentException { + Mean mean = new Mean(); + double result = 0d; + int length = xArray.length; + if (length != yArray.length) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.DIMENSIONS_MISMATCH_SIMPLE, length, yArray.length); + } else if (length < 2) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.INSUFFICIENT_DIMENSION, length, 2); + } else { + double xMean = mean.evaluate(xArray); + double yMean = mean.evaluate(yArray); + for (int i = 0; i < length; i++) { + double xDev = xArray[i] - xMean; + double yDev = yArray[i] - yMean; + result += (xDev * yDev - result) / (i + 1); + } + } + return biasCorrected ? result * ((double) length / (double)(length - 1)) : result; + } + + /** + * Computes the covariance between the two arrays, using the bias-corrected + * formula. 
+ * + * <p>Array lengths must match and the common length must be at least 2.</p> + * + * @param xArray first data array + * @param yArray second data array + * @return returns the covariance for the two arrays + * @throws IllegalArgumentException if the arrays lengths do not match or + * there is insufficient data + */ + public double covariance(final double[] xArray, final double[] yArray) + throws IllegalArgumentException { + return covariance(xArray, yArray, true); + } + + /** + * Throws IllegalArgumentException of the matrix does not have at least + * two columns and two rows + * @param matrix matrix to check + */ + private void checkSufficientData(final RealMatrix matrix) { + int nRows = matrix.getRowDimension(); + int nCols = matrix.getColumnDimension(); + if (nRows < 2 || nCols < 2) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.INSUFFICIENT_ROWS_AND_COLUMNS, + nRows, nCols); + } + } +} diff --git a/src/main/java/org/apache/commons/math/stat/correlation/PearsonsCorrelation.java b/src/main/java/org/apache/commons/math/stat/correlation/PearsonsCorrelation.java new file mode 100644 index 0000000..6467c69 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/correlation/PearsonsCorrelation.java @@ -0,0 +1,285 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.correlation; + +import org.apache.commons.math.MathException; +import org.apache.commons.math.MathRuntimeException; +import org.apache.commons.math.distribution.TDistribution; +import org.apache.commons.math.distribution.TDistributionImpl; +import org.apache.commons.math.exception.util.LocalizedFormats; +import org.apache.commons.math.exception.NullArgumentException; +import org.apache.commons.math.exception.DimensionMismatchException; +import org.apache.commons.math.linear.RealMatrix; +import org.apache.commons.math.linear.BlockRealMatrix; +import org.apache.commons.math.stat.regression.SimpleRegression; +import org.apache.commons.math.util.FastMath; + +/** + * Computes Pearson's product-moment correlation coefficients for pairs of arrays + * or columns of a matrix. + * + * <p>The constructors that take <code>RealMatrix</code> or + * <code>double[][]</code> arguments generate correlation matrices. The + * columns of the input matrices are assumed to represent variable values. + * Correlations are given by the formula</p> + * <code>cor(X, Y) = Σ[(x<sub>i</sub> - E(X))(y<sub>i</sub> - E(Y))] / [(n - 1)s(X)s(Y)]</code> + * where <code>E(X)</code> is the mean of <code>X</code>, <code>E(Y)</code> + * is the mean of the <code>Y</code> values and s(X), s(Y) are standard deviations. + * + * @version $Revision: 990655 $ $Date: 2010-08-29 23:49:40 +0200 (dim. 
29 août 2010) $ + * @since 2.0 + */ +public class PearsonsCorrelation { + + /** correlation matrix */ + private final RealMatrix correlationMatrix; + + /** number of observations */ + private final int nObs; + + /** + * Create a PearsonsCorrelation instance without data + */ + public PearsonsCorrelation() { + super(); + correlationMatrix = null; + nObs = 0; + } + + /** + * Create a PearsonsCorrelation from a rectangular array + * whose columns represent values of variables to be correlated. + * + * @param data rectangular array with columns representing variables + * @throws IllegalArgumentException if the input data array is not + * rectangular with at least two rows and two columns. + */ + public PearsonsCorrelation(double[][] data) { + this(new BlockRealMatrix(data)); + } + + /** + * Create a PearsonsCorrelation from a RealMatrix whose columns + * represent variables to be correlated. + * + * @param matrix matrix with columns representing variables to correlate + */ + public PearsonsCorrelation(RealMatrix matrix) { + checkSufficientData(matrix); + nObs = matrix.getRowDimension(); + correlationMatrix = computeCorrelationMatrix(matrix); + } + + /** + * Create a PearsonsCorrelation from a {@link Covariance}. The correlation + * matrix is computed by scaling the Covariance's covariance matrix. + * The Covariance instance must have been created from a data matrix with + * columns representing variable values. + * + * @param covariance Covariance instance + */ + public PearsonsCorrelation(Covariance covariance) { + RealMatrix covarianceMatrix = covariance.getCovarianceMatrix(); + if (covarianceMatrix == null) { + throw new NullArgumentException(LocalizedFormats.COVARIANCE_MATRIX); + } + nObs = covariance.getN(); + correlationMatrix = covarianceToCorrelation(covarianceMatrix); + } + + /** + * Create a PearsonsCorrelation from a covariance matrix. The correlation + * matrix is computed by scaling the covariance matrix. 
+ * + * @param covarianceMatrix covariance matrix + * @param numberOfObservations the number of observations in the dataset used to compute + * the covariance matrix + */ + public PearsonsCorrelation(RealMatrix covarianceMatrix, int numberOfObservations) { + nObs = numberOfObservations; + correlationMatrix = covarianceToCorrelation(covarianceMatrix); + + } + + /** + * Returns the correlation matrix + * + * @return correlation matrix + */ + public RealMatrix getCorrelationMatrix() { + return correlationMatrix; + } + + /** + * Returns a matrix of standard errors associated with the estimates + * in the correlation matrix.<br/> + * <code>getCorrelationStandardErrors().getEntry(i,j)</code> is the standard + * error associated with <code>getCorrelationMatrix.getEntry(i,j)</code> + * <p>The formula used to compute the standard error is <br/> + * <code>SE<sub>r</sub> = ((1 - r<sup>2</sup>) / (n - 2))<sup>1/2</sup></code> + * where <code>r</code> is the estimated correlation coefficient and + * <code>n</code> is the number of observations in the source dataset.</p> + * + * @return matrix of correlation standard errors + */ + public RealMatrix getCorrelationStandardErrors() { + int nVars = correlationMatrix.getColumnDimension(); + double[][] out = new double[nVars][nVars]; + for (int i = 0; i < nVars; i++) { + for (int j = 0; j < nVars; j++) { + double r = correlationMatrix.getEntry(i, j); + out[i][j] = FastMath.sqrt((1 - r * r) /(nObs - 2)); + } + } + return new BlockRealMatrix(out); + } + + /** + * Returns a matrix of p-values associated with the (two-sided) null + * hypothesis that the corresponding correlation coefficient is zero. 
+ * <p><code>getCorrelationPValues().getEntry(i,j)</code> is the probability + * that a random variable distributed as <code>t<sub>n-2</sub></code> takes + * a value with absolute value greater than or equal to <br> + * <code>|r|((n - 2) / (1 - r<sup>2</sup>))<sup>1/2</sup></code></p> + * <p>The values in the matrix are sometimes referred to as the + * <i>significance</i> of the corresponding correlation coefficients.</p> + * + * @return matrix of p-values + * @throws MathException if an error occurs estimating probabilities + */ + public RealMatrix getCorrelationPValues() throws MathException { + TDistribution tDistribution = new TDistributionImpl(nObs - 2); + int nVars = correlationMatrix.getColumnDimension(); + double[][] out = new double[nVars][nVars]; + for (int i = 0; i < nVars; i++) { + for (int j = 0; j < nVars; j++) { + if (i == j) { + out[i][j] = 0d; + } else { + double r = correlationMatrix.getEntry(i, j); + double t = FastMath.abs(r * FastMath.sqrt((nObs - 2)/(1 - r * r))); + out[i][j] = 2 * tDistribution.cumulativeProbability(-t); + } + } + } + return new BlockRealMatrix(out); + } + + + /** + * Computes the correlation matrix for the columns of the + * input matrix. + * + * @param matrix matrix with columns representing variables to correlate + * @return correlation matrix + */ + public RealMatrix computeCorrelationMatrix(RealMatrix matrix) { + int nVars = matrix.getColumnDimension(); + RealMatrix outMatrix = new BlockRealMatrix(nVars, nVars); + for (int i = 0; i < nVars; i++) { + for (int j = 0; j < i; j++) { + double corr = correlation(matrix.getColumn(i), matrix.getColumn(j)); + outMatrix.setEntry(i, j, corr); + outMatrix.setEntry(j, i, corr); + } + outMatrix.setEntry(i, i, 1d); + } + return outMatrix; + } + + /** + * Computes the correlation matrix for the columns of the + * input rectangular array. The colums of the array represent values + * of variables to be correlated. 
+ * + * @param data matrix with columns representing variables to correlate + * @return correlation matrix + */ + public RealMatrix computeCorrelationMatrix(double[][] data) { + return computeCorrelationMatrix(new BlockRealMatrix(data)); + } + + /** + * Computes the Pearson's product-moment correlation coefficient between the two arrays. + * + * </p>Throws IllegalArgumentException if the arrays do not have the same length + * or their common length is less than 2</p> + * + * @param xArray first data array + * @param yArray second data array + * @return Returns Pearson's correlation coefficient for the two arrays + * @throws IllegalArgumentException if the arrays lengths do not match or + * there is insufficient data + */ + public double correlation(final double[] xArray, final double[] yArray) throws IllegalArgumentException { + SimpleRegression regression = new SimpleRegression(); + if (xArray.length != yArray.length) { + throw new DimensionMismatchException(xArray.length, yArray.length); + } else if (xArray.length < 2) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.INSUFFICIENT_DIMENSION, xArray.length, 2); + } else { + for(int i=0; i<xArray.length; i++) { + regression.addData(xArray[i], yArray[i]); + } + return regression.getR(); + } + } + + /** + * Derives a correlation matrix from a covariance matrix. 
+ * + * <p>Uses the formula <br/> + * <code>r(X,Y) = cov(X,Y)/s(X)s(Y)</code> where + * <code>r(·,·)</code> is the correlation coefficient and + * <code>s(·)</code> means standard deviation.</p> + * + * @param covarianceMatrix the covariance matrix + * @return correlation matrix + */ + public RealMatrix covarianceToCorrelation(RealMatrix covarianceMatrix) { + int nVars = covarianceMatrix.getColumnDimension(); + RealMatrix outMatrix = new BlockRealMatrix(nVars, nVars); + for (int i = 0; i < nVars; i++) { + double sigma = FastMath.sqrt(covarianceMatrix.getEntry(i, i)); + outMatrix.setEntry(i, i, 1d); + for (int j = 0; j < i; j++) { + double entry = covarianceMatrix.getEntry(i, j) / + (sigma * FastMath.sqrt(covarianceMatrix.getEntry(j, j))); + outMatrix.setEntry(i, j, entry); + outMatrix.setEntry(j, i, entry); + } + } + return outMatrix; + } + + /** + * Throws IllegalArgumentException of the matrix does not have at least + * two columns and two rows + * + * @param matrix matrix to check for sufficiency + */ + private void checkSufficientData(final RealMatrix matrix) { + int nRows = matrix.getRowDimension(); + int nCols = matrix.getColumnDimension(); + if (nRows < 2 || nCols < 2) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.INSUFFICIENT_ROWS_AND_COLUMNS, + nRows, nCols); + } + } +} diff --git a/src/main/java/org/apache/commons/math/stat/correlation/SpearmansCorrelation.java b/src/main/java/org/apache/commons/math/stat/correlation/SpearmansCorrelation.java new file mode 100644 index 0000000..fe121fe --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/correlation/SpearmansCorrelation.java @@ -0,0 +1,172 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.math.stat.correlation; + +import org.apache.commons.math.MathRuntimeException; +import org.apache.commons.math.exception.util.LocalizedFormats; +import org.apache.commons.math.linear.BlockRealMatrix; +import org.apache.commons.math.linear.RealMatrix; +import org.apache.commons.math.stat.ranking.NaturalRanking; +import org.apache.commons.math.stat.ranking.RankingAlgorithm; + +/** + * <p>Spearman's rank correlation. This implementation performs a rank + * transformation on the input data and then computes {@link PearsonsCorrelation} + * on the ranked data.</p> + * + * <p>By default, ranks are computed using {@link NaturalRanking} with default + * strategies for handling NaNs and ties in the data (NaNs maximal, ties averaged). + * The ranking algorithm can be set using a constructor argument.</p> + * + * @since 2.0 + * @version $Revision: 983921 $ $Date: 2010-08-10 12:46:06 +0200 (mar. 10 août 2010) $ + */ + +public class SpearmansCorrelation { + + /** Input data */ + private final RealMatrix data; + + /** Ranking algorithm */ + private final RankingAlgorithm rankingAlgorithm; + + /** Rank correlation */ + private final PearsonsCorrelation rankCorrelation; + + /** + * Create a SpearmansCorrelation with the given input data matrix + * and ranking algorithm. 
+ * + * @param dataMatrix matrix of data with columns representing + * variables to correlate + * @param rankingAlgorithm ranking algorithm + */ + public SpearmansCorrelation(final RealMatrix dataMatrix, final RankingAlgorithm rankingAlgorithm) { + this.data = dataMatrix.copy(); + this.rankingAlgorithm = rankingAlgorithm; + rankTransform(data); + rankCorrelation = new PearsonsCorrelation(data); + } + + /** + * Create a SpearmansCorrelation from the given data matrix. + * + * @param dataMatrix matrix of data with columns representing + * variables to correlate + */ + public SpearmansCorrelation(final RealMatrix dataMatrix) { + this(dataMatrix, new NaturalRanking()); + } + + /** + * Create a SpearmansCorrelation without data. + */ + public SpearmansCorrelation() { + data = null; + this.rankingAlgorithm = new NaturalRanking(); + rankCorrelation = null; + } + + /** + * Calculate the Spearman Rank Correlation Matrix. + * + * @return Spearman Rank Correlation Matrix + */ + public RealMatrix getCorrelationMatrix() { + return rankCorrelation.getCorrelationMatrix(); + } + + /** + * Returns a {@link PearsonsCorrelation} instance constructed from the + * ranked input data. That is, + * <code>new SpearmansCorrelation(matrix).getRankCorrelation()</code> + * is equivalent to + * <code>new PearsonsCorrelation(rankTransform(matrix))</code> where + * <code>rankTransform(matrix)</code> is the result of applying the + * configured <code>RankingAlgorithm</code> to each of the columns of + * <code>matrix.</code> + * + * @return PearsonsCorrelation among ranked column data + */ + public PearsonsCorrelation getRankCorrelation() { + return rankCorrelation; + } + + /** + * Computes the Spearman's rank correlation matrix for the columns of the + * input matrix. 
+ * + * @param matrix matrix with columns representing variables to correlate + * @return correlation matrix + */ + public RealMatrix computeCorrelationMatrix(RealMatrix matrix) { + RealMatrix matrixCopy = matrix.copy(); + rankTransform(matrixCopy); + return new PearsonsCorrelation().computeCorrelationMatrix(matrixCopy); + } + + /** + * Computes the Spearman's rank correlation matrix for the columns of the + * input rectangular array. The columns of the array represent values + * of variables to be correlated. + * + * @param matrix matrix with columns representing variables to correlate + * @return correlation matrix + */ + public RealMatrix computeCorrelationMatrix(double[][] matrix) { + return computeCorrelationMatrix(new BlockRealMatrix(matrix)); + } + + /** + * Computes the Spearman's rank correlation coefficient between the two arrays. + * + * </p>Throws IllegalArgumentException if the arrays do not have the same length + * or their common length is less than 2</p> + * + * @param xArray first data array + * @param yArray second data array + * @return Returns Spearman's rank correlation coefficient for the two arrays + * @throws IllegalArgumentException if the arrays lengths do not match or + * there is insufficient data + */ + public double correlation(final double[] xArray, final double[] yArray) + throws IllegalArgumentException { + if (xArray.length != yArray.length) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.DIMENSIONS_MISMATCH_SIMPLE, xArray.length, yArray.length); + } else if (xArray.length < 2) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.INSUFFICIENT_DIMENSION, xArray.length, 2); + } else { + return new PearsonsCorrelation().correlation(rankingAlgorithm.rank(xArray), + rankingAlgorithm.rank(yArray)); + } + } + + /** + * Applies rank transform to each of the columns of <code>matrix</code> + * using the current <code>rankingAlgorithm</code> + * + * @param matrix matrix to 
transform + */ + private void rankTransform(RealMatrix matrix) { + for (int i = 0; i < matrix.getColumnDimension(); i++) { + matrix.setColumn(i, rankingAlgorithm.rank(matrix.getColumn(i))); + } + } +} diff --git a/src/main/java/org/apache/commons/math/stat/correlation/package.html b/src/main/java/org/apache/commons/math/stat/correlation/package.html new file mode 100644 index 0000000..8b12fc2 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/correlation/package.html @@ -0,0 +1,22 @@ +<html> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + --> + <!-- $Revision: 744716 $ $Date: 2009-02-15 19:38:49 +0100 (dim. 15 févr. 2009) $ --> + <body> + Correlations/Covariance computations. + </body> +</html> diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/AbstractStorelessUnivariateStatistic.java b/src/main/java/org/apache/commons/math/stat/descriptive/AbstractStorelessUnivariateStatistic.java new file mode 100644 index 0000000..9e721ea --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/AbstractStorelessUnivariateStatistic.java @@ -0,0 +1,183 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.descriptive; + +import org.apache.commons.math.exception.util.LocalizedFormats; +import org.apache.commons.math.exception.NullArgumentException; +import org.apache.commons.math.util.MathUtils; + +/** + * + * Abstract implementation of the {@link StorelessUnivariateStatistic} interface. + * <p> + * Provides default <code>evaluate()</code> and <code>incrementAll(double[])<code> + * implementations.</p> + * <p> + * <strong>Note that these implementations are not synchronized.</strong></p> + * + * @version $Revision: 983921 $ $Date: 2010-08-10 12:46:06 +0200 (mar. 10 août 2010) $ + */ +public abstract class AbstractStorelessUnivariateStatistic + extends AbstractUnivariateStatistic + implements StorelessUnivariateStatistic { + + /** + * This default implementation calls {@link #clear}, then invokes + * {@link #increment} in a loop over the the input array, and then uses + * {@link #getResult} to compute the return value. + * <p> + * Note that this implementation changes the internal state of the + * statistic. 
Its side effects are the same as invoking {@link #clear} and + * then {@link #incrementAll(double[])}.</p> + * <p> + * Implementations may override this method with a more efficient and + * possibly more accurate implementation that works directly with the + * input array.</p> + * <p> + * If the array is null, an IllegalArgumentException is thrown.</p> + * @param values input array + * @return the value of the statistic applied to the input array + * @see org.apache.commons.math.stat.descriptive.UnivariateStatistic#evaluate(double[]) + */ + @Override + public double evaluate(final double[] values) { + if (values == null) { + throw new NullArgumentException(LocalizedFormats.INPUT_ARRAY); + } + return evaluate(values, 0, values.length); + } + + /** + * This default implementation calls {@link #clear}, then invokes + * {@link #increment} in a loop over the specified portion of the input + * array, and then uses {@link #getResult} to compute the return value. + * <p> + * Note that this implementation changes the internal state of the + * statistic. 
Its side effects are the same as invoking {@link #clear} and + * then {@link #incrementAll(double[], int, int)}.</p> + * <p> + * Implementations may override this method with a more efficient and + * possibly more accurate implementation that works directly with the + * input array.</p> + * <p> + * If the array is null or the index parameters are not valid, an + * IllegalArgumentException is thrown.</p> + * @param values the input array + * @param begin the index of the first element to include + * @param length the number of elements to include + * @return the value of the statistic applied to the included array entries + * @see org.apache.commons.math.stat.descriptive.UnivariateStatistic#evaluate(double[], int, int) + */ + @Override + public double evaluate(final double[] values, final int begin, final int length) { + if (test(values, begin, length)) { + clear(); + incrementAll(values, begin, length); + } + return getResult(); + } + + /** + * {@inheritDoc} + */ + @Override + public abstract StorelessUnivariateStatistic copy(); + + /** + * {@inheritDoc} + */ + public abstract void clear(); + + /** + * {@inheritDoc} + */ + public abstract double getResult(); + + /** + * {@inheritDoc} + */ + public abstract void increment(final double d); + + /** + * This default implementation just calls {@link #increment} in a loop over + * the input array. + * <p> + * Throws IllegalArgumentException if the input values array is null.</p> + * + * @param values values to add + * @throws IllegalArgumentException if values is null + * @see org.apache.commons.math.stat.descriptive.StorelessUnivariateStatistic#incrementAll(double[]) + */ + public void incrementAll(double[] values) { + if (values == null) { + throw new NullArgumentException(LocalizedFormats.INPUT_ARRAY); + } + incrementAll(values, 0, values.length); + } + + /** + * This default implementation just calls {@link #increment} in a loop over + * the specified portion of the input array. 
+ * <p> + * Throws IllegalArgumentException if the input values array is null.</p> + * + * @param values array holding values to add + * @param begin index of the first array element to add + * @param length number of array elements to add + * @throws IllegalArgumentException if values is null + * @see org.apache.commons.math.stat.descriptive.StorelessUnivariateStatistic#incrementAll(double[], int, int) + */ + public void incrementAll(double[] values, int begin, int length) { + if (test(values, begin, length)) { + int k = begin + length; + for (int i = begin; i < k; i++) { + increment(values[i]); + } + } + } + + /** + * Returns true iff <code>object</code> is an + * <code>AbstractStorelessUnivariateStatistic</code> returning the same + * values as this for <code>getResult()</code> and <code>getN()</code> + * @param object object to test equality against. + * @return true if object returns the same value as this + */ + @Override + public boolean equals(Object object) { + if (object == this ) { + return true; + } + if (object instanceof AbstractStorelessUnivariateStatistic == false) { + return false; + } + AbstractStorelessUnivariateStatistic stat = (AbstractStorelessUnivariateStatistic) object; + return MathUtils.equalsIncludingNaN(stat.getResult(), this.getResult()) && + MathUtils.equalsIncludingNaN(stat.getN(), this.getN()); + } + + /** + * Returns hash code based on getResult() and getN() + * + * @return hash code + */ + @Override + public int hashCode() { + return 31* (31 + MathUtils.hash(getResult())) + MathUtils.hash(getN()); + } + +} diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/AbstractUnivariateStatistic.java b/src/main/java/org/apache/commons/math/stat/descriptive/AbstractUnivariateStatistic.java new file mode 100644 index 0000000..354dee6 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/AbstractUnivariateStatistic.java @@ -0,0 +1,232 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or 
more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.descriptive; + +import org.apache.commons.math.MathRuntimeException; +import org.apache.commons.math.exception.DimensionMismatchException; +import org.apache.commons.math.exception.NotPositiveException; +import org.apache.commons.math.exception.NullArgumentException; +import org.apache.commons.math.exception.util.LocalizedFormats; + +/** + * Abstract base class for all implementations of the + * {@link UnivariateStatistic} interface. + * <p> + * Provides a default implementation of <code>evaluate(double[]),</code> + * delegating to <code>evaluate(double[], int, int)</code> in the natural way. + * </p> + * <p> + * Also includes a <code>test</code> method that performs generic parameter + * validation for the <code>evaluate</code> methods.</p> + * + * @version $Revision: 1006299 $ $Date: 2010-10-10 16:47:17 +0200 (dim. 10 oct. 2010) $ + */ +public abstract class AbstractUnivariateStatistic + implements UnivariateStatistic { + + /** Stored data. */ + private double[] storedData; + + /** + * Set the data array. 
+ * <p> + * The stored value is a copy of the parameter array, not the array itself + * </p> + * @param values data array to store (may be null to remove stored data) + * @see #evaluate() + */ + public void setData(final double[] values) { + storedData = (values == null) ? null : values.clone(); + } + + /** + * Get a copy of the stored data array. + * @return copy of the stored data array (may be null) + */ + public double[] getData() { + return (storedData == null) ? null : storedData.clone(); + } + + /** + * Get a reference to the stored data array. + * @return reference to the stored data array (may be null) + */ + protected double[] getDataRef() { + return storedData; + } + + /** + * Set the data array. + * @param values data array to store + * @param begin the index of the first element to include + * @param length the number of elements to include + * @see #evaluate() + */ + public void setData(final double[] values, final int begin, final int length) { + storedData = new double[length]; + System.arraycopy(values, begin, storedData, 0, length); + } + + /** + * Returns the result of evaluating the statistic over the stored data. + * <p> + * The stored array is the one which was set by previous calls to + * </p> + * @return the value of the statistic applied to the stored data + */ + public double evaluate() { + return evaluate(storedData); + } + + /** + * {@inheritDoc} + */ + public double evaluate(final double[] values) { + test(values, 0, 0); + return evaluate(values, 0, values.length); + } + + /** + * {@inheritDoc} + */ + public abstract double evaluate(final double[] values, final int begin, final int length); + + /** + * {@inheritDoc} + */ + public abstract UnivariateStatistic copy(); + + /** + * This method is used by <code>evaluate(double[], int, int)</code> methods + * to verify that the input parameters designate a subarray of positive length. 
+ * <p> + * <ul> + * <li>returns <code>true</code> iff the parameters designate a subarray of + * positive length</li> + * <li>throws <code>IllegalArgumentException</code> if the array is null or + * or the indices are invalid</li> + * <li>returns <code>false</li> if the array is non-null, but + * <code>length</code> is 0. + * </ul></p> + * + * @param values the input array + * @param begin index of the first array element to include + * @param length the number of elements to include + * @return true if the parameters are valid and designate a subarray of positive length + * @throws IllegalArgumentException if the indices are invalid or the array is null + */ + protected boolean test( + final double[] values, + final int begin, + final int length) { + + if (values == null) { + throw new NullArgumentException(LocalizedFormats.INPUT_ARRAY); + } + + if (begin < 0) { + throw new NotPositiveException(LocalizedFormats.START_POSITION, begin); + } + + if (length < 0) { + throw new NotPositiveException(LocalizedFormats.LENGTH, length); + } + + if (begin + length > values.length) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.SUBARRAY_ENDS_AFTER_ARRAY_END); + } + + if (length == 0) { + return false; + } + + return true; + + } + + /** + * This method is used by <code>evaluate(double[], double[], int, int)</code> methods + * to verify that the begin and length parameters designate a subarray of positive length + * and the weights are all non-negative, non-NaN, finite, and not all zero. 
+ * <p> + * <ul> + * <li>returns <code>true</code> iff the parameters designate a subarray of + * positive length and the weights array contains legitimate values.</li> + * <li>throws <code>IllegalArgumentException</code> if any of the following are true: + * <ul><li>the values array is null</li> + * <li>the weights array is null</li> + * <li>the weights array does not have the same length as the values array</li> + * <li>the weights array contains one or more infinite values</li> + * <li>the weights array contains one or more NaN values</li> + * <li>the weights array contains negative values</li> + * <li>the start and length arguments do not determine a valid array</li></ul> + * </li> + * <li>returns <code>false</li> if the array is non-null, but + * <code>length</code> is 0. + * </ul></p> + * + * @param values the input array + * @param weights the weights array + * @param begin index of the first array element to include + * @param length the number of elements to include + * @return true if the parameters are valid and designate a subarray of positive length + * @throws IllegalArgumentException if the indices are invalid or the array is null + * @since 2.1 + */ + protected boolean test( + final double[] values, + final double[] weights, + final int begin, + final int length) { + + if (weights == null) { + throw new NullArgumentException(LocalizedFormats.INPUT_ARRAY); + } + + if (weights.length != values.length) { + throw new DimensionMismatchException(weights.length, values.length); + } + + boolean containsPositiveWeight = false; + for (int i = begin; i < begin + length; i++) { + if (Double.isNaN(weights[i])) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.NAN_ELEMENT_AT_INDEX, i); + } + if (Double.isInfinite(weights[i])) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.INFINITE_ARRAY_ELEMENT, weights[i], i); + } + if (weights[i] < 0) { + throw 
MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.NEGATIVE_ELEMENT_AT_INDEX, i, weights[i]); + } + if (!containsPositiveWeight && weights[i] > 0.0) { + containsPositiveWeight = true; + } + } + + if (!containsPositiveWeight) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.WEIGHT_AT_LEAST_ONE_NON_ZERO); + } + + return test(values, begin, length); + } +} + diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/AggregateSummaryStatistics.java b/src/main/java/org/apache/commons/math/stat/descriptive/AggregateSummaryStatistics.java new file mode 100644 index 0000000..98c58c8 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/AggregateSummaryStatistics.java @@ -0,0 +1,416 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.math.stat.descriptive; + +import java.io.Serializable; +import java.util.Collection; +import java.util.Iterator; + +/** + * <p> + * An aggregator for {@code SummaryStatistics} from several data sets or + * data set partitions. 
In its simplest usage mode, the client creates an + * instance via the zero-argument constructor, then uses + * {@link #createContributingStatistics()} to obtain a {@code SummaryStatistics} + * for each individual data set / partition. The per-set statistics objects + * are used as normal, and at any time the aggregate statistics for all the + * contributors can be obtained from this object. + * </p><p> + * Clients with specialized requirements can use alternative constructors to + * control the statistics implementations and initial values used by the + * contributing and the internal aggregate {@code SummaryStatistics} objects. + * </p><p> + * A static {@link #aggregate(Collection)} method is also included that computes + * aggregate statistics directly from a Collection of SummaryStatistics instances. + * </p><p> + * When {@link #createContributingStatistics()} is used to create SummaryStatistics + * instances to be aggregated concurrently, the created instances' + * {@link SummaryStatistics#addValue(double)} methods must synchronize on the aggregating + * instance maintained by this class. In multithreaded environments, if the functionality + * provided by {@link #aggregate(Collection)} is adequate, that method should be used + * to avoid unecessary computation and synchronization delays.</p> + * + * @since 2.0 + * @version $Revision: 811833 $ $Date: 2009-09-06 18:27:50 +0200 (dim. 06 sept. 2009) $ + * + */ +public class AggregateSummaryStatistics implements StatisticalSummary, + Serializable { + + + /** Serializable version identifier */ + private static final long serialVersionUID = -8207112444016386906L; + + /** + * A SummaryStatistics serving as a prototype for creating SummaryStatistics + * contributing to this aggregate + */ + private final SummaryStatistics statisticsPrototype; + + /** + * The SummaryStatistics in which aggregate statistics are accumulated. 
+ */ + private final SummaryStatistics statistics; + + /** + * Initializes a new AggregateSummaryStatistics with default statistics + * implementations. + * + */ + public AggregateSummaryStatistics() { + this(new SummaryStatistics()); + } + + /** + * Initializes a new AggregateSummaryStatistics with the specified statistics + * object as a prototype for contributing statistics and for the internal + * aggregate statistics. This provides for customized statistics implementations + * to be used by contributing and aggregate statistics. + * + * @param prototypeStatistics a {@code SummaryStatistics} serving as a + * prototype both for the internal aggregate statistics and for + * contributing statistics obtained via the + * {@code createContributingStatistics()} method. Being a prototype + * means that other objects are initialized by copying this object's state. + * If {@code null}, a new, default statistics object is used. Any statistic + * values in the prototype are propagated to contributing statistics + * objects and (once) into these aggregate statistics. + * @see #createContributingStatistics() + */ + public AggregateSummaryStatistics(SummaryStatistics prototypeStatistics) { + this(prototypeStatistics, + prototypeStatistics == null ? null : new SummaryStatistics(prototypeStatistics)); + } + + /** + * Initializes a new AggregateSummaryStatistics with the specified statistics + * object as a prototype for contributing statistics and for the internal + * aggregate statistics. This provides for different statistics implementations + * to be used by contributing and aggregate statistics and for an initial + * state to be supplied for the aggregate statistics. + * + * @param prototypeStatistics a {@code SummaryStatistics} serving as a + * prototype both for the internal aggregate statistics and for + * contributing statistics obtained via the + * {@code createContributingStatistics()} method. 
Being a prototype + * means that other objects are initialized by copying this object's state. + * If {@code null}, a new, default statistics object is used. Any statistic + * values in the prototype are propagated to contributing statistics + * objects, but not into these aggregate statistics. + * @param initialStatistics a {@code SummaryStatistics} to serve as the + * internal aggregate statistics object. If {@code null}, a new, default + * statistics object is used. + * @see #createContributingStatistics() + */ + public AggregateSummaryStatistics(SummaryStatistics prototypeStatistics, + SummaryStatistics initialStatistics) { + this.statisticsPrototype = + (prototypeStatistics == null) ? new SummaryStatistics() : prototypeStatistics; + this.statistics = + (initialStatistics == null) ? new SummaryStatistics() : initialStatistics; + } + + /** + * {@inheritDoc}. This version returns the maximum over all the aggregated + * data. + * + * @see StatisticalSummary#getMax() + */ + public double getMax() { + synchronized (statistics) { + return statistics.getMax(); + } + } + + /** + * {@inheritDoc}. This version returns the mean of all the aggregated data. + * + * @see StatisticalSummary#getMean() + */ + public double getMean() { + synchronized (statistics) { + return statistics.getMean(); + } + } + + /** + * {@inheritDoc}. This version returns the minimum over all the aggregated + * data. + * + * @see StatisticalSummary#getMin() + */ + public double getMin() { + synchronized (statistics) { + return statistics.getMin(); + } + } + + /** + * {@inheritDoc}. This version returns a count of all the aggregated data. + * + * @see StatisticalSummary#getN() + */ + public long getN() { + synchronized (statistics) { + return statistics.getN(); + } + } + + /** + * {@inheritDoc}. This version returns the standard deviation of all the + * aggregated data. 
+ * + * @see StatisticalSummary#getStandardDeviation() + */ + public double getStandardDeviation() { + synchronized (statistics) { + return statistics.getStandardDeviation(); + } + } + + /** + * {@inheritDoc}. This version returns a sum of all the aggregated data. + * + * @see StatisticalSummary#getSum() + */ + public double getSum() { + synchronized (statistics) { + return statistics.getSum(); + } + } + + /** + * {@inheritDoc}. This version returns the variance of all the aggregated + * data. + * + * @see StatisticalSummary#getVariance() + */ + public double getVariance() { + synchronized (statistics) { + return statistics.getVariance(); + } + } + + /** + * Returns the sum of the logs of all the aggregated data. + * + * @return the sum of logs + * @see SummaryStatistics#getSumOfLogs() + */ + public double getSumOfLogs() { + synchronized (statistics) { + return statistics.getSumOfLogs(); + } + } + + /** + * Returns the geometric mean of all the aggregated data. + * + * @return the geometric mean + * @see SummaryStatistics#getGeometricMean() + */ + public double getGeometricMean() { + synchronized (statistics) { + return statistics.getGeometricMean(); + } + } + + /** + * Returns the sum of the squares of all the aggregated data. + * + * @return The sum of squares + * @see SummaryStatistics#getSumsq() + */ + public double getSumsq() { + synchronized (statistics) { + return statistics.getSumsq(); + } + } + + /** + * Returns a statistic related to the Second Central Moment. Specifically, + * what is returned is the sum of squared deviations from the sample mean + * among the all of the aggregated data. + * + * @return second central moment statistic + * @see SummaryStatistics#getSecondMoment() + */ + public double getSecondMoment() { + synchronized (statistics) { + return statistics.getSecondMoment(); + } + } + + /** + * Return a {@link StatisticalSummaryValues} instance reporting current + * aggregate statistics. 
+ * + * @return Current values of aggregate statistics + */ + public StatisticalSummary getSummary() { + synchronized (statistics) { + return new StatisticalSummaryValues(getMean(), getVariance(), getN(), + getMax(), getMin(), getSum()); + } + } + + /** + * Creates and returns a {@code SummaryStatistics} whose data will be + * aggregated with those of this {@code AggregateSummaryStatistics}. + * + * @return a {@code SummaryStatistics} whose data will be aggregated with + * those of this {@code AggregateSummaryStatistics}. The initial state + * is a copy of the configured prototype statistics. + */ + public SummaryStatistics createContributingStatistics() { + SummaryStatistics contributingStatistics + = new AggregatingSummaryStatistics(statistics); + + SummaryStatistics.copy(statisticsPrototype, contributingStatistics); + + return contributingStatistics; + } + + /** + * Computes aggregate summary statistics. This method can be used to combine statistics + * computed over partitions or subsamples - i.e., the StatisticalSummaryValues returned + * should contain the same values that would have been obtained by computing a single + * StatisticalSummary over the combined dataset. + * <p> + * Returns null if the collection is empty or null. 
+ * </p> + * + * @param statistics collection of SummaryStatistics to aggregate + * @return summary statistics for the combined dataset + */ + public static StatisticalSummaryValues aggregate(Collection<SummaryStatistics> statistics) { + if (statistics == null) { + return null; + } + Iterator<SummaryStatistics> iterator = statistics.iterator(); + if (!iterator.hasNext()) { + return null; + } + SummaryStatistics current = iterator.next(); + long n = current.getN(); + double min = current.getMin(); + double sum = current.getSum(); + double max = current.getMax(); + double m2 = current.getSecondMoment(); + double mean = current.getMean(); + while (iterator.hasNext()) { + current = iterator.next(); + if (current.getMin() < min || Double.isNaN(min)) { + min = current.getMin(); + } + if (current.getMax() > max || Double.isNaN(max)) { + max = current.getMax(); + } + sum += current.getSum(); + final double oldN = n; + final double curN = current.getN(); + n += curN; + final double meanDiff = current.getMean() - mean; + mean = sum / n; + m2 = m2 + current.getSecondMoment() + meanDiff * meanDiff * oldN * curN / n; + } + final double variance; + if (n == 0) { + variance = Double.NaN; + } else if (n == 1) { + variance = 0d; + } else { + variance = m2 / (n - 1); + } + return new StatisticalSummaryValues(mean, variance, n, max, min, sum); + } + + /** + * A SummaryStatistics that also forwards all values added to it to a second + * {@code SummaryStatistics} for aggregation. 
+ * + * @since 2.0 + */ + private static class AggregatingSummaryStatistics extends SummaryStatistics { + + /** + * The serialization version of this class + */ + private static final long serialVersionUID = 1L; + + /** + * An additional SummaryStatistics into which values added to these + * statistics (and possibly others) are aggregated + */ + private final SummaryStatistics aggregateStatistics; + + /** + * Initializes a new AggregatingSummaryStatistics with the specified + * aggregate statistics object + * + * @param aggregateStatistics a {@code SummaryStatistics} into which + * values added to this statistics object should be aggregated + */ + public AggregatingSummaryStatistics(SummaryStatistics aggregateStatistics) { + this.aggregateStatistics = aggregateStatistics; + } + + /** + * {@inheritDoc}. This version adds the provided value to the configured + * aggregate after adding it to these statistics. + * + * @see SummaryStatistics#addValue(double) + */ + @Override + public void addValue(double value) { + super.addValue(value); + synchronized (aggregateStatistics) { + aggregateStatistics.addValue(value); + } + } + + /** + * Returns true iff <code>object</code> is a + * <code>SummaryStatistics</code> instance and all statistics have the + * same values as this. + * @param object the object to test equality against. 
+ * @return true if object equals this + */ + @Override + public boolean equals(Object object) { + if (object == this) { + return true; + } + if (object instanceof AggregatingSummaryStatistics == false) { + return false; + } + AggregatingSummaryStatistics stat = (AggregatingSummaryStatistics)object; + return super.equals(stat) && + aggregateStatistics.equals(stat.aggregateStatistics); + } + + /** + * Returns hash code based on values of statistics + * @return hash code + */ + @Override + public int hashCode() { + return 123 + super.hashCode() + aggregateStatistics.hashCode(); + } + } +} diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/DescriptiveStatistics.java b/src/main/java/org/apache/commons/math/stat/descriptive/DescriptiveStatistics.java new file mode 100644 index 0000000..e5a18dc --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/DescriptiveStatistics.java @@ -0,0 +1,721 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.math.stat.descriptive; + +import java.io.Serializable; +import java.lang.reflect.InvocationTargetException; +import java.util.Arrays; + +import org.apache.commons.math.MathRuntimeException; +import org.apache.commons.math.exception.util.LocalizedFormats; +import org.apache.commons.math.stat.descriptive.moment.GeometricMean; +import org.apache.commons.math.stat.descriptive.moment.Kurtosis; +import org.apache.commons.math.stat.descriptive.moment.Mean; +import org.apache.commons.math.stat.descriptive.moment.Skewness; +import org.apache.commons.math.stat.descriptive.moment.Variance; +import org.apache.commons.math.stat.descriptive.rank.Max; +import org.apache.commons.math.stat.descriptive.rank.Min; +import org.apache.commons.math.stat.descriptive.rank.Percentile; +import org.apache.commons.math.stat.descriptive.summary.Sum; +import org.apache.commons.math.stat.descriptive.summary.SumOfSquares; +import org.apache.commons.math.util.ResizableDoubleArray; +import org.apache.commons.math.util.FastMath; + + +/** + * Maintains a dataset of values of a single variable and computes descriptive + * statistics based on stored data. The {@link #getWindowSize() windowSize} + * property sets a limit on the number of values that can be stored in the + * dataset. The default value, INFINITE_WINDOW, puts no limit on the size of + * the dataset. This value should be used with caution, as the backing store + * will grow without bound in this case. For very large datasets, + * {@link SummaryStatistics}, which does not store the dataset, should be used + * instead of this class. If <code>windowSize</code> is not INFINITE_WINDOW and + * more values are added than can be stored in the dataset, new values are + * added in a "rolling" manner, with new values replacing the "oldest" values + * in the dataset. + * + * <p>Note: this class is not threadsafe. 
Use + * {@link SynchronizedDescriptiveStatistics} if concurrent access from multiple + * threads is required.</p> + * + * @version $Revision: 1054186 $ $Date: 2011-01-01 03:28:46 +0100 (sam. 01 janv. 2011) $ + */ +public class DescriptiveStatistics implements StatisticalSummary, Serializable { + + /** + * Represents an infinite window size. When the {@link #getWindowSize()} + * returns this value, there is no limit to the number of data values + * that can be stored in the dataset. + */ + public static final int INFINITE_WINDOW = -1; + + /** Serialization UID */ + private static final long serialVersionUID = 4133067267405273064L; + + /** Name of the setQuantile method. */ + private static final String SET_QUANTILE_METHOD_NAME = "setQuantile"; + + /** hold the window size **/ + protected int windowSize = INFINITE_WINDOW; + + /** + * Stored data values + */ + protected ResizableDoubleArray eDA = new ResizableDoubleArray(); + + /** Mean statistic implementation - can be reset by setter. */ + private UnivariateStatistic meanImpl = new Mean(); + + /** Geometric mean statistic implementation - can be reset by setter. */ + private UnivariateStatistic geometricMeanImpl = new GeometricMean(); + + /** Kurtosis statistic implementation - can be reset by setter. */ + private UnivariateStatistic kurtosisImpl = new Kurtosis(); + + /** Maximum statistic implementation - can be reset by setter. */ + private UnivariateStatistic maxImpl = new Max(); + + /** Minimum statistic implementation - can be reset by setter. */ + private UnivariateStatistic minImpl = new Min(); + + /** Percentile statistic implementation - can be reset by setter. */ + private UnivariateStatistic percentileImpl = new Percentile(); + + /** Skewness statistic implementation - can be reset by setter. */ + private UnivariateStatistic skewnessImpl = new Skewness(); + + /** Variance statistic implementation - can be reset by setter. 
*/ + private UnivariateStatistic varianceImpl = new Variance(); + + /** Sum of squares statistic implementation - can be reset by setter. */ + private UnivariateStatistic sumsqImpl = new SumOfSquares(); + + /** Sum statistic implementation - can be reset by setter. */ + private UnivariateStatistic sumImpl = new Sum(); + + /** + * Construct a DescriptiveStatistics instance with an infinite window + */ + public DescriptiveStatistics() { + } + + /** + * Construct a DescriptiveStatistics instance with the specified window + * + * @param window the window size. + */ + public DescriptiveStatistics(int window) { + setWindowSize(window); + } + + /** + * Construct a DescriptiveStatistics instance with an infinite window + * and the initial data values in double[] initialDoubleArray. + * If initialDoubleArray is null, then this constructor corresponds to + * DescriptiveStatistics() + * + * @param initialDoubleArray the initial double[]. + */ + public DescriptiveStatistics(double[] initialDoubleArray) { + if (initialDoubleArray != null) { + eDA = new ResizableDoubleArray(initialDoubleArray); + } + } + + /** + * Copy constructor. Construct a new DescriptiveStatistics instance that + * is a copy of original. + * + * @param original DescriptiveStatistics instance to copy + */ + public DescriptiveStatistics(DescriptiveStatistics original) { + copy(original, this); + } + + /** + * Adds the value to the dataset. If the dataset is at the maximum size + * (i.e., the number of stored elements equals the currently configured + * windowSize), the first (oldest) element in the dataset is discarded + * to make room for the new value. + * + * @param v the value to be added + */ + public void addValue(double v) { + if (windowSize != INFINITE_WINDOW) { + if (getN() == windowSize) { + eDA.addElementRolling(v); + } else if (getN() < windowSize) { + eDA.addElement(v); + } + } else { + eDA.addElement(v); + } + } + + /** + * Removes the most recent value from the dataset. 
+ */ + public void removeMostRecentValue() { + eDA.discardMostRecentElements(1); + } + + /** + * Replaces the most recently stored value with the given value. + * There must be at least one element stored to call this method. + * + * @param v the value to replace the most recent stored value + * @return replaced value + */ + public double replaceMostRecentValue(double v) { + return eDA.substituteMostRecentElement(v); + } + + /** + * Returns the <a href="http://www.xycoon.com/arithmetic_mean.htm"> + * arithmetic mean </a> of the available values + * @return The mean or Double.NaN if no values have been added. + */ + public double getMean() { + return apply(meanImpl); + } + + /** + * Returns the <a href="http://www.xycoon.com/geometric_mean.htm"> + * geometric mean </a> of the available values + * @return The geometricMean, Double.NaN if no values have been added, + * or if the product of the available values is less than or equal to 0. + */ + public double getGeometricMean() { + return apply(geometricMeanImpl); + } + + /** + * Returns the variance of the available values. + * @return The variance, Double.NaN if no values have been added + * or 0.0 for a single value set. + */ + public double getVariance() { + return apply(varianceImpl); + } + + /** + * Returns the standard deviation of the available values. + * @return The standard deviation, Double.NaN if no values have been added + * or 0.0 for a single value set. + */ + public double getStandardDeviation() { + double stdDev = Double.NaN; + if (getN() > 0) { + if (getN() > 1) { + stdDev = FastMath.sqrt(getVariance()); + } else { + stdDev = 0.0; + } + } + return stdDev; + } + + /** + * Returns the skewness of the available values. Skewness is a + * measure of the asymmetry of a given distribution. + * @return The skewness, Double.NaN if no values have been added + * or 0.0 for a value set <=2. 
+ */ + public double getSkewness() { + return apply(skewnessImpl); + } + + /** + * Returns the Kurtosis of the available values. Kurtosis is a + * measure of the "peakedness" of a distribution + * @return The kurtosis, Double.NaN if no values have been added, or 0.0 + * for a value set <=3. + */ + public double getKurtosis() { + return apply(kurtosisImpl); + } + + /** + * Returns the maximum of the available values + * @return The max or Double.NaN if no values have been added. + */ + public double getMax() { + return apply(maxImpl); + } + + /** + * Returns the minimum of the available values + * @return The min or Double.NaN if no values have been added. + */ + public double getMin() { + return apply(minImpl); + } + + /** + * Returns the number of available values + * @return The number of available values + */ + public long getN() { + return eDA.getNumElements(); + } + + /** + * Returns the sum of the values that have been added to Univariate. + * @return The sum or Double.NaN if no values have been added + */ + public double getSum() { + return apply(sumImpl); + } + + /** + * Returns the sum of the squares of the available values. + * @return The sum of the squares or Double.NaN if no + * values have been added. + */ + public double getSumsq() { + return apply(sumsqImpl); + } + + /** + * Resets all statistics and storage + */ + public void clear() { + eDA.clear(); + } + + + /** + * Returns the maximum number of values that can be stored in the + * dataset, or INFINITE_WINDOW (-1) if there is no limit. + * + * @return The current window size or -1 if its Infinite. + */ + public int getWindowSize() { + return windowSize; + } + + /** + * WindowSize controls the number of values which contribute + * to the reported statistics. 
For example, if + * windowSize is set to 3 and the values {1,2,3,4,5} + * have been added <strong> in that order</strong> + * then the <i>available values</i> are {3,4,5} and all + * reported statistics will be based on these values + * @param windowSize sets the size of the window. + */ + public void setWindowSize(int windowSize) { + if (windowSize < 1) { + if (windowSize != INFINITE_WINDOW) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.NOT_POSITIVE_WINDOW_SIZE, windowSize); + } + } + + this.windowSize = windowSize; + + // We need to check to see if we need to discard elements + // from the front of the array. If the windowSize is less than + // the current number of elements. + if (windowSize != INFINITE_WINDOW && windowSize < eDA.getNumElements()) { + eDA.discardFrontElements(eDA.getNumElements() - windowSize); + } + } + + /** + * Returns the current set of values in an array of double primitives. + * The order of addition is preserved. The returned array is a fresh + * copy of the underlying data -- i.e., it is not a reference to the + * stored data. + * + * @return returns the current set of numbers in the order in which they + * were added to this set + */ + public double[] getValues() { + return eDA.getElements(); + } + + /** + * Returns the current set of values in an array of double primitives, + * sorted in ascending order. The returned array is a fresh + * copy of the underlying data -- i.e., it is not a reference to the + * stored data. 
+ * @return returns the current set of + * numbers sorted in ascending order + */ + public double[] getSortedValues() { + double[] sort = getValues(); + Arrays.sort(sort); + return sort; + } + + /** + * Returns the element at the specified index + * @param index The Index of the element + * @return return the element at the specified index + */ + public double getElement(int index) { + return eDA.getElement(index); + } + + /** + * Returns an estimate for the pth percentile of the stored values. + * <p> + * The implementation provided here follows the first estimation procedure presented + * <a href="http://www.itl.nist.gov/div898/handbook/prc/section2/prc252.htm">here.</a> + * </p><p> + * <strong>Preconditions</strong>:<ul> + * <li><code>0 < p ≤ 100</code> (otherwise an + * <code>IllegalArgumentException</code> is thrown)</li> + * <li>at least one value must be stored (returns <code>Double.NaN + * </code> otherwise)</li> + * </ul></p> + * + * @param p the requested percentile (scaled from 0 - 100) + * @return An estimate for the pth percentile of the stored data + * @throws IllegalStateException if percentile implementation has been + * overridden and the supplied implementation does not support setQuantile + * values + */ + public double getPercentile(double p) { + if (percentileImpl instanceof Percentile) { + ((Percentile) percentileImpl).setQuantile(p); + } else { + try { + percentileImpl.getClass().getMethod(SET_QUANTILE_METHOD_NAME, + new Class[] {Double.TYPE}).invoke(percentileImpl, + new Object[] {Double.valueOf(p)}); + } catch (NoSuchMethodException e1) { // Setter guard should prevent + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.PERCENTILE_IMPLEMENTATION_UNSUPPORTED_METHOD, + percentileImpl.getClass().getName(), SET_QUANTILE_METHOD_NAME); + } catch (IllegalAccessException e2) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.PERCENTILE_IMPLEMENTATION_CANNOT_ACCESS_METHOD, + 
SET_QUANTILE_METHOD_NAME, percentileImpl.getClass().getName()); + } catch (InvocationTargetException e3) { + throw MathRuntimeException.createIllegalArgumentException(e3.getCause()); + } + } + return apply(percentileImpl); + } + + /** + * Generates a text report displaying univariate statistics from values + * that have been added. Each statistic is displayed on a separate + * line. + * + * @return String with line feeds displaying statistics + */ + @Override + public String toString() { + StringBuilder outBuffer = new StringBuilder(); + String endl = "\n"; + outBuffer.append("DescriptiveStatistics:").append(endl); + outBuffer.append("n: ").append(getN()).append(endl); + outBuffer.append("min: ").append(getMin()).append(endl); + outBuffer.append("max: ").append(getMax()).append(endl); + outBuffer.append("mean: ").append(getMean()).append(endl); + outBuffer.append("std dev: ").append(getStandardDeviation()) + .append(endl); + outBuffer.append("median: ").append(getPercentile(50)).append(endl); + outBuffer.append("skewness: ").append(getSkewness()).append(endl); + outBuffer.append("kurtosis: ").append(getKurtosis()).append(endl); + return outBuffer.toString(); + } + + /** + * Apply the given statistic to the data associated with this set of statistics. + * @param stat the statistic to apply + * @return the computed value of the statistic. + */ + public double apply(UnivariateStatistic stat) { + return stat.evaluate(eDA.getInternalValues(), eDA.start(), eDA.getNumElements()); + } + + // Implementation getters and setter + + /** + * Returns the currently configured mean implementation. 
+ * + * @return the UnivariateStatistic implementing the mean + * @since 1.2 + */ + public synchronized UnivariateStatistic getMeanImpl() { + return meanImpl; + } + + /** + * <p>Sets the implementation for the mean.</p> + * + * @param meanImpl the UnivariateStatistic instance to use + * for computing the mean + * @since 1.2 + */ + public synchronized void setMeanImpl(UnivariateStatistic meanImpl) { + this.meanImpl = meanImpl; + } + + /** + * Returns the currently configured geometric mean implementation. + * + * @return the UnivariateStatistic implementing the geometric mean + * @since 1.2 + */ + public synchronized UnivariateStatistic getGeometricMeanImpl() { + return geometricMeanImpl; + } + + /** + * <p>Sets the implementation for the gemoetric mean.</p> + * + * @param geometricMeanImpl the UnivariateStatistic instance to use + * for computing the geometric mean + * @since 1.2 + */ + public synchronized void setGeometricMeanImpl( + UnivariateStatistic geometricMeanImpl) { + this.geometricMeanImpl = geometricMeanImpl; + } + + /** + * Returns the currently configured kurtosis implementation. + * + * @return the UnivariateStatistic implementing the kurtosis + * @since 1.2 + */ + public synchronized UnivariateStatistic getKurtosisImpl() { + return kurtosisImpl; + } + + /** + * <p>Sets the implementation for the kurtosis.</p> + * + * @param kurtosisImpl the UnivariateStatistic instance to use + * for computing the kurtosis + * @since 1.2 + */ + public synchronized void setKurtosisImpl(UnivariateStatistic kurtosisImpl) { + this.kurtosisImpl = kurtosisImpl; + } + + /** + * Returns the currently configured maximum implementation. 
+ * + * @return the UnivariateStatistic implementing the maximum + * @since 1.2 + */ + public synchronized UnivariateStatistic getMaxImpl() { + return maxImpl; + } + + /** + * <p>Sets the implementation for the maximum.</p> + * + * @param maxImpl the UnivariateStatistic instance to use + * for computing the maximum + * @since 1.2 + */ + public synchronized void setMaxImpl(UnivariateStatistic maxImpl) { + this.maxImpl = maxImpl; + } + + /** + * Returns the currently configured minimum implementation. + * + * @return the UnivariateStatistic implementing the minimum + * @since 1.2 + */ + public synchronized UnivariateStatistic getMinImpl() { + return minImpl; + } + + /** + * <p>Sets the implementation for the minimum.</p> + * + * @param minImpl the UnivariateStatistic instance to use + * for computing the minimum + * @since 1.2 + */ + public synchronized void setMinImpl(UnivariateStatistic minImpl) { + this.minImpl = minImpl; + } + + /** + * Returns the currently configured percentile implementation. + * + * @return the UnivariateStatistic implementing the percentile + * @since 1.2 + */ + public synchronized UnivariateStatistic getPercentileImpl() { + return percentileImpl; + } + + /** + * Sets the implementation to be used by {@link #getPercentile(double)}. + * The supplied <code>UnivariateStatistic</code> must provide a + * <code>setQuantile(double)</code> method; otherwise + * <code>IllegalArgumentException</code> is thrown. 
+ * + * @param percentileImpl the percentileImpl to set + * @throws IllegalArgumentException if the supplied implementation does not + * provide a <code>setQuantile</code> method + * @since 1.2 + */ + public synchronized void setPercentileImpl( + UnivariateStatistic percentileImpl) { + try { + percentileImpl.getClass().getMethod(SET_QUANTILE_METHOD_NAME, + new Class[] {Double.TYPE}).invoke(percentileImpl, + new Object[] {Double.valueOf(50.0d)}); + } catch (NoSuchMethodException e1) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.PERCENTILE_IMPLEMENTATION_UNSUPPORTED_METHOD, + percentileImpl.getClass().getName(), SET_QUANTILE_METHOD_NAME); + } catch (IllegalAccessException e2) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.PERCENTILE_IMPLEMENTATION_CANNOT_ACCESS_METHOD, + SET_QUANTILE_METHOD_NAME, percentileImpl.getClass().getName()); + } catch (InvocationTargetException e3) { + throw MathRuntimeException.createIllegalArgumentException(e3.getCause()); + } + this.percentileImpl = percentileImpl; + } + + /** + * Returns the currently configured skewness implementation. + * + * @return the UnivariateStatistic implementing the skewness + * @since 1.2 + */ + public synchronized UnivariateStatistic getSkewnessImpl() { + return skewnessImpl; + } + + /** + * <p>Sets the implementation for the skewness.</p> + * + * @param skewnessImpl the UnivariateStatistic instance to use + * for computing the skewness + * @since 1.2 + */ + public synchronized void setSkewnessImpl( + UnivariateStatistic skewnessImpl) { + this.skewnessImpl = skewnessImpl; + } + + /** + * Returns the currently configured variance implementation. 
+ * + * @return the UnivariateStatistic implementing the variance + * @since 1.2 + */ + public synchronized UnivariateStatistic getVarianceImpl() { + return varianceImpl; + } + + /** + * <p>Sets the implementation for the variance.</p> + * + * @param varianceImpl the UnivariateStatistic instance to use + * for computing the variance + * @since 1.2 + */ + public synchronized void setVarianceImpl( + UnivariateStatistic varianceImpl) { + this.varianceImpl = varianceImpl; + } + + /** + * Returns the currently configured sum of squares implementation. + * + * @return the UnivariateStatistic implementing the sum of squares + * @since 1.2 + */ + public synchronized UnivariateStatistic getSumsqImpl() { + return sumsqImpl; + } + + /** + * <p>Sets the implementation for the sum of squares.</p> + * + * @param sumsqImpl the UnivariateStatistic instance to use + * for computing the sum of squares + * @since 1.2 + */ + public synchronized void setSumsqImpl(UnivariateStatistic sumsqImpl) { + this.sumsqImpl = sumsqImpl; + } + + /** + * Returns the currently configured sum implementation. + * + * @return the UnivariateStatistic implementing the sum + * @since 1.2 + */ + public synchronized UnivariateStatistic getSumImpl() { + return sumImpl; + } + + /** + * <p>Sets the implementation for the sum.</p> + * + * @param sumImpl the UnivariateStatistic instance to use + * for computing the sum + * @since 1.2 + */ + public synchronized void setSumImpl(UnivariateStatistic sumImpl) { + this.sumImpl = sumImpl; + } + + /** + * Returns a copy of this DescriptiveStatistics instance with the same internal state. + * + * @return a copy of this + */ + public DescriptiveStatistics copy() { + DescriptiveStatistics result = new DescriptiveStatistics(); + copy(this, result); + return result; + } + + /** + * Copies source to dest. 
+ * <p>Neither source nor dest can be null.</p> + * + * @param source DescriptiveStatistics to copy + * @param dest DescriptiveStatistics to copy to + * @throws NullPointerException if either source or dest is null + */ + public static void copy(DescriptiveStatistics source, DescriptiveStatistics dest) { + // Copy data and window size + dest.eDA = source.eDA.copy(); + dest.windowSize = source.windowSize; + + // Copy implementations + dest.maxImpl = source.maxImpl.copy(); + dest.meanImpl = source.meanImpl.copy(); + dest.minImpl = source.minImpl.copy(); + dest.sumImpl = source.sumImpl.copy(); + dest.varianceImpl = source.varianceImpl.copy(); + dest.sumsqImpl = source.sumsqImpl.copy(); + dest.geometricMeanImpl = source.geometricMeanImpl.copy(); + dest.kurtosisImpl = source.kurtosisImpl; + dest.skewnessImpl = source.skewnessImpl; + dest.percentileImpl = source.percentileImpl; + } +} diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/MultivariateSummaryStatistics.java b/src/main/java/org/apache/commons/math/stat/descriptive/MultivariateSummaryStatistics.java new file mode 100644 index 0000000..8062f5b --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/MultivariateSummaryStatistics.java @@ -0,0 +1,637 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.descriptive; + +import java.io.Serializable; +import java.util.Arrays; + +import org.apache.commons.math.DimensionMismatchException; +import org.apache.commons.math.MathRuntimeException; +import org.apache.commons.math.exception.util.LocalizedFormats; +import org.apache.commons.math.linear.RealMatrix; +import org.apache.commons.math.stat.descriptive.moment.GeometricMean; +import org.apache.commons.math.stat.descriptive.moment.Mean; +import org.apache.commons.math.stat.descriptive.moment.VectorialCovariance; +import org.apache.commons.math.stat.descriptive.rank.Max; +import org.apache.commons.math.stat.descriptive.rank.Min; +import org.apache.commons.math.stat.descriptive.summary.Sum; +import org.apache.commons.math.stat.descriptive.summary.SumOfLogs; +import org.apache.commons.math.stat.descriptive.summary.SumOfSquares; +import org.apache.commons.math.util.MathUtils; +import org.apache.commons.math.util.FastMath; + +/** + * <p>Computes summary statistics for a stream of n-tuples added using the + * {@link #addValue(double[]) addValue} method. The data values are not stored + * in memory, so this class can be used to compute statistics for very large + * n-tuple streams.</p> + * + * <p>The {@link StorelessUnivariateStatistic} instances used to maintain + * summary state and compute statistics are configurable via setters. + * For example, the default implementation for the mean can be overridden by + * calling {@link #setMeanImpl(StorelessUnivariateStatistic[])}. Actual + * parameters to these methods must implement the + * {@link StorelessUnivariateStatistic} interface and configuration must be + * completed before <code>addValue</code> is called. 
No configuration is + * necessary to use the default, commons-math provided implementations.</p> + * + * <p>To compute statistics for a stream of n-tuples, construct a + * MultivariateStatistics instance with dimension n and then use + * {@link #addValue(double[])} to add n-tuples. The <code>getXxx</code> + * methods where Xxx is a statistic return an array of <code>double</code> + * values, where for <code>i = 0,...,n-1</code> the i<sup>th</sup> array element is the + * value of the given statistic for data range consisting of the i<sup>th</sup> element of + * each of the input n-tuples. For example, if <code>addValue</code> is called + * with actual parameters {0, 1, 2}, then {3, 4, 5} and finally {6, 7, 8}, + * <code>getSum</code> will return a three-element array with values + * {0+3+6, 1+4+7, 2+5+8}</p> + * + * <p>Note: This class is not thread-safe. Use + * {@link SynchronizedMultivariateSummaryStatistics} if concurrent access from multiple + * threads is required.</p> + * + * @since 1.2 + * @version $Revision: 1042376 $ $Date: 2010-12-05 16:54:55 +0100 (dim. 05 déc. 2010) $ + */ +public class MultivariateSummaryStatistics + implements StatisticalMultivariateSummary, Serializable { + + /** Serialization UID */ + private static final long serialVersionUID = 2271900808994826718L; + + /** Dimension of the data. */ + private int k; + + /** Count of values that have been added */ + private long n = 0; + + /** Sum statistic implementation - can be reset by setter. */ + private StorelessUnivariateStatistic[] sumImpl; + + /** Sum of squares statistic implementation - can be reset by setter. */ + private StorelessUnivariateStatistic[] sumSqImpl; + + /** Minimum statistic implementation - can be reset by setter. */ + private StorelessUnivariateStatistic[] minImpl; + + /** Maximum statistic implementation - can be reset by setter. */ + private StorelessUnivariateStatistic[] maxImpl; + + /** Sum of log statistic implementation - can be reset by setter. 
*/ + private StorelessUnivariateStatistic[] sumLogImpl; + + /** Geometric mean statistic implementation - can be reset by setter. */ + private StorelessUnivariateStatistic[] geoMeanImpl; + + /** Mean statistic implementation - can be reset by setter. */ + private StorelessUnivariateStatistic[] meanImpl; + + /** Covariance statistic implementation - cannot be reset. */ + private VectorialCovariance covarianceImpl; + + /** + * Construct a MultivariateSummaryStatistics instance + * @param k dimension of the data + * @param isCovarianceBiasCorrected if true, the unbiased sample + * covariance is computed, otherwise the biased population covariance + * is computed + */ + public MultivariateSummaryStatistics(int k, boolean isCovarianceBiasCorrected) { + this.k = k; + + sumImpl = new StorelessUnivariateStatistic[k]; + sumSqImpl = new StorelessUnivariateStatistic[k]; + minImpl = new StorelessUnivariateStatistic[k]; + maxImpl = new StorelessUnivariateStatistic[k]; + sumLogImpl = new StorelessUnivariateStatistic[k]; + geoMeanImpl = new StorelessUnivariateStatistic[k]; + meanImpl = new StorelessUnivariateStatistic[k]; + + for (int i = 0; i < k; ++i) { + sumImpl[i] = new Sum(); + sumSqImpl[i] = new SumOfSquares(); + minImpl[i] = new Min(); + maxImpl[i] = new Max(); + sumLogImpl[i] = new SumOfLogs(); + geoMeanImpl[i] = new GeometricMean(); + meanImpl[i] = new Mean(); + } + + covarianceImpl = + new VectorialCovariance(k, isCovarianceBiasCorrected); + + } + + /** + * Add an n-tuple to the data + * + * @param value the n-tuple to add + * @throws DimensionMismatchException if the length of the array + * does not match the one used at construction + */ + public void addValue(double[] value) + throws DimensionMismatchException { + checkDimension(value.length); + for (int i = 0; i < k; ++i) { + double v = value[i]; + sumImpl[i].increment(v); + sumSqImpl[i].increment(v); + minImpl[i].increment(v); + maxImpl[i].increment(v); + sumLogImpl[i].increment(v); + geoMeanImpl[i].increment(v); + 
meanImpl[i].increment(v); + } + covarianceImpl.increment(value); + n++; + } + + /** + * Returns the dimension of the data + * @return The dimension of the data + */ + public int getDimension() { + return k; + } + + /** + * Returns the number of available values + * @return The number of available values + */ + public long getN() { + return n; + } + + /** + * Returns an array of the results of a statistic. + * @param stats univariate statistic array + * @return results array + */ + private double[] getResults(StorelessUnivariateStatistic[] stats) { + double[] results = new double[stats.length]; + for (int i = 0; i < results.length; ++i) { + results[i] = stats[i].getResult(); + } + return results; + } + + /** + * Returns an array whose i<sup>th</sup> entry is the sum of the + * i<sup>th</sup> entries of the arrays that have been added using + * {@link #addValue(double[])} + * + * @return the array of component sums + */ + public double[] getSum() { + return getResults(sumImpl); + } + + /** + * Returns an array whose i<sup>th</sup> entry is the sum of squares of the + * i<sup>th</sup> entries of the arrays that have been added using + * {@link #addValue(double[])} + * + * @return the array of component sums of squares + */ + public double[] getSumSq() { + return getResults(sumSqImpl); + } + + /** + * Returns an array whose i<sup>th</sup> entry is the sum of logs of the + * i<sup>th</sup> entries of the arrays that have been added using + * {@link #addValue(double[])} + * + * @return the array of component log sums + */ + public double[] getSumLog() { + return getResults(sumLogImpl); + } + + /** + * Returns an array whose i<sup>th</sup> entry is the mean of the + * i<sup>th</sup> entries of the arrays that have been added using + * {@link #addValue(double[])} + * + * @return the array of component means + */ + public double[] getMean() { + return getResults(meanImpl); + } + + /** + * Returns an array whose i<sup>th</sup> entry is the standard deviation of the + * 
i<sup>th</sup> entries of the arrays that have been added using + * {@link #addValue(double[])} + * + * @return the array of component standard deviations + */ + public double[] getStandardDeviation() { + double[] stdDev = new double[k]; + if (getN() < 1) { + Arrays.fill(stdDev, Double.NaN); + } else if (getN() < 2) { + Arrays.fill(stdDev, 0.0); + } else { + RealMatrix matrix = covarianceImpl.getResult(); + for (int i = 0; i < k; ++i) { + stdDev[i] = FastMath.sqrt(matrix.getEntry(i, i)); + } + } + return stdDev; + } + + /** + * Returns the covariance matrix of the values that have been added. + * + * @return the covariance matrix + */ + public RealMatrix getCovariance() { + return covarianceImpl.getResult(); + } + + /** + * Returns an array whose i<sup>th</sup> entry is the maximum of the + * i<sup>th</sup> entries of the arrays that have been added using + * {@link #addValue(double[])} + * + * @return the array of component maxima + */ + public double[] getMax() { + return getResults(maxImpl); + } + + /** + * Returns an array whose i<sup>th</sup> entry is the minimum of the + * i<sup>th</sup> entries of the arrays that have been added using + * {@link #addValue(double[])} + * + * @return the array of component minima + */ + public double[] getMin() { + return getResults(minImpl); + } + + /** + * Returns an array whose i<sup>th</sup> entry is the geometric mean of the + * i<sup>th</sup> entries of the arrays that have been added using + * {@link #addValue(double[])} + * + * @return the array of component geometric means + */ + public double[] getGeometricMean() { + return getResults(geoMeanImpl); + } + + /** + * Generates a text report displaying + * summary statistics from values that + * have been added. 
+ * @return String with line feeds displaying statistics + */ + @Override + public String toString() { + final String separator = ", "; + final String suffix = System.getProperty("line.separator"); + StringBuilder outBuffer = new StringBuilder(); + outBuffer.append("MultivariateSummaryStatistics:" + suffix); + outBuffer.append("n: " + getN() + suffix); + append(outBuffer, getMin(), "min: ", separator, suffix); + append(outBuffer, getMax(), "max: ", separator, suffix); + append(outBuffer, getMean(), "mean: ", separator, suffix); + append(outBuffer, getGeometricMean(), "geometric mean: ", separator, suffix); + append(outBuffer, getSumSq(), "sum of squares: ", separator, suffix); + append(outBuffer, getSumLog(), "sum of logarithms: ", separator, suffix); + append(outBuffer, getStandardDeviation(), "standard deviation: ", separator, suffix); + outBuffer.append("covariance: " + getCovariance().toString() + suffix); + return outBuffer.toString(); + } + + /** + * Append a text representation of an array to a buffer. + * @param buffer buffer to fill + * @param data data array + * @param prefix text prefix + * @param separator elements separator + * @param suffix text suffix + */ + private void append(StringBuilder buffer, double[] data, + String prefix, String separator, String suffix) { + buffer.append(prefix); + for (int i = 0; i < data.length; ++i) { + if (i > 0) { + buffer.append(separator); + } + buffer.append(data[i]); + } + buffer.append(suffix); + } + + /** + * Resets all statistics and storage + */ + public void clear() { + this.n = 0; + for (int i = 0; i < k; ++i) { + minImpl[i].clear(); + maxImpl[i].clear(); + sumImpl[i].clear(); + sumLogImpl[i].clear(); + sumSqImpl[i].clear(); + geoMeanImpl[i].clear(); + meanImpl[i].clear(); + } + covarianceImpl.clear(); + } + + /** + * Returns true iff <code>object</code> is a <code>MultivariateSummaryStatistics</code> + * instance and all statistics have the same values as this. 
+ * @param object the object to test equality against. + * @return true if object equals this + */ + @Override + public boolean equals(Object object) { + if (object == this ) { + return true; + } + if (object instanceof MultivariateSummaryStatistics == false) { + return false; + } + MultivariateSummaryStatistics stat = (MultivariateSummaryStatistics) object; + return MathUtils.equalsIncludingNaN(stat.getGeometricMean(), getGeometricMean()) && + MathUtils.equalsIncludingNaN(stat.getMax(), getMax()) && + MathUtils.equalsIncludingNaN(stat.getMean(), getMean()) && + MathUtils.equalsIncludingNaN(stat.getMin(), getMin()) && + MathUtils.equalsIncludingNaN(stat.getN(), getN()) && + MathUtils.equalsIncludingNaN(stat.getSum(), getSum()) && + MathUtils.equalsIncludingNaN(stat.getSumSq(), getSumSq()) && + MathUtils.equalsIncludingNaN(stat.getSumLog(), getSumLog()) && + stat.getCovariance().equals( getCovariance()); + } + + /** + * Returns hash code based on values of statistics + * + * @return hash code + */ + @Override + public int hashCode() { + int result = 31 + MathUtils.hash(getGeometricMean()); + result = result * 31 + MathUtils.hash(getGeometricMean()); + result = result * 31 + MathUtils.hash(getMax()); + result = result * 31 + MathUtils.hash(getMean()); + result = result * 31 + MathUtils.hash(getMin()); + result = result * 31 + MathUtils.hash(getN()); + result = result * 31 + MathUtils.hash(getSum()); + result = result * 31 + MathUtils.hash(getSumSq()); + result = result * 31 + MathUtils.hash(getSumLog()); + result = result * 31 + getCovariance().hashCode(); + return result; + } + + // Getters and setters for statistics implementations + /** + * Sets statistics implementations. 
+ * @param newImpl new implementations for statistics + * @param oldImpl old implementations for statistics + * @throws DimensionMismatchException if the array dimension + * does not match the one used at construction + * @throws IllegalStateException if data has already been added + * (i.e if n > 0) + */ + private void setImpl(StorelessUnivariateStatistic[] newImpl, + StorelessUnivariateStatistic[] oldImpl) + throws DimensionMismatchException, IllegalStateException { + checkEmpty(); + checkDimension(newImpl.length); + System.arraycopy(newImpl, 0, oldImpl, 0, newImpl.length); + } + + /** + * Returns the currently configured Sum implementation + * + * @return the StorelessUnivariateStatistic implementing the sum + */ + public StorelessUnivariateStatistic[] getSumImpl() { + return sumImpl.clone(); + } + + /** + * <p>Sets the implementation for the Sum.</p> + * <p>This method must be activated before any data has been added - i.e., + * before {@link #addValue(double[]) addValue} has been used to add data; + * otherwise an IllegalStateException will be thrown.</p> + * + * @param sumImpl the StorelessUnivariateStatistic instance to use + * for computing the Sum + * @throws DimensionMismatchException if the array dimension + * does not match the one used at construction + * @throws IllegalStateException if data has already been added + * (i.e if n > 0) + */ + public void setSumImpl(StorelessUnivariateStatistic[] sumImpl) + throws DimensionMismatchException { + setImpl(sumImpl, this.sumImpl); + } + + /** + * Returns the currently configured sum of squares implementation + * + * @return the StorelessUnivariateStatistic implementing the sum of squares + */ + public StorelessUnivariateStatistic[] getSumsqImpl() { + return sumSqImpl.clone(); + } + + /** + * <p>Sets the implementation for the sum of squares.</p> + * <p>This method must be activated before any data has been added - i.e., + * before {@link #addValue(double[]) addValue} has been used to add data; + * otherwise an 
IllegalStateException will be thrown.</p> + * + * @param sumsqImpl the StorelessUnivariateStatistic instance to use + * for computing the sum of squares + * @throws DimensionMismatchException if the array dimension + * does not match the one used at construction + * @throws IllegalStateException if data has already been added + * (i.e if n > 0) + */ + public void setSumsqImpl(StorelessUnivariateStatistic[] sumsqImpl) + throws DimensionMismatchException { + setImpl(sumsqImpl, this.sumSqImpl); + } + + /** + * Returns the currently configured minimum implementation + * + * @return the StorelessUnivariateStatistic implementing the minimum + */ + public StorelessUnivariateStatistic[] getMinImpl() { + return minImpl.clone(); + } + + /** + * <p>Sets the implementation for the minimum.</p> + * <p>This method must be activated before any data has been added - i.e., + * before {@link #addValue(double[]) addValue} has been used to add data; + * otherwise an IllegalStateException will be thrown.</p> + * + * @param minImpl the StorelessUnivariateStatistic instance to use + * for computing the minimum + * @throws DimensionMismatchException if the array dimension + * does not match the one used at construction + * @throws IllegalStateException if data has already been added + * (i.e if n > 0) + */ + public void setMinImpl(StorelessUnivariateStatistic[] minImpl) + throws DimensionMismatchException { + setImpl(minImpl, this.minImpl); + } + + /** + * Returns the currently configured maximum implementation + * + * @return the StorelessUnivariateStatistic implementing the maximum + */ + public StorelessUnivariateStatistic[] getMaxImpl() { + return maxImpl.clone(); + } + + /** + * <p>Sets the implementation for the maximum.</p> + * <p>This method must be activated before any data has been added - i.e., + * before {@link #addValue(double[]) addValue} has been used to add data; + * otherwise an IllegalStateException will be thrown.</p> + * + * @param maxImpl the 
StorelessUnivariateStatistic instance to use + * for computing the maximum + * @throws DimensionMismatchException if the array dimension + * does not match the one used at construction + * @throws IllegalStateException if data has already been added + * (i.e if n > 0) + */ + public void setMaxImpl(StorelessUnivariateStatistic[] maxImpl) + throws DimensionMismatchException { + setImpl(maxImpl, this.maxImpl); + } + + /** + * Returns the currently configured sum of logs implementation + * + * @return the StorelessUnivariateStatistic implementing the log sum + */ + public StorelessUnivariateStatistic[] getSumLogImpl() { + return sumLogImpl.clone(); + } + + /** + * <p>Sets the implementation for the sum of logs.</p> + * <p>This method must be activated before any data has been added - i.e., + * before {@link #addValue(double[]) addValue} has been used to add data; + * otherwise an IllegalStateException will be thrown.</p> + * + * @param sumLogImpl the StorelessUnivariateStatistic instance to use + * for computing the log sum + * @throws DimensionMismatchException if the array dimension + * does not match the one used at construction + * @throws IllegalStateException if data has already been added + * (i.e if n > 0) + */ + public void setSumLogImpl(StorelessUnivariateStatistic[] sumLogImpl) + throws DimensionMismatchException { + setImpl(sumLogImpl, this.sumLogImpl); + } + + /** + * Returns the currently configured geometric mean implementation + * + * @return the StorelessUnivariateStatistic implementing the geometric mean + */ + public StorelessUnivariateStatistic[] getGeoMeanImpl() { + return geoMeanImpl.clone(); + } + + /** + * <p>Sets the implementation for the geometric mean.</p> + * <p>This method must be activated before any data has been added - i.e., + * before {@link #addValue(double[]) addValue} has been used to add data; + * otherwise an IllegalStateException will be thrown.</p> + * + * @param geoMeanImpl the StorelessUnivariateStatistic instance to use + * 
for computing the geometric mean + * @throws DimensionMismatchException if the array dimension + * does not match the one used at construction + * @throws IllegalStateException if data has already been added + * (i.e if n > 0) + */ + public void setGeoMeanImpl(StorelessUnivariateStatistic[] geoMeanImpl) + throws DimensionMismatchException { + setImpl(geoMeanImpl, this.geoMeanImpl); + } + + /** + * Returns the currently configured mean implementation + * + * @return the StorelessUnivariateStatistic implementing the mean + */ + public StorelessUnivariateStatistic[] getMeanImpl() { + return meanImpl.clone(); + } + + /** + * <p>Sets the implementation for the mean.</p> + * <p>This method must be activated before any data has been added - i.e., + * before {@link #addValue(double[]) addValue} has been used to add data; + * otherwise an IllegalStateException will be thrown.</p> + * + * @param meanImpl the StorelessUnivariateStatistic instance to use + * for computing the mean + * @throws DimensionMismatchException if the array dimension + * does not match the one used at construction + * @throws IllegalStateException if data has already been added + * (i.e if n > 0) + */ + public void setMeanImpl(StorelessUnivariateStatistic[] meanImpl) + throws DimensionMismatchException { + setImpl(meanImpl, this.meanImpl); + } + + /** + * Throws IllegalStateException if n > 0. + */ + private void checkEmpty() { + if (n > 0) { + throw MathRuntimeException.createIllegalStateException( + LocalizedFormats.VALUES_ADDED_BEFORE_CONFIGURING_STATISTIC, + n); + } + } + + /** + * Throws DimensionMismatchException if dimension != k. 
+ * @param dimension dimension to check + * @throws DimensionMismatchException if dimension != k + */ + private void checkDimension(int dimension) + throws DimensionMismatchException { + if (dimension != k) { + throw new DimensionMismatchException(dimension, k); + } + } + +} diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/StatisticalMultivariateSummary.java b/src/main/java/org/apache/commons/math/stat/descriptive/StatisticalMultivariateSummary.java new file mode 100644 index 0000000..517788c --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/StatisticalMultivariateSummary.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.descriptive; + +import org.apache.commons.math.linear.RealMatrix; + +/** + * Reporting interface for basic multivariate statistics. + * + * @since 1.2 + * @version $Revision: 811786 $ $Date: 2009-09-06 11:36:08 +0200 (dim. 06 sept. 
2009) $ + */ +public interface StatisticalMultivariateSummary { + + /** + * Returns the dimension of the data + * @return The dimension of the data + */ + int getDimension(); + + /** + * Returns an array whose i<sup>th</sup> entry is the + * mean of the i<sup>th</sup> entries of the arrays + * that correspond to each multivariate sample + * + * @return the array of component means + */ + double[] getMean(); + + /** + * Returns the covariance of the available values. + * @return The covariance, null if no multivariate sample + * have been added or a zeroed matrix for a single value set. + */ + RealMatrix getCovariance(); + + /** + * Returns an array whose i<sup>th</sup> entry is the + * standard deviation of the i<sup>th</sup> entries of the arrays + * that correspond to each multivariate sample + * + * @return the array of component standard deviations + */ + double[] getStandardDeviation(); + + /** + * Returns an array whose i<sup>th</sup> entry is the + * maximum of the i<sup>th</sup> entries of the arrays + * that correspond to each multivariate sample + * + * @return the array of component maxima + */ + double[] getMax(); + + /** + * Returns an array whose i<sup>th</sup> entry is the + * minimum of the i<sup>th</sup> entries of the arrays + * that correspond to each multivariate sample + * + * @return the array of component minima + */ + double[] getMin(); + + /** + * Returns the number of available values + * @return The number of available values + */ + long getN(); + + /** + * Returns an array whose i<sup>th</sup> entry is the + * geometric mean of the i<sup>th</sup> entries of the arrays + * that correspond to each multivariate sample + * + * @return the array of component geometric means + */ + double[] getGeometricMean(); + + /** + * Returns an array whose i<sup>th</sup> entry is the + * sum of the i<sup>th</sup> entries of the arrays + * that correspond to each multivariate sample + * + * @return the array of component sums + */ + double[] getSum(); + + 
/** + * Returns an array whose i<sup>th</sup> entry is the + * sum of squares of the i<sup>th</sup> entries of the arrays + * that correspond to each multivariate sample + * + * @return the array of component sums of squares + */ + double[] getSumSq(); + + /** + * Returns an array whose i<sup>th</sup> entry is the + * sum of logs of the i<sup>th</sup> entries of the arrays + * that correspond to each multivariate sample + * + * @return the array of component log sums + */ + double[] getSumLog(); + +} diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/StatisticalSummary.java b/src/main/java/org/apache/commons/math/stat/descriptive/StatisticalSummary.java new file mode 100644 index 0000000..5592053 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/StatisticalSummary.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.descriptive; + +/** + * Reporting interface for basic univariate statistics. + * + * @version $Revision: 811786 $ $Date: 2009-09-06 11:36:08 +0200 (dim. 06 sept. 
2009) $ + */ +public interface StatisticalSummary { + + /** + * Returns the <a href="http://www.xycoon.com/arithmetic_mean.htm"> + * arithmetic mean </a> of the available values + * @return The mean or Double.NaN if no values have been added. + */ + double getMean(); + /** + * Returns the variance of the available values. + * @return The variance, Double.NaN if no values have been added + * or 0.0 for a single value set. + */ + double getVariance(); + /** + * Returns the standard deviation of the available values. + * @return The standard deviation, Double.NaN if no values have been added + * or 0.0 for a single value set. + */ + double getStandardDeviation(); + /** + * Returns the maximum of the available values + * @return The max or Double.NaN if no values have been added. + */ + double getMax(); + /** + * Returns the minimum of the available values + * @return The min or Double.NaN if no values have been added. + */ + double getMin(); + /** + * Returns the number of available values + * @return The number of available values + */ + long getN(); + /** + * Returns the sum of the values that have been added to Univariate. + * @return The sum or Double.NaN if no values have been added + */ + double getSum(); + +} diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/StatisticalSummaryValues.java b/src/main/java/org/apache/commons/math/stat/descriptive/StatisticalSummaryValues.java new file mode 100644 index 0000000..e72639a --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/StatisticalSummaryValues.java @@ -0,0 +1,186 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.descriptive; + +import java.io.Serializable; + +import org.apache.commons.math.util.FastMath; +import org.apache.commons.math.util.MathUtils; + +/** + * Value object representing the results of a univariate statistical summary. + * + * @version $Revision: 1054186 $ $Date: 2011-01-01 03:28:46 +0100 (sam. 01 janv. 2011) $ + */ +public class StatisticalSummaryValues implements Serializable, + StatisticalSummary { + + /** Serialization id */ + private static final long serialVersionUID = -5108854841843722536L; + + /** The sample mean */ + private final double mean; + + /** The sample variance */ + private final double variance; + + /** The number of observations in the sample */ + private final long n; + + /** The maximum value */ + private final double max; + + /** The minimum value */ + private final double min; + + /** The sum of the sample values */ + private final double sum; + + /** + * Constructor + * + * @param mean the sample mean + * @param variance the sample variance + * @param n the number of observations in the sample + * @param max the maximum value + * @param min the minimum value + * @param sum the sum of the values + */ + public StatisticalSummaryValues(double mean, double variance, long n, + double max, double min, double sum) { + super(); + this.mean = mean; + this.variance = variance; + this.n = n; + this.max = max; + this.min = min; + this.sum = sum; + } + + /** + * @return Returns the max. + */ + public double getMax() { + return max; + } + + /** + * @return Returns the mean. 
+ */ + public double getMean() { + return mean; + } + + /** + * @return Returns the min. + */ + public double getMin() { + return min; + } + + /** + * @return Returns the number of values. + */ + public long getN() { + return n; + } + + /** + * @return Returns the sum. + */ + public double getSum() { + return sum; + } + + /** + * @return Returns the standard deviation + */ + public double getStandardDeviation() { + return FastMath.sqrt(variance); + } + + /** + * @return Returns the variance. + */ + public double getVariance() { + return variance; + } + + /** + * Returns true iff <code>object</code> is a + * <code>StatisticalSummaryValues</code> instance and all statistics have + * the same values as this. + * + * @param object the object to test equality against. + * @return true if object equals this + */ + @Override + public boolean equals(Object object) { + if (object == this ) { + return true; + } + if (object instanceof StatisticalSummaryValues == false) { + return false; + } + StatisticalSummaryValues stat = (StatisticalSummaryValues) object; + return MathUtils.equalsIncludingNaN(stat.getMax(), getMax()) && + MathUtils.equalsIncludingNaN(stat.getMean(), getMean()) && + MathUtils.equalsIncludingNaN(stat.getMin(), getMin()) && + MathUtils.equalsIncludingNaN(stat.getN(), getN()) && + MathUtils.equalsIncludingNaN(stat.getSum(), getSum()) && + MathUtils.equalsIncludingNaN(stat.getVariance(), getVariance()); + } + + /** + * Returns hash code based on values of statistics + * + * @return hash code + */ + @Override + public int hashCode() { + int result = 31 + MathUtils.hash(getMax()); + result = result * 31 + MathUtils.hash(getMean()); + result = result * 31 + MathUtils.hash(getMin()); + result = result * 31 + MathUtils.hash(getN()); + result = result * 31 + MathUtils.hash(getSum()); + result = result * 31 + MathUtils.hash(getVariance()); + return result; + } + + /** + * Generates a text report displaying values of statistics. 
+ * Each statistic is displayed on a separate line. + * + * @return String with line feeds displaying statistics + */ + @Override + public String toString() { + StringBuilder outBuffer = new StringBuilder(); + String endl = "\n"; + outBuffer.append("StatisticalSummaryValues:").append(endl); + outBuffer.append("n: ").append(getN()).append(endl); + outBuffer.append("min: ").append(getMin()).append(endl); + outBuffer.append("max: ").append(getMax()).append(endl); + outBuffer.append("mean: ").append(getMean()).append(endl); + outBuffer.append("std dev: ").append(getStandardDeviation()) + .append(endl); + outBuffer.append("variance: ").append(getVariance()).append(endl); + outBuffer.append("sum: ").append(getSum()).append(endl); + return outBuffer.toString(); + } + +} diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/StorelessUnivariateStatistic.java b/src/main/java/org/apache/commons/math/stat/descriptive/StorelessUnivariateStatistic.java new file mode 100644 index 0000000..9b9fcb4 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/StorelessUnivariateStatistic.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.math.stat.descriptive; + +/** + * Extends the definition of {@link UnivariateStatistic} with + * {@link #increment} and {@link #incrementAll(double[])} methods for adding + * values and updating internal state. + * <p> + * This interface is designed to be used for calculating statistics that can be + * computed in one pass through the data without storing the full array of + * sample values.</p> + * + * @version $Revision: 811685 $ $Date: 2009-09-05 19:36:48 +0200 (sam. 05 sept. 2009) $ + */ +public interface StorelessUnivariateStatistic extends UnivariateStatistic { + + /** + * Updates the internal state of the statistic to reflect the addition of the new value. + * @param d the new value. + */ + void increment(double d); + + /** + * Updates the internal state of the statistic to reflect addition of + * all values in the values array. Does not clear the statistic first -- + * i.e., the values are added <strong>incrementally</strong> to the dataset. + * + * @param values array holding the new values to add + * @throws IllegalArgumentException if the array is null + */ + void incrementAll(double[] values); + + /** + * Updates the internal state of the statistic to reflect addition of + * the values in the designated portion of the values array. Does not + * clear the statistic first -- i.e., the values are added + * <strong>incrementally</strong> to the dataset. + * + * @param values array holding the new values to add + * @param start the array index of the first value to add + * @param length the number of elements to add + * @throws IllegalArgumentException if the array is null or the index + * parameters are not valid + */ + void incrementAll(double[] values, int start, int length); + + /** + * Returns the current value of the Statistic. + * @return value of the statistic, <code>Double.NaN</code> if it + * has been cleared or just instantiated. + */ + double getResult(); + + /** + * Returns the number of values that have been added. 
+ * @return the number of values. + */ + long getN(); + + /** + * Clears the internal state of the Statistic + */ + void clear(); + + /** + * Returns a copy of the statistic with the same internal state. + * + * @return a copy of the statistic + */ + StorelessUnivariateStatistic copy(); + +} diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/SummaryStatistics.java b/src/main/java/org/apache/commons/math/stat/descriptive/SummaryStatistics.java new file mode 100644 index 0000000..017a84d --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/SummaryStatistics.java @@ -0,0 +1,717 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.math.stat.descriptive; + +import java.io.Serializable; + +import org.apache.commons.math.MathRuntimeException; +import org.apache.commons.math.exception.util.LocalizedFormats; +import org.apache.commons.math.stat.descriptive.moment.GeometricMean; +import org.apache.commons.math.stat.descriptive.moment.Mean; +import org.apache.commons.math.stat.descriptive.moment.SecondMoment; +import org.apache.commons.math.stat.descriptive.moment.Variance; +import org.apache.commons.math.stat.descriptive.rank.Max; +import org.apache.commons.math.stat.descriptive.rank.Min; +import org.apache.commons.math.stat.descriptive.summary.Sum; +import org.apache.commons.math.stat.descriptive.summary.SumOfLogs; +import org.apache.commons.math.stat.descriptive.summary.SumOfSquares; +import org.apache.commons.math.util.MathUtils; +import org.apache.commons.math.util.FastMath; + +/** + * <p> + * Computes summary statistics for a stream of data values added using the + * {@link #addValue(double) addValue} method. The data values are not stored in + * memory, so this class can be used to compute statistics for very large data + * streams. + * </p> + * <p> + * The {@link StorelessUnivariateStatistic} instances used to maintain summary + * state and compute statistics are configurable via setters. For example, the + * default implementation for the variance can be overridden by calling + * {@link #setVarianceImpl(StorelessUnivariateStatistic)}. Actual parameters to + * these methods must implement the {@link StorelessUnivariateStatistic} + * interface and configuration must be completed before <code>addValue</code> + * is called. No configuration is necessary to use the default, commons-math + * provided implementations. + * </p> + * <p> + * Note: This class is not thread-safe. Use + * {@link SynchronizedSummaryStatistics} if concurrent access from multiple + * threads is required. + * </p> + * @version $Revision: 1042376 $ $Date: 2010-12-05 16:54:55 +0100 (dim. 
05 déc. 2010) $ + */ +public class SummaryStatistics implements StatisticalSummary, Serializable { + + /** Serialization UID */ + private static final long serialVersionUID = -2021321786743555871L; + + /** count of values that have been added */ + protected long n = 0; + + /** SecondMoment is used to compute the mean and variance */ + protected SecondMoment secondMoment = new SecondMoment(); + + /** sum of values that have been added */ + protected Sum sum = new Sum(); + + /** sum of the square of each value that has been added */ + protected SumOfSquares sumsq = new SumOfSquares(); + + /** min of values that have been added */ + protected Min min = new Min(); + + /** max of values that have been added */ + protected Max max = new Max(); + + /** sumLog of values that have been added */ + protected SumOfLogs sumLog = new SumOfLogs(); + + /** geoMean of values that have been added */ + protected GeometricMean geoMean = new GeometricMean(sumLog); + + /** mean of values that have been added */ + protected Mean mean = new Mean(); + + /** variance of values that have been added */ + protected Variance variance = new Variance(); + + /** Sum statistic implementation - can be reset by setter. */ + private StorelessUnivariateStatistic sumImpl = sum; + + /** Sum of squares statistic implementation - can be reset by setter. */ + private StorelessUnivariateStatistic sumsqImpl = sumsq; + + /** Minimum statistic implementation - can be reset by setter. */ + private StorelessUnivariateStatistic minImpl = min; + + /** Maximum statistic implementation - can be reset by setter. */ + private StorelessUnivariateStatistic maxImpl = max; + + /** Sum of log statistic implementation - can be reset by setter. */ + private StorelessUnivariateStatistic sumLogImpl = sumLog; + + /** Geometric mean statistic implementation - can be reset by setter. */ + private StorelessUnivariateStatistic geoMeanImpl = geoMean; + + /** Mean statistic implementation - can be reset by setter. 
*/ + private StorelessUnivariateStatistic meanImpl = mean; + + /** Variance statistic implementation - can be reset by setter. */ + private StorelessUnivariateStatistic varianceImpl = variance; + + /** + * Construct a SummaryStatistics instance + */ + public SummaryStatistics() { + } + + /** + * A copy constructor. Creates a deep-copy of the {@code original}. + * + * @param original the {@code SummaryStatistics} instance to copy + */ + public SummaryStatistics(SummaryStatistics original) { + copy(original, this); + } + + /** + * Return a {@link StatisticalSummaryValues} instance reporting current + * statistics. + * @return Current values of statistics + */ + public StatisticalSummary getSummary() { + return new StatisticalSummaryValues(getMean(), getVariance(), getN(), + getMax(), getMin(), getSum()); + } + + /** + * Add a value to the data + * @param value the value to add + */ + public void addValue(double value) { + sumImpl.increment(value); + sumsqImpl.increment(value); + minImpl.increment(value); + maxImpl.increment(value); + sumLogImpl.increment(value); + secondMoment.increment(value); + // If mean, variance or geomean have been overridden, + // need to increment these + if (!(meanImpl instanceof Mean)) { + meanImpl.increment(value); + } + if (!(varianceImpl instanceof Variance)) { + varianceImpl.increment(value); + } + if (!(geoMeanImpl instanceof GeometricMean)) { + geoMeanImpl.increment(value); + } + n++; + } + + /** + * Returns the number of available values + * @return The number of available values + */ + public long getN() { + return n; + } + + /** + * Returns the sum of the values that have been added + * @return The sum or <code>Double.NaN</code> if no values have been added + */ + public double getSum() { + return sumImpl.getResult(); + } + + /** + * Returns the sum of the squares of the values that have been added. + * <p> + * Double.NaN is returned if no values have been added. 
+ * </p> + * @return The sum of squares + */ + public double getSumsq() { + return sumsqImpl.getResult(); + } + + /** + * Returns the mean of the values that have been added. + * <p> + * Double.NaN is returned if no values have been added. + * </p> + * @return the mean + */ + public double getMean() { + if (mean == meanImpl) { + return new Mean(secondMoment).getResult(); + } else { + return meanImpl.getResult(); + } + } + + /** + * Returns the standard deviation of the values that have been added. + * <p> + * Double.NaN is returned if no values have been added. + * </p> + * @return the standard deviation + */ + public double getStandardDeviation() { + double stdDev = Double.NaN; + if (getN() > 0) { + if (getN() > 1) { + stdDev = FastMath.sqrt(getVariance()); + } else { + stdDev = 0.0; + } + } + return stdDev; + } + + /** + * Returns the variance of the values that have been added. + * <p> + * Double.NaN is returned if no values have been added. + * </p> + * @return the variance + */ + public double getVariance() { + if (varianceImpl == variance) { + return new Variance(secondMoment).getResult(); + } else { + return varianceImpl.getResult(); + } + } + + /** + * Returns the maximum of the values that have been added. + * <p> + * Double.NaN is returned if no values have been added. + * </p> + * @return the maximum + */ + public double getMax() { + return maxImpl.getResult(); + } + + /** + * Returns the minimum of the values that have been added. + * <p> + * Double.NaN is returned if no values have been added. + * </p> + * @return the minimum + */ + public double getMin() { + return minImpl.getResult(); + } + + /** + * Returns the geometric mean of the values that have been added. + * <p> + * Double.NaN is returned if no values have been added. + * </p> + * @return the geometric mean + */ + public double getGeometricMean() { + return geoMeanImpl.getResult(); + } + + /** + * Returns the sum of the logs of the values that have been added. 
+ * <p> + * Double.NaN is returned if no values have been added. + * </p> + * @return the sum of logs + * @since 1.2 + */ + public double getSumOfLogs() { + return sumLogImpl.getResult(); + } + + /** + * Returns a statistic related to the Second Central Moment. Specifically, + * what is returned is the sum of squared deviations from the sample mean + * among the values that have been added. + * <p> + * Returns <code>Double.NaN</code> if no data values have been added and + * returns <code>0</code> if there is just one value in the data set.</p> + * <p> + * @return second central moment statistic + * @since 2.0 + */ + public double getSecondMoment() { + return secondMoment.getResult(); + } + + /** + * Generates a text report displaying summary statistics from values that + * have been added. + * @return String with line feeds displaying statistics + * @since 1.2 + */ + @Override + public String toString() { + StringBuilder outBuffer = new StringBuilder(); + String endl = "\n"; + outBuffer.append("SummaryStatistics:").append(endl); + outBuffer.append("n: ").append(getN()).append(endl); + outBuffer.append("min: ").append(getMin()).append(endl); + outBuffer.append("max: ").append(getMax()).append(endl); + outBuffer.append("mean: ").append(getMean()).append(endl); + outBuffer.append("geometric mean: ").append(getGeometricMean()) + .append(endl); + outBuffer.append("variance: ").append(getVariance()).append(endl); + outBuffer.append("sum of squares: ").append(getSumsq()).append(endl); + outBuffer.append("standard deviation: ").append(getStandardDeviation()) + .append(endl); + outBuffer.append("sum of logs: ").append(getSumOfLogs()).append(endl); + return outBuffer.toString(); + } + + /** + * Resets all statistics and storage + */ + public void clear() { + this.n = 0; + minImpl.clear(); + maxImpl.clear(); + sumImpl.clear(); + sumLogImpl.clear(); + sumsqImpl.clear(); + geoMeanImpl.clear(); + secondMoment.clear(); + if (meanImpl != mean) { + meanImpl.clear(); + } + if 
(varianceImpl != variance) { + varianceImpl.clear(); + } + } + + /** + * Returns true iff <code>object</code> is a + * <code>SummaryStatistics</code> instance and all statistics have the + * same values as this. + * @param object the object to test equality against. + * @return true if object equals this + */ + @Override + public boolean equals(Object object) { + if (object == this) { + return true; + } + if (object instanceof SummaryStatistics == false) { + return false; + } + SummaryStatistics stat = (SummaryStatistics)object; + return MathUtils.equalsIncludingNaN(stat.getGeometricMean(), getGeometricMean()) && + MathUtils.equalsIncludingNaN(stat.getMax(), getMax()) && + MathUtils.equalsIncludingNaN(stat.getMean(), getMean()) && + MathUtils.equalsIncludingNaN(stat.getMin(), getMin()) && + MathUtils.equalsIncludingNaN(stat.getN(), getN()) && + MathUtils.equalsIncludingNaN(stat.getSum(), getSum()) && + MathUtils.equalsIncludingNaN(stat.getSumsq(), getSumsq()) && + MathUtils.equalsIncludingNaN(stat.getVariance(), getVariance()); + } + + /** + * Returns hash code based on values of statistics + * @return hash code + */ + @Override + public int hashCode() { + int result = 31 + MathUtils.hash(getGeometricMean()); + result = result * 31 + MathUtils.hash(getGeometricMean()); + result = result * 31 + MathUtils.hash(getMax()); + result = result * 31 + MathUtils.hash(getMean()); + result = result * 31 + MathUtils.hash(getMin()); + result = result * 31 + MathUtils.hash(getN()); + result = result * 31 + MathUtils.hash(getSum()); + result = result * 31 + MathUtils.hash(getSumsq()); + result = result * 31 + MathUtils.hash(getVariance()); + return result; + } + + // Getters and setters for statistics implementations + /** + * Returns the currently configured Sum implementation + * @return the StorelessUnivariateStatistic implementing the sum + * @since 1.2 + */ + public StorelessUnivariateStatistic getSumImpl() { + return sumImpl; + } + + /** + * <p> + * Sets the implementation 
for the Sum. + * </p> + * <p> + * This method must be activated before any data has been added - i.e., + * before {@link #addValue(double) addValue} has been used to add data; + * otherwise an IllegalStateException will be thrown. + * </p> + * @param sumImpl the StorelessUnivariateStatistic instance to use for + * computing the Sum + * @throws IllegalStateException if data has already been added (i.e if n > + * 0) + * @since 1.2 + */ + public void setSumImpl(StorelessUnivariateStatistic sumImpl) { + checkEmpty(); + this.sumImpl = sumImpl; + } + + /** + * Returns the currently configured sum of squares implementation + * @return the StorelessUnivariateStatistic implementing the sum of squares + * @since 1.2 + */ + public StorelessUnivariateStatistic getSumsqImpl() { + return sumsqImpl; + } + + /** + * <p> + * Sets the implementation for the sum of squares. + * </p> + * <p> + * This method must be activated before any data has been added - i.e., + * before {@link #addValue(double) addValue} has been used to add data; + * otherwise an IllegalStateException will be thrown. + * </p> + * @param sumsqImpl the StorelessUnivariateStatistic instance to use for + * computing the sum of squares + * @throws IllegalStateException if data has already been added (i.e if n > + * 0) + * @since 1.2 + */ + public void setSumsqImpl(StorelessUnivariateStatistic sumsqImpl) { + checkEmpty(); + this.sumsqImpl = sumsqImpl; + } + + /** + * Returns the currently configured minimum implementation + * @return the StorelessUnivariateStatistic implementing the minimum + * @since 1.2 + */ + public StorelessUnivariateStatistic getMinImpl() { + return minImpl; + } + + /** + * <p> + * Sets the implementation for the minimum. + * </p> + * <p> + * This method must be activated before any data has been added - i.e., + * before {@link #addValue(double) addValue} has been used to add data; + * otherwise an IllegalStateException will be thrown. 
+ * </p> + * @param minImpl the StorelessUnivariateStatistic instance to use for + * computing the minimum + * @throws IllegalStateException if data has already been added (i.e if n > + * 0) + * @since 1.2 + */ + public void setMinImpl(StorelessUnivariateStatistic minImpl) { + checkEmpty(); + this.minImpl = minImpl; + } + + /** + * Returns the currently configured maximum implementation + * @return the StorelessUnivariateStatistic implementing the maximum + * @since 1.2 + */ + public StorelessUnivariateStatistic getMaxImpl() { + return maxImpl; + } + + /** + * <p> + * Sets the implementation for the maximum. + * </p> + * <p> + * This method must be activated before any data has been added - i.e., + * before {@link #addValue(double) addValue} has been used to add data; + * otherwise an IllegalStateException will be thrown. + * </p> + * @param maxImpl the StorelessUnivariateStatistic instance to use for + * computing the maximum + * @throws IllegalStateException if data has already been added (i.e if n > + * 0) + * @since 1.2 + */ + public void setMaxImpl(StorelessUnivariateStatistic maxImpl) { + checkEmpty(); + this.maxImpl = maxImpl; + } + + /** + * Returns the currently configured sum of logs implementation + * @return the StorelessUnivariateStatistic implementing the log sum + * @since 1.2 + */ + public StorelessUnivariateStatistic getSumLogImpl() { + return sumLogImpl; + } + + /** + * <p> + * Sets the implementation for the sum of logs. + * </p> + * <p> + * This method must be activated before any data has been added - i.e., + * before {@link #addValue(double) addValue} has been used to add data; + * otherwise an IllegalStateException will be thrown. 
+ * </p> + * @param sumLogImpl the StorelessUnivariateStatistic instance to use for + * computing the log sum + * @throws IllegalStateException if data has already been added (i.e if n > + * 0) + * @since 1.2 + */ + public void setSumLogImpl(StorelessUnivariateStatistic sumLogImpl) { + checkEmpty(); + this.sumLogImpl = sumLogImpl; + geoMean.setSumLogImpl(sumLogImpl); + } + + /** + * Returns the currently configured geometric mean implementation + * @return the StorelessUnivariateStatistic implementing the geometric mean + * @since 1.2 + */ + public StorelessUnivariateStatistic getGeoMeanImpl() { + return geoMeanImpl; + } + + /** + * <p> + * Sets the implementation for the geometric mean. + * </p> + * <p> + * This method must be activated before any data has been added - i.e., + * before {@link #addValue(double) addValue} has been used to add data; + * otherwise an IllegalStateException will be thrown. + * </p> + * @param geoMeanImpl the StorelessUnivariateStatistic instance to use for + * computing the geometric mean + * @throws IllegalStateException if data has already been added (i.e if n > + * 0) + * @since 1.2 + */ + public void setGeoMeanImpl(StorelessUnivariateStatistic geoMeanImpl) { + checkEmpty(); + this.geoMeanImpl = geoMeanImpl; + } + + /** + * Returns the currently configured mean implementation + * @return the StorelessUnivariateStatistic implementing the mean + * @since 1.2 + */ + public StorelessUnivariateStatistic getMeanImpl() { + return meanImpl; + } + + /** + * <p> + * Sets the implementation for the mean. + * </p> + * <p> + * This method must be activated before any data has been added - i.e., + * before {@link #addValue(double) addValue} has been used to add data; + * otherwise an IllegalStateException will be thrown. 
+ * </p> + * @param meanImpl the StorelessUnivariateStatistic instance to use for + * computing the mean + * @throws IllegalStateException if data has already been added (i.e if n > + * 0) + * @since 1.2 + */ + public void setMeanImpl(StorelessUnivariateStatistic meanImpl) { + checkEmpty(); + this.meanImpl = meanImpl; + } + + /** + * Returns the currently configured variance implementation + * @return the StorelessUnivariateStatistic implementing the variance + * @since 1.2 + */ + public StorelessUnivariateStatistic getVarianceImpl() { + return varianceImpl; + } + + /** + * <p> + * Sets the implementation for the variance. + * </p> + * <p> + * This method must be activated before any data has been added - i.e., + * before {@link #addValue(double) addValue} has been used to add data; + * otherwise an IllegalStateException will be thrown. + * </p> + * @param varianceImpl the StorelessUnivariateStatistic instance to use for + * computing the variance + * @throws IllegalStateException if data has already been added (i.e if n > + * 0) + * @since 1.2 + */ + public void setVarianceImpl(StorelessUnivariateStatistic varianceImpl) { + checkEmpty(); + this.varianceImpl = varianceImpl; + } + + /** + * Throws IllegalStateException if n > 0. + */ + private void checkEmpty() { + if (n > 0) { + throw MathRuntimeException.createIllegalStateException( + LocalizedFormats.VALUES_ADDED_BEFORE_CONFIGURING_STATISTIC, + n); + } + } + + /** + * Returns a copy of this SummaryStatistics instance with the same internal state. + * + * @return a copy of this + */ + public SummaryStatistics copy() { + SummaryStatistics result = new SummaryStatistics(); + copy(this, result); + return result; + } + + /** + * Copies source to dest. 
+ * <p>Neither source nor dest can be null.</p> + * + * @param source SummaryStatistics to copy + * @param dest SummaryStatistics to copy to + * @throws NullPointerException if either source or dest is null + */ + public static void copy(SummaryStatistics source, SummaryStatistics dest) { + dest.maxImpl = source.maxImpl.copy(); + dest.meanImpl = source.meanImpl.copy(); + dest.minImpl = source.minImpl.copy(); + dest.sumImpl = source.sumImpl.copy(); + dest.varianceImpl = source.varianceImpl.copy(); + dest.sumLogImpl = source.sumLogImpl.copy(); + dest.sumsqImpl = source.sumsqImpl.copy(); + if (source.getGeoMeanImpl() instanceof GeometricMean) { + // Keep geoMeanImpl, sumLogImpl in synch + dest.geoMeanImpl = new GeometricMean((SumOfLogs) dest.sumLogImpl); + } else { + dest.geoMeanImpl = source.geoMeanImpl.copy(); + } + SecondMoment.copy(source.secondMoment, dest.secondMoment); + dest.n = source.n; + + // Make sure that if stat == statImpl in source, same + // holds in dest; otherwise copy stat + if (source.geoMean == source.geoMeanImpl) { + dest.geoMean = (GeometricMean) dest.geoMeanImpl; + } else { + GeometricMean.copy(source.geoMean, dest.geoMean); + } + if (source.max == source.maxImpl) { + dest.max = (Max) dest.maxImpl; + } else { + Max.copy(source.max, dest.max); + } + if (source.mean == source.meanImpl) { + dest.mean = (Mean) dest.meanImpl; + } else { + Mean.copy(source.mean, dest.mean); + } + if (source.min == source.minImpl) { + dest.min = (Min) dest.minImpl; + } else { + Min.copy(source.min, dest.min); + } + if (source.sum == source.sumImpl) { + dest.sum = (Sum) dest.sumImpl; + } else { + Sum.copy(source.sum, dest.sum); + } + if (source.variance == source.varianceImpl) { + dest.variance = (Variance) dest.varianceImpl; + } else { + Variance.copy(source.variance, dest.variance); + } + if (source.sumLog == source.sumLogImpl) { + dest.sumLog = (SumOfLogs) dest.sumLogImpl; + } else { + SumOfLogs.copy(source.sumLog, dest.sumLog); + } + if (source.sumsq == 
source.sumsqImpl) { + dest.sumsq = (SumOfSquares) dest.sumsqImpl; + } else { + SumOfSquares.copy(source.sumsq, dest.sumsq); + } + } +} diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/SynchronizedDescriptiveStatistics.java b/src/main/java/org/apache/commons/math/stat/descriptive/SynchronizedDescriptiveStatistics.java new file mode 100644 index 0000000..f1a932d --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/SynchronizedDescriptiveStatistics.java @@ -0,0 +1,172 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.descriptive; + +/** + * Implementation of + * {@link org.apache.commons.math.stat.descriptive.DescriptiveStatistics} that + * is safe to use in a multithreaded environment. Multiple threads can safely + * operate on a single instance without causing runtime exceptions due to race + * conditions. In effect, this implementation makes modification and access + * methods atomic operations for a single instance. That is to say, as one + * thread is computing a statistic from the instance, no other thread can modify + * the instance nor compute another statistic. 
+ * + * @since 1.2 + * @version $Revision: 811685 $ $Date: 2009-09-05 19:36:48 +0200 (sam. 05 sept. 2009) $ + */ +public class SynchronizedDescriptiveStatistics extends DescriptiveStatistics { + + /** Serialization UID */ + private static final long serialVersionUID = 1L; + + /** + * Construct an instance with infinite window + */ + public SynchronizedDescriptiveStatistics() { + this(INFINITE_WINDOW); + } + + /** + * Construct an instance with finite window + * @param window the finite window size. + */ + public SynchronizedDescriptiveStatistics(int window) { + super(window); + } + + /** + * A copy constructor. Creates a deep-copy of the {@code original}. + * + * @param original the {@code SynchronizedDescriptiveStatistics} instance to copy + */ + public SynchronizedDescriptiveStatistics(SynchronizedDescriptiveStatistics original) { + copy(original, this); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized void addValue(double v) { + super.addValue(v); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized double apply(UnivariateStatistic stat) { + return super.apply(stat); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized void clear() { + super.clear(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized double getElement(int index) { + return super.getElement(index); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized long getN() { + return super.getN(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized double getStandardDeviation() { + return super.getStandardDeviation(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized double[] getValues() { + return super.getValues(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized int getWindowSize() { + return super.getWindowSize(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized void setWindowSize(int windowSize) { + super.setWindowSize(windowSize); + } + + /** 
+ * {@inheritDoc} + */ + @Override + public synchronized String toString() { + return super.toString(); + } + + /** + * Returns a copy of this SynchronizedDescriptiveStatistics instance with the + * same internal state. + * + * @return a copy of this + */ + @Override + public synchronized SynchronizedDescriptiveStatistics copy() { + SynchronizedDescriptiveStatistics result = + new SynchronizedDescriptiveStatistics(); + copy(this, result); + return result; + } + + /** + * Copies source to dest. + * <p>Neither source nor dest can be null.</p> + * <p>Acquires synchronization lock on source, then dest before copying.</p> + * + * @param source SynchronizedDescriptiveStatistics to copy + * @param dest SynchronizedDescriptiveStatistics to copy to + * @throws NullPointerException if either source or dest is null + */ + public static void copy(SynchronizedDescriptiveStatistics source, + SynchronizedDescriptiveStatistics dest) { + synchronized (source) { + synchronized (dest) { + DescriptiveStatistics.copy(source, dest); + } + } + } +} diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/SynchronizedMultivariateSummaryStatistics.java b/src/main/java/org/apache/commons/math/stat/descriptive/SynchronizedMultivariateSummaryStatistics.java new file mode 100644 index 0000000..190e092 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/SynchronizedMultivariateSummaryStatistics.java @@ -0,0 +1,299 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.descriptive; + +import org.apache.commons.math.DimensionMismatchException; +import org.apache.commons.math.linear.RealMatrix; + +/** + * Implementation of + * {@link org.apache.commons.math.stat.descriptive.MultivariateSummaryStatistics} that + * is safe to use in a multithreaded environment. Multiple threads can safely + * operate on a single instance without causing runtime exceptions due to race + * conditions. In effect, this implementation makes modification and access + * methods atomic operations for a single instance. That is to say, as one + * thread is computing a statistic from the instance, no other thread can modify + * the instance nor compute another statistic. + * @since 1.2 + * @version $Revision: 811685 $ $Date: 2009-09-05 19:36:48 +0200 (sam. 05 sept. 
2009) $ + */ +public class SynchronizedMultivariateSummaryStatistics + extends MultivariateSummaryStatistics { + + /** Serialization UID */ + private static final long serialVersionUID = 7099834153347155363L; + + /** + * Construct a SynchronizedMultivariateSummaryStatistics instance + * @param k dimension of the data + * @param isCovarianceBiasCorrected if true, the unbiased sample + * covariance is computed, otherwise the biased population covariance + * is computed + */ + public SynchronizedMultivariateSummaryStatistics(int k, boolean isCovarianceBiasCorrected) { + super(k, isCovarianceBiasCorrected); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized void addValue(double[] value) + throws DimensionMismatchException { + super.addValue(value); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized int getDimension() { + return super.getDimension(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized long getN() { + return super.getN(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized double[] getSum() { + return super.getSum(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized double[] getSumSq() { + return super.getSumSq(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized double[] getSumLog() { + return super.getSumLog(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized double[] getMean() { + return super.getMean(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized double[] getStandardDeviation() { + return super.getStandardDeviation(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized RealMatrix getCovariance() { + return super.getCovariance(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized double[] getMax() { + return super.getMax(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized double[] getMin() { + return super.getMin(); + } + + /** + * {@inheritDoc} + 
*/ + @Override + public synchronized double[] getGeometricMean() { + return super.getGeometricMean(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized String toString() { + return super.toString(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized void clear() { + super.clear(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized boolean equals(Object object) { + return super.equals(object); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized int hashCode() { + return super.hashCode(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized StorelessUnivariateStatistic[] getSumImpl() { + return super.getSumImpl(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized void setSumImpl(StorelessUnivariateStatistic[] sumImpl) + throws DimensionMismatchException { + super.setSumImpl(sumImpl); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized StorelessUnivariateStatistic[] getSumsqImpl() { + return super.getSumsqImpl(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized void setSumsqImpl(StorelessUnivariateStatistic[] sumsqImpl) + throws DimensionMismatchException { + super.setSumsqImpl(sumsqImpl); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized StorelessUnivariateStatistic[] getMinImpl() { + return super.getMinImpl(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized void setMinImpl(StorelessUnivariateStatistic[] minImpl) + throws DimensionMismatchException { + super.setMinImpl(minImpl); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized StorelessUnivariateStatistic[] getMaxImpl() { + return super.getMaxImpl(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized void setMaxImpl(StorelessUnivariateStatistic[] maxImpl) + throws DimensionMismatchException { + super.setMaxImpl(maxImpl); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized 
StorelessUnivariateStatistic[] getSumLogImpl() { + return super.getSumLogImpl(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized void setSumLogImpl(StorelessUnivariateStatistic[] sumLogImpl) + throws DimensionMismatchException { + super.setSumLogImpl(sumLogImpl); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized StorelessUnivariateStatistic[] getGeoMeanImpl() { + return super.getGeoMeanImpl(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized void setGeoMeanImpl(StorelessUnivariateStatistic[] geoMeanImpl) + throws DimensionMismatchException { + super.setGeoMeanImpl(geoMeanImpl); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized StorelessUnivariateStatistic[] getMeanImpl() { + return super.getMeanImpl(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized void setMeanImpl(StorelessUnivariateStatistic[] meanImpl) + throws DimensionMismatchException { + super.setMeanImpl(meanImpl); + } + +} diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/SynchronizedSummaryStatistics.java b/src/main/java/org/apache/commons/math/stat/descriptive/SynchronizedSummaryStatistics.java new file mode 100644 index 0000000..07bfbf2 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/SynchronizedSummaryStatistics.java @@ -0,0 +1,333 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.descriptive; + +/** + * Implementation of + * {@link org.apache.commons.math.stat.descriptive.SummaryStatistics} that + * is safe to use in a multithreaded environment. Multiple threads can safely + * operate on a single instance without causing runtime exceptions due to race + * conditions. In effect, this implementation makes modification and access + * methods atomic operations for a single instance. That is to say, as one + * thread is computing a statistic from the instance, no other thread can modify + * the instance nor compute another statistic. + * + * @since 1.2 + * @version $Revision: 811685 $ $Date: 2009-09-05 19:36:48 +0200 (sam. 05 sept. 2009) $ + */ +public class SynchronizedSummaryStatistics extends SummaryStatistics { + + /** Serialization UID */ + private static final long serialVersionUID = 1909861009042253704L; + + /** + * Construct a SynchronizedSummaryStatistics instance + */ + public SynchronizedSummaryStatistics() { + super(); + } + + /** + * A copy constructor. Creates a deep-copy of the {@code original}. 
+ * + * @param original the {@code SynchronizedSummaryStatistics} instance to copy + */ + public SynchronizedSummaryStatistics(SynchronizedSummaryStatistics original) { + copy(original, this); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized StatisticalSummary getSummary() { + return super.getSummary(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized void addValue(double value) { + super.addValue(value); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized long getN() { + return super.getN(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized double getSum() { + return super.getSum(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized double getSumsq() { + return super.getSumsq(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized double getMean() { + return super.getMean(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized double getStandardDeviation() { + return super.getStandardDeviation(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized double getVariance() { + return super.getVariance(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized double getMax() { + return super.getMax(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized double getMin() { + return super.getMin(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized double getGeometricMean() { + return super.getGeometricMean(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized String toString() { + return super.toString(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized void clear() { + super.clear(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized boolean equals(Object object) { + return super.equals(object); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized int hashCode() { + return super.hashCode(); + } + + /** + * 
{@inheritDoc} + */ + @Override + public synchronized StorelessUnivariateStatistic getSumImpl() { + return super.getSumImpl(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized void setSumImpl(StorelessUnivariateStatistic sumImpl) { + super.setSumImpl(sumImpl); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized StorelessUnivariateStatistic getSumsqImpl() { + return super.getSumsqImpl(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized void setSumsqImpl(StorelessUnivariateStatistic sumsqImpl) { + super.setSumsqImpl(sumsqImpl); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized StorelessUnivariateStatistic getMinImpl() { + return super.getMinImpl(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized void setMinImpl(StorelessUnivariateStatistic minImpl) { + super.setMinImpl(minImpl); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized StorelessUnivariateStatistic getMaxImpl() { + return super.getMaxImpl(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized void setMaxImpl(StorelessUnivariateStatistic maxImpl) { + super.setMaxImpl(maxImpl); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized StorelessUnivariateStatistic getSumLogImpl() { + return super.getSumLogImpl(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized void setSumLogImpl(StorelessUnivariateStatistic sumLogImpl) { + super.setSumLogImpl(sumLogImpl); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized StorelessUnivariateStatistic getGeoMeanImpl() { + return super.getGeoMeanImpl(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized void setGeoMeanImpl(StorelessUnivariateStatistic geoMeanImpl) { + super.setGeoMeanImpl(geoMeanImpl); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized StorelessUnivariateStatistic getMeanImpl() { + return super.getMeanImpl(); + } + + /** + * {@inheritDoc} + */ + 
@Override + public synchronized void setMeanImpl(StorelessUnivariateStatistic meanImpl) { + super.setMeanImpl(meanImpl); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized StorelessUnivariateStatistic getVarianceImpl() { + return super.getVarianceImpl(); + } + + /** + * {@inheritDoc} + */ + @Override + public synchronized void setVarianceImpl(StorelessUnivariateStatistic varianceImpl) { + super.setVarianceImpl(varianceImpl); + } + + /** + * Returns a copy of this SynchronizedSummaryStatistics instance with the + * same internal state. + * + * @return a copy of this + */ + @Override + public synchronized SynchronizedSummaryStatistics copy() { + SynchronizedSummaryStatistics result = + new SynchronizedSummaryStatistics(); + copy(this, result); + return result; + } + + /** + * Copies source to dest. + * <p>Neither source nor dest can be null.</p> + * <p>Acquires synchronization lock on source, then dest before copying.</p> + * + * @param source SynchronizedSummaryStatistics to copy + * @param dest SynchronizedSummaryStatistics to copy to + * @throws NullPointerException if either source or dest is null + */ + public static void copy(SynchronizedSummaryStatistics source, + SynchronizedSummaryStatistics dest) { + synchronized (source) { + synchronized (dest) { + SummaryStatistics.copy(source, dest); + } + } + } + +} diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/UnivariateStatistic.java b/src/main/java/org/apache/commons/math/stat/descriptive/UnivariateStatistic.java new file mode 100644 index 0000000..92c9ee2 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/UnivariateStatistic.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.descriptive; + + +/** + * Base interface implemented by all statistics. + * + * @version $Revision: 811685 $ $Date: 2009-09-05 19:36:48 +0200 (sam. 05 sept. 2009) $ + */ +public interface UnivariateStatistic { + + /** + * Returns the result of evaluating the statistic over the input array. + * + * @param values input array + * @return the value of the statistic applied to the input array + */ + double evaluate(double[] values); + + /** + * Returns the result of evaluating the statistic over the specified entries + * in the input array. + * + * @param values the input array + * @param begin the index of the first element to include + * @param length the number of elements to include + * @return the value of the statistic applied to the included array entries + */ + double evaluate(double[] values, int begin, int length); + + /** + * Returns a copy of the statistic with the same internal state. 
+ * + * @return a copy of the statistic + */ + UnivariateStatistic copy(); + +} diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/WeightedEvaluation.java b/src/main/java/org/apache/commons/math/stat/descriptive/WeightedEvaluation.java new file mode 100644 index 0000000..54a0216 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/WeightedEvaluation.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.descriptive; + +/** + * Weighted evaluation for statistics. + * + * @since 2.1 + * @version $Revision: 894474 $ $Date: 2009-12-29 21:02:37 +0100 (mar. 29 déc. 2009) $ + */ +public interface WeightedEvaluation { + + /** + * Returns the result of evaluating the statistic over the input array, + * using the supplied weights. + * + * @param values input array + * @param weights array of weights + * @return the value of the weighted statistic applied to the input array + */ + double evaluate(double[] values, double[] weights); + + /** + * Returns the result of evaluating the statistic over the specified entries + * in the input array, using corresponding entries in the supplied weights array. 
+ * + * @param values the input array + * @param weights array of weights + * @param begin the index of the first element to include + * @param length the number of elements to include + * @return the value of the weighted statistic applied to the included array entries + */ + double evaluate(double[] values, double[] weights, int begin, int length); + +} diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/moment/FirstMoment.java b/src/main/java/org/apache/commons/math/stat/descriptive/moment/FirstMoment.java new file mode 100644 index 0000000..4103e50 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/moment/FirstMoment.java @@ -0,0 +1,160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.descriptive.moment; + +import java.io.Serializable; +import org.apache.commons.math.stat.descriptive.AbstractStorelessUnivariateStatistic; + +/** + * Computes the first moment (arithmetic mean). Uses the definitional formula: + * <p> + * mean = sum(x_i) / n </p> + * <p> + * where <code>n</code> is the number of observations. 
</p> + * <p> + * To limit numeric errors, the value of the statistic is computed using the + * following recursive updating algorithm: </p> + * <p> + * <ol> + * <li>Initialize <code>m = </code> the first value</li> + * <li>For each additional value, update using <br> + * <code>m = m + (new value - m) / (number of observations)</code></li> + * </ol></p> + * <p> + * Returns <code>Double.NaN</code> if the dataset is empty.</p> + * <p> + * <strong>Note that this implementation is not synchronized.</strong> If + * multiple threads access an instance of this class concurrently, and at least + * one of the threads invokes the <code>increment()</code> or + * <code>clear()</code> method, it must be synchronized externally.</p> + * + * @version $Revision: 1006299 $ $Date: 2010-10-10 16:47:17 +0200 (dim. 10 oct. 2010) $ + */ +public class FirstMoment extends AbstractStorelessUnivariateStatistic + implements Serializable { + + /** Serializable version identifier */ + private static final long serialVersionUID = 6112755307178490473L; + + + /** Count of values that have been added */ + protected long n; + + /** First moment of values that have been added */ + protected double m1; + + /** + * Deviation of most recently added value from previous first moment. + * Retained to prevent repeated computation in higher order moments. + */ + protected double dev; + + /** + * Deviation of most recently added value from previous first moment, + * normalized by previous sample size. 
Retained to prevent repeated + * computation in higher order moments + */ + protected double nDev; + + /** + * Create a FirstMoment instance + */ + public FirstMoment() { + n = 0; + m1 = Double.NaN; + dev = Double.NaN; + nDev = Double.NaN; + } + + /** + * Copy constructor, creates a new {@code FirstMoment} identical + * to the {@code original} + * + * @param original the {@code FirstMoment} instance to copy + */ + public FirstMoment(FirstMoment original) { + super(); + copy(original, this); + } + + /** + * {@inheritDoc} + */ + @Override + public void increment(final double d) { + if (n == 0) { + m1 = 0.0; + } + n++; + double n0 = n; + dev = d - m1; + nDev = dev / n0; + m1 += nDev; + } + + /** + * {@inheritDoc} + */ + @Override + public void clear() { + m1 = Double.NaN; + n = 0; + dev = Double.NaN; + nDev = Double.NaN; + } + + /** + * {@inheritDoc} + */ + @Override + public double getResult() { + return m1; + } + + /** + * {@inheritDoc} + */ + public long getN() { + return n; + } + + /** + * {@inheritDoc} + */ + @Override + public FirstMoment copy() { + FirstMoment result = new FirstMoment(); + copy(this, result); + return result; + } + + /** + * Copies source to dest. 
+ * <p>Neither source nor dest can be null.</p> + * + * @param source FirstMoment to copy + * @param dest FirstMoment to copy to + * @throws NullPointerException if either source or dest is null + */ + public static void copy(FirstMoment source, FirstMoment dest) { + dest.setData(source.getDataRef()); + dest.n = source.n; + dest.m1 = source.m1; + dest.dev = source.dev; + dest.nDev = source.nDev; + } +} diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/moment/FourthMoment.java b/src/main/java/org/apache/commons/math/stat/descriptive/moment/FourthMoment.java new file mode 100644 index 0000000..6e7d8d2 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/moment/FourthMoment.java @@ -0,0 +1,142 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.descriptive.moment; + +import java.io.Serializable; + +/** + * Computes a statistic related to the Fourth Central Moment. Specifically, + * what is computed is the sum of + * <p> + * (x_i - xbar) ^ 4, </p> + * <p> + * where the x_i are the + * sample observations and xbar is the sample mean. 
</p> + * <p> + * The following recursive updating formula is used: </p> + * <p> + * Let <ul> + * <li> dev = (current obs - previous mean) </li> + * <li> m2 = previous value of {@link SecondMoment} </li> + * <li> m3 = previous value of {@link ThirdMoment} </li> + * <li> n = number of observations (including current obs) </li> + * </ul> + * Then </p> + * <p> + * new value = old value - 4 * (dev/n) * m3 + 6 * (dev/n)^2 * m2 + <br> + * [n^2 - 3 * (n-1)] * dev^4 * (n-1) / n^3 </p> + * <p> + * Returns <code>Double.NaN</code> if no data values have been added and + * returns <code>0</code> if there is just one value in the data set. </p> + * <p> + * <strong>Note that this implementation is not synchronized.</strong> If + * multiple threads access an instance of this class concurrently, and at least + * one of the threads invokes the <code>increment()</code> or + * <code>clear()</code> method, it must be synchronized externally. </p> + * + * @version $Revision: 811685 $ $Date: 2009-09-05 19:36:48 +0200 (sam. 05 sept. 
2009) $ + */ +public class FourthMoment extends ThirdMoment implements Serializable{ + + /** Serializable version identifier */ + private static final long serialVersionUID = 4763990447117157611L; + + /** fourth moment of values that have been added */ + protected double m4; + + /** + * Create a FourthMoment instance + */ + public FourthMoment() { + super(); + m4 = Double.NaN; + } + + /** + * Copy constructor, creates a new {@code FourthMoment} identical + * to the {@code original} + * + * @param original the {@code FourthMoment} instance to copy + */ + public FourthMoment(FourthMoment original) { + super(); + copy(original, this); + } + + /** + * {@inheritDoc} + */ + @Override + public void increment(final double d) { + if (n < 1) { + m4 = 0.0; + m3 = 0.0; + m2 = 0.0; + m1 = 0.0; + } + + double prevM3 = m3; + double prevM2 = m2; + + super.increment(d); + + double n0 = n; + + m4 = m4 - 4.0 * nDev * prevM3 + 6.0 * nDevSq * prevM2 + + ((n0 * n0) - 3 * (n0 -1)) * (nDevSq * nDevSq * (n0 - 1) * n0); + } + + /** + * {@inheritDoc} + */ + @Override + public double getResult() { + return m4; + } + + /** + * {@inheritDoc} + */ + @Override + public void clear() { + super.clear(); + m4 = Double.NaN; + } + + /** + * {@inheritDoc} + */ + @Override + public FourthMoment copy() { + FourthMoment result = new FourthMoment(); + copy(this, result); + return result; + } + + /** + * Copies source to dest. 
+ * <p>Neither source nor dest can be null.</p> + * + * @param source FourthMoment to copy + * @param dest FourthMoment to copy to + * @throws NullPointerException if either source or dest is null + */ + public static void copy(FourthMoment source, FourthMoment dest) { + ThirdMoment.copy(source, dest); + dest.m4 = source.m4; + } +} diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/moment/GeometricMean.java b/src/main/java/org/apache/commons/math/stat/descriptive/moment/GeometricMean.java new file mode 100644 index 0000000..a24a3c8 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/moment/GeometricMean.java @@ -0,0 +1,205 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.math.stat.descriptive.moment; + +import java.io.Serializable; + +import org.apache.commons.math.MathRuntimeException; +import org.apache.commons.math.exception.util.LocalizedFormats; +import org.apache.commons.math.stat.descriptive.AbstractStorelessUnivariateStatistic; +import org.apache.commons.math.stat.descriptive.StorelessUnivariateStatistic; +import org.apache.commons.math.stat.descriptive.summary.SumOfLogs; +import org.apache.commons.math.util.FastMath; + +/** + * Returns the <a href="http://www.xycoon.com/geometric_mean.htm"> + * geometric mean </a> of the available values. + * <p> + * Uses a {@link SumOfLogs} instance to compute sum of logs and returns + * <code> exp( 1/n (sum of logs) ).</code> Therefore, </p> + * <ul> + * <li>If any of values are < 0, the result is <code>NaN.</code></li> + * <li>If all values are non-negative and less than + * <code>Double.POSITIVE_INFINITY</code>, but at least one value is 0, the + * result is <code>0.</code></li> + * <li>If both <code>Double.POSITIVE_INFINITY</code> and + * <code>Double.NEGATIVE_INFINITY</code> are among the values, the result is + * <code>NaN.</code></li> + * </ul> </p> + * <p> + * <strong>Note that this implementation is not synchronized.</strong> If + * multiple threads access an instance of this class concurrently, and at least + * one of the threads invokes the <code>increment()</code> or + * <code>clear()</code> method, it must be synchronized externally.</p> + * + * + * @version $Revision: 1006299 $ $Date: 2010-10-10 16:47:17 +0200 (dim. 10 oct. 
2010) $ + */ +public class GeometricMean extends AbstractStorelessUnivariateStatistic implements Serializable { + + /** Serializable version identifier */ + private static final long serialVersionUID = -8178734905303459453L; + + /** Wrapped SumOfLogs instance */ + private StorelessUnivariateStatistic sumOfLogs; + + /** + * Create a GeometricMean instance + */ + public GeometricMean() { + sumOfLogs = new SumOfLogs(); + } + + /** + * Copy constructor, creates a new {@code GeometricMean} identical + * to the {@code original} + * + * @param original the {@code GeometricMean} instance to copy + */ + public GeometricMean(GeometricMean original) { + super(); + copy(original, this); + } + + /** + * Create a GeometricMean instance using the given SumOfLogs instance + * @param sumOfLogs sum of logs instance to use for computation + */ + public GeometricMean(SumOfLogs sumOfLogs) { + this.sumOfLogs = sumOfLogs; + } + + /** + * {@inheritDoc} + */ + @Override + public GeometricMean copy() { + GeometricMean result = new GeometricMean(); + copy(this, result); + return result; + } + + /** + * {@inheritDoc} + */ + @Override + public void increment(final double d) { + sumOfLogs.increment(d); + } + + /** + * {@inheritDoc} + */ + @Override + public double getResult() { + if (sumOfLogs.getN() > 0) { + return FastMath.exp(sumOfLogs.getResult() / sumOfLogs.getN()); + } else { + return Double.NaN; + } + } + + /** + * {@inheritDoc} + */ + @Override + public void clear() { + sumOfLogs.clear(); + } + + /** + * Returns the geometric mean of the entries in the specified portion + * of the input array. 
+ * <p> + * See {@link GeometricMean} for details on the computing algorithm.</p> + * <p> + * Throws <code>IllegalArgumentException</code> if the array is null.</p> + * + * @param values input array containing the values + * @param begin first array element to include + * @param length the number of elements to include + * @return the geometric mean or Double.NaN if length = 0 or + * any of the values are <= 0. + * @throws IllegalArgumentException if the input array is null or the array + * index parameters are not valid + */ + @Override + public double evaluate( + final double[] values, final int begin, final int length) { + return FastMath.exp( + sumOfLogs.evaluate(values, begin, length) / length); + } + + /** + * {@inheritDoc} + */ + public long getN() { + return sumOfLogs.getN(); + } + + /** + * <p>Sets the implementation for the sum of logs.</p> + * <p>This method must be activated before any data has been added - i.e., + * before {@link #increment(double) increment} has been used to add data; + * otherwise an IllegalStateException will be thrown.</p> + * + * @param sumLogImpl the StorelessUnivariateStatistic instance to use + * for computing the log sum + * @throws IllegalStateException if data has already been added + * (i.e if n > 0) + */ + public void setSumLogImpl( + StorelessUnivariateStatistic sumLogImpl) { + checkEmpty(); + this.sumOfLogs = sumLogImpl; + } + + /** + * Returns the currently configured sum of logs implementation + * + * @return the StorelessUnivariateStatistic implementing the log sum + */ + public StorelessUnivariateStatistic getSumLogImpl() { + return sumOfLogs; + } + + /** + * Copies source to dest. 
+ * <p>Neither source nor dest can be null.</p> + * + * @param source GeometricMean to copy + * @param dest GeometricMean to copy to + * @throws NullPointerException if either source or dest is null + */ + public static void copy(GeometricMean source, GeometricMean dest) { + dest.setData(source.getDataRef()); + dest.sumOfLogs = source.sumOfLogs.copy(); + } + + + /** + * Throws IllegalStateException if n > 0. + */ + private void checkEmpty() { + if (getN() > 0) { + throw MathRuntimeException.createIllegalStateException( + LocalizedFormats.VALUES_ADDED_BEFORE_CONFIGURING_STATISTIC, + getN()); + } + } + +} diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/moment/Kurtosis.java b/src/main/java/org/apache/commons/math/stat/descriptive/moment/Kurtosis.java new file mode 100644 index 0000000..f648051 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/moment/Kurtosis.java @@ -0,0 +1,222 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.math.stat.descriptive.moment; + +import java.io.Serializable; + +import org.apache.commons.math.MathRuntimeException; +import org.apache.commons.math.exception.util.LocalizedFormats; +import org.apache.commons.math.stat.descriptive.AbstractStorelessUnivariateStatistic; +import org.apache.commons.math.util.FastMath; + + +/** + * Computes the Kurtosis of the available values. + * <p> + * We use the following (unbiased) formula to define kurtosis:</p> + * <p> + * kurtosis = { [n(n+1) / (n -1)(n - 2)(n-3)] sum[(x_i - mean)^4] / std^4 } - [3(n-1)^2 / (n-2)(n-3)] + * </p><p> + * where n is the number of values, mean is the {@link Mean} and std is the + * {@link StandardDeviation}</p> + * <p> + * Note that this statistic is undefined for n < 4. <code>Double.Nan</code> + * is returned when there is not sufficient data to compute the statistic.</p> + * <p> + * <strong>Note that this implementation is not synchronized.</strong> If + * multiple threads access an instance of this class concurrently, and at least + * one of the threads invokes the <code>increment()</code> or + * <code>clear()</code> method, it must be synchronized externally.</p> + * + * @version $Revision: 1006299 $ $Date: 2010-10-10 16:47:17 +0200 (dim. 10 oct. 2010) $ + */ +public class Kurtosis extends AbstractStorelessUnivariateStatistic implements Serializable { + + /** Serializable version identifier */ + private static final long serialVersionUID = 2784465764798260919L; + + /**Fourth Moment on which this statistic is based */ + protected FourthMoment moment; + + /** + * Determines whether or not this statistic can be incremented or cleared. 
+ * <p> + * Statistics based on (constructed from) external moments cannot + * be incremented or cleared.</p> + */ + protected boolean incMoment; + + /** + * Construct a Kurtosis + */ + public Kurtosis() { + incMoment = true; + moment = new FourthMoment(); + } + + /** + * Construct a Kurtosis from an external moment + * + * @param m4 external Moment + */ + public Kurtosis(final FourthMoment m4) { + incMoment = false; + this.moment = m4; + } + + /** + * Copy constructor, creates a new {@code Kurtosis} identical + * to the {@code original} + * + * @param original the {@code Kurtosis} instance to copy + */ + public Kurtosis(Kurtosis original) { + copy(original, this); + } + + /** + * {@inheritDoc} + */ + @Override + public void increment(final double d) { + if (incMoment) { + moment.increment(d); + } else { + throw MathRuntimeException.createIllegalStateException( + LocalizedFormats.CANNOT_INCREMENT_STATISTIC_CONSTRUCTED_FROM_EXTERNAL_MOMENTS); + } + } + + /** + * {@inheritDoc} + */ + @Override + public double getResult() { + double kurtosis = Double.NaN; + if (moment.getN() > 3) { + double variance = moment.m2 / (moment.n - 1); + if (moment.n <= 3 || variance < 10E-20) { + kurtosis = 0.0; + } else { + double n = moment.n; + kurtosis = + (n * (n + 1) * moment.m4 - + 3 * moment.m2 * moment.m2 * (n - 1)) / + ((n - 1) * (n -2) * (n -3) * variance * variance); + } + } + return kurtosis; + } + + /** + * {@inheritDoc} + */ + @Override + public void clear() { + if (incMoment) { + moment.clear(); + } else { + throw MathRuntimeException.createIllegalStateException( + LocalizedFormats.CANNOT_CLEAR_STATISTIC_CONSTRUCTED_FROM_EXTERNAL_MOMENTS); + } + } + + /** + * {@inheritDoc} + */ + public long getN() { + return moment.getN(); + } + + /* UnvariateStatistic Approach */ + + /** + * Returns the kurtosis of the entries in the specified portion of the + * input array. 
+ * <p> + * See {@link Kurtosis} for details on the computing algorithm.</p> + * <p> + * Throws <code>IllegalArgumentException</code> if the array is null.</p> + * + * @param values the input array + * @param begin index of the first array element to include + * @param length the number of elements to include + * @return the kurtosis of the values or Double.NaN if length is less than + * 4 + * @throws IllegalArgumentException if the input array is null or the array + * index parameters are not valid + */ + @Override + public double evaluate(final double[] values,final int begin, final int length) { + // Initialize the kurtosis + double kurt = Double.NaN; + + if (test(values, begin, length) && length > 3) { + + // Compute the mean and standard deviation + Variance variance = new Variance(); + variance.incrementAll(values, begin, length); + double mean = variance.moment.m1; + double stdDev = FastMath.sqrt(variance.getResult()); + + // Sum the ^4 of the distance from the mean divided by the + // standard deviation + double accum3 = 0.0; + for (int i = begin; i < begin + length; i++) { + accum3 += FastMath.pow(values[i] - mean, 4.0); + } + accum3 /= FastMath.pow(stdDev, 4.0d); + + // Get N + double n0 = length; + + double coefficientOne = + (n0 * (n0 + 1)) / ((n0 - 1) * (n0 - 2) * (n0 - 3)); + double termTwo = + (3 * FastMath.pow(n0 - 1, 2.0)) / ((n0 - 2) * (n0 - 3)); + + // Calculate kurtosis + kurt = (coefficientOne * accum3) - termTwo; + } + return kurt; + } + + /** + * {@inheritDoc} + */ + @Override + public Kurtosis copy() { + Kurtosis result = new Kurtosis(); + copy(this, result); + return result; + } + + /** + * Copies source to dest. 
+ * <p>Neither source nor dest can be null.</p> + * + * @param source Kurtosis to copy + * @param dest Kurtosis to copy to + * @throws NullPointerException if either source or dest is null + */ + public static void copy(Kurtosis source, Kurtosis dest) { + dest.setData(source.getDataRef()); + dest.moment = source.moment.copy(); + dest.incMoment = source.incMoment; + } + +} diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/moment/Mean.java b/src/main/java/org/apache/commons/math/stat/descriptive/moment/Mean.java new file mode 100644 index 0000000..c5aa9da --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/moment/Mean.java @@ -0,0 +1,272 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.descriptive.moment; + +import java.io.Serializable; + +import org.apache.commons.math.stat.descriptive.AbstractStorelessUnivariateStatistic; +import org.apache.commons.math.stat.descriptive.WeightedEvaluation; +import org.apache.commons.math.stat.descriptive.summary.Sum; + +/** + * <p>Computes the arithmetic mean of a set of values. 
Uses the definitional + * formula:</p> + * <p> + * mean = sum(x_i) / n + * </p> + * <p>where <code>n</code> is the number of observations. + * </p> + * <p>When {@link #increment(double)} is used to add data incrementally from a + * stream of (unstored) values, the value of the statistic that + * {@link #getResult()} returns is computed using the following recursive + * updating algorithm: </p> + * <ol> + * <li>Initialize <code>m = </code> the first value</li> + * <li>For each additional value, update using <br> + * <code>m = m + (new value - m) / (number of observations)</code></li> + * </ol> + * <p> If {@link #evaluate(double[])} is used to compute the mean of an array + * of stored values, a two-pass, corrected algorithm is used, starting with + * the definitional formula computed using the array of stored values and then + * correcting this by adding the mean deviation of the data values from the + * arithmetic mean. See, e.g. "Comparison of Several Algorithms for Computing + * Sample Means and Variances," Robert F. Ling, Journal of the American + * Statistical Association, Vol. 69, No. 348 (Dec., 1974), pp. 859-866. </p> + * <p> + * Returns <code>Double.NaN</code> if the dataset is empty. + * </p> + * <strong>Note that this implementation is not synchronized.</strong> If + * multiple threads access an instance of this class concurrently, and at least + * one of the threads invokes the <code>increment()</code> or + * <code>clear()</code> method, it must be synchronized externally. + * + * @version $Revision: 1006299 $ $Date: 2010-10-10 16:47:17 +0200 (dim. 10 oct. 2010) $ + */ +public class Mean extends AbstractStorelessUnivariateStatistic + implements Serializable, WeightedEvaluation { + + /** Serializable version identifier */ + private static final long serialVersionUID = -1296043746617791564L; + + /** First moment on which this statistic is based. 
*/ + protected FirstMoment moment; + + /** + * Determines whether or not this statistic can be incremented or cleared. + * <p> + * Statistics based on (constructed from) external moments cannot + * be incremented or cleared.</p> + */ + protected boolean incMoment; + + /** Constructs a Mean. */ + public Mean() { + incMoment = true; + moment = new FirstMoment(); + } + + /** + * Constructs a Mean with an External Moment. + * + * @param m1 the moment + */ + public Mean(final FirstMoment m1) { + this.moment = m1; + incMoment = false; + } + + /** + * Copy constructor, creates a new {@code Mean} identical + * to the {@code original} + * + * @param original the {@code Mean} instance to copy + */ + public Mean(Mean original) { + copy(original, this); + } + + /** + * {@inheritDoc} + */ + @Override + public void increment(final double d) { + if (incMoment) { + moment.increment(d); + } + } + + /** + * {@inheritDoc} + */ + @Override + public void clear() { + if (incMoment) { + moment.clear(); + } + } + + /** + * {@inheritDoc} + */ + @Override + public double getResult() { + return moment.m1; + } + + /** + * {@inheritDoc} + */ + public long getN() { + return moment.getN(); + } + + /** + * Returns the arithmetic mean of the entries in the specified portion of + * the input array, or <code>Double.NaN</code> if the designated subarray + * is empty. 
+ * <p> + * Throws <code>IllegalArgumentException</code> if the array is null.</p> + * <p> + * See {@link Mean} for details on the computing algorithm.</p> + * + * @param values the input array + * @param begin index of the first array element to include + * @param length the number of elements to include + * @return the mean of the values or Double.NaN if length = 0 + * @throws IllegalArgumentException if the array is null or the array index + * parameters are not valid + */ + @Override + public double evaluate(final double[] values,final int begin, final int length) { + if (test(values, begin, length)) { + Sum sum = new Sum(); + double sampleSize = length; + + // Compute initial estimate using definitional formula + double xbar = sum.evaluate(values, begin, length) / sampleSize; + + // Compute correction factor in second pass + double correction = 0; + for (int i = begin; i < begin + length; i++) { + correction += values[i] - xbar; + } + return xbar + (correction/sampleSize); + } + return Double.NaN; + } + + /** + * Returns the weighted arithmetic mean of the entries in the specified portion of + * the input array, or <code>Double.NaN</code> if the designated subarray + * is empty. + * <p> + * Throws <code>IllegalArgumentException</code> if either array is null.</p> + * <p> + * See {@link Mean} for details on the computing algorithm. 
The two-pass algorithm + * described above is used here, with weights applied in computing both the original + * estimate and the correction factor.</p> + * <p> + * Throws <code>IllegalArgumentException</code> if any of the following are true: + * <ul><li>the values array is null</li> + * <li>the weights array is null</li> + * <li>the weights array does not have the same length as the values array</li> + * <li>the weights array contains one or more infinite values</li> + * <li>the weights array contains one or more NaN values</li> + * <li>the weights array contains negative values</li> + * <li>the start and length arguments do not determine a valid array</li> + * </ul></p> + * + * @param values the input array + * @param weights the weights array + * @param begin index of the first array element to include + * @param length the number of elements to include + * @return the mean of the values or Double.NaN if length = 0 + * @throws IllegalArgumentException if the parameters are not valid + * @since 2.1 + */ + public double evaluate(final double[] values, final double[] weights, + final int begin, final int length) { + if (test(values, weights, begin, length)) { + Sum sum = new Sum(); + + // Compute initial estimate using definitional formula + double sumw = sum.evaluate(weights,begin,length); + double xbarw = sum.evaluate(values, weights, begin, length) / sumw; + + // Compute correction factor in second pass + double correction = 0; + for (int i = begin; i < begin + length; i++) { + correction += weights[i] * (values[i] - xbarw); + } + return xbarw + (correction/sumw); + } + return Double.NaN; + } + + /** + * Returns the weighted arithmetic mean of the entries in the input array. + * <p> + * Throws <code>IllegalArgumentException</code> if either array is null.</p> + * <p> + * See {@link Mean} for details on the computing algorithm. 
The two-pass algorithm + * described above is used here, with weights applied in computing both the original + * estimate and the correction factor.</p> + * <p> + * Throws <code>IllegalArgumentException</code> if any of the following are true: + * <ul><li>the values array is null</li> + * <li>the weights array is null</li> + * <li>the weights array does not have the same length as the values array</li> + * <li>the weights array contains one or more infinite values</li> + * <li>the weights array contains one or more NaN values</li> + * <li>the weights array contains negative values</li> + * </ul></p> + * + * @param values the input array + * @param weights the weights array + * @return the mean of the values or Double.NaN if length = 0 + * @throws IllegalArgumentException if the parameters are not valid + * @since 2.1 + */ + public double evaluate(final double[] values, final double[] weights) { + return evaluate(values, weights, 0, values.length); + } + + /** + * {@inheritDoc} + */ + @Override + public Mean copy() { + Mean result = new Mean(); + copy(this, result); + return result; + } + + + /** + * Copies source to dest. + * <p>Neither source nor dest can be null.</p> + * + * @param source Mean to copy + * @param dest Mean to copy to + * @throws NullPointerException if either source or dest is null + */ + public static void copy(Mean source, Mean dest) { + dest.setData(source.getDataRef()); + dest.incMoment = source.incMoment; + dest.moment = source.moment.copy(); + } +} diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/moment/SecondMoment.java b/src/main/java/org/apache/commons/math/stat/descriptive/moment/SecondMoment.java new file mode 100644 index 0000000..ae8ef8e --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/moment/SecondMoment.java @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.descriptive.moment; + +import java.io.Serializable; + +/** + * Computes a statistic related to the Second Central Moment. Specifically, + * what is computed is the sum of squared deviations from the sample mean. + * <p> + * The following recursive updating formula is used:</p> + * <p> + * Let <ul> + * <li> dev = (current obs - previous mean) </li> + * <li> n = number of observations (including current obs) </li> + * </ul> + * Then</p> + * <p> + * new value = old value + dev^2 * (n -1) / n.</p> + * <p> + * Returns <code>Double.NaN</code> if no data values have been added and + * returns <code>0</code> if there is just one value in the data set.</p> + * <p> + * <strong>Note that this implementation is not synchronized.</strong> If + * multiple threads access an instance of this class concurrently, and at least + * one of the threads invokes the <code>increment()</code> or + * <code>clear()</code> method, it must be synchronized externally.</p> + * + * @version $Revision: 811685 $ $Date: 2009-09-05 19:36:48 +0200 (sam. 05 sept. 
2009) $ + */ +public class SecondMoment extends FirstMoment implements Serializable { + + /** Serializable version identifier */ + private static final long serialVersionUID = 3942403127395076445L; + + /** second moment of values that have been added */ + protected double m2; + + /** + * Create a SecondMoment instance + */ + public SecondMoment() { + super(); + m2 = Double.NaN; + } + + /** + * Copy constructor, creates a new {@code SecondMoment} identical + * to the {@code original} + * + * @param original the {@code SecondMoment} instance to copy + */ + public SecondMoment(SecondMoment original) { + super(original); + this.m2 = original.m2; + } + + /** + * {@inheritDoc} + */ + @Override + public void increment(final double d) { + if (n < 1) { + m1 = m2 = 0.0; + } + super.increment(d); + m2 += ((double) n - 1) * dev * nDev; + } + + /** + * {@inheritDoc} + */ + @Override + public void clear() { + super.clear(); + m2 = Double.NaN; + } + + /** + * {@inheritDoc} + */ + @Override + public double getResult() { + return m2; + } + + /** + * {@inheritDoc} + */ + @Override + public SecondMoment copy() { + SecondMoment result = new SecondMoment(); + copy(this, result); + return result; + } + + /** + * Copies source to dest. 
+ * <p>Neither source nor dest can be null.</p> + * + * @param source SecondMoment to copy + * @param dest SecondMoment to copy to + * @throws NullPointerException if either source or dest is null + */ + public static void copy(SecondMoment source, SecondMoment dest) { + FirstMoment.copy(source, dest); + dest.m2 = source.m2; + } + +} diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/moment/SemiVariance.java b/src/main/java/org/apache/commons/math/stat/descriptive/moment/SemiVariance.java new file mode 100644 index 0000000..04aa456 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/moment/SemiVariance.java @@ -0,0 +1,379 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.math.stat.descriptive.moment; + +import java.io.Serializable; +import org.apache.commons.math.exception.NullArgumentException; +import org.apache.commons.math.exception.util.LocalizedFormats; +import org.apache.commons.math.stat.descriptive.AbstractUnivariateStatistic; + +/** + * <p>Computes the semivariance of a set of values with respect to a given cutoff value. 
+ * We define the <i>downside semivariance</i> of a set of values <code>x</code> + * against the <i>cutoff value</i> <code>cutoff</code> to be <br/> + * <code>Σ (x[i] - target)<sup>2</sup> / df</code> <br/> + * where the sum is taken over all <code>i</code> such that <code>x[i] < cutoff</code> + * and <code>df</code> is the length of <code>x</code> (non-bias-corrected) or + * one less than this number (bias corrected). The <i>upside semivariance</i> + * is defined similarly, with the sum taken over values of <code>x</code> that + * exceed the cutoff value.</p> + * + * <p>The cutoff value defaults to the mean, bias correction defaults to <code>true</code> + * and the "variance direction" (upside or downside) defaults to downside. The variance direction + * and bias correction may be set using property setters or their values can provided as + * parameters to {@link #evaluate(double[], double, Direction, boolean, int, int)}.</p> + * + * <p>If the input array is null, <code>evaluate</code> methods throw + * <code>IllegalArgumentException.</code> If the array has length 1, <code>0</code> + * is returned, regardless of the value of the <code>cutoff.</code> + * + * <p><strong>Note that this class is not intended to be threadsafe.</strong> If + * multiple threads access an instance of this class concurrently, and one or + * more of these threads invoke property setters, external synchronization must + * be provided to ensure correct results.</p> + * + * @version $Revision: 1006299 $ $Date: 2010-10-10 16:47:17 +0200 (dim. 10 oct. 2010) $ + * @since 2.1 + */ + +public class SemiVariance extends AbstractUnivariateStatistic implements Serializable { + + /** + * The UPSIDE Direction is used to specify that the observations above the + * cutoff point will be used to calculate SemiVariance. 
+ */ + public static final Direction UPSIDE_VARIANCE = Direction.UPSIDE; + + /** + * The DOWNSIDE Direction is used to specify that the observations below + * the cutoff point will be used to calculate SemiVariance + */ + public static final Direction DOWNSIDE_VARIANCE = Direction.DOWNSIDE; + + /** Serializable version identifier */ + private static final long serialVersionUID = -2653430366886024994L; + + /** + * Determines whether or not bias correction is applied when computing the + * value of the statisic. True means that bias is corrected. + */ + private boolean biasCorrected = true; + + /** + * Determines whether to calculate downside or upside SemiVariance. + */ + private Direction varianceDirection = Direction.DOWNSIDE; + + /** + * Constructs a SemiVariance with default (true) <code>biasCorrected</code> + * property and default (Downside) <code>varianceDirection</code> property. + */ + public SemiVariance() { + } + + /** + * Constructs a SemiVariance with the specified <code>biasCorrected</code> + * property and default (Downside) <code>varianceDirection</code> property. + * + * @param biasCorrected setting for bias correction - true means + * bias will be corrected and is equivalent to using the argumentless + * constructor + */ + public SemiVariance(final boolean biasCorrected) { + this.biasCorrected = biasCorrected; + } + + + /** + * Constructs a SemiVariance with the specified <code>Direction</code> property + * and default (true) <code>biasCorrected</code> property + * + * @param direction setting for the direction of the SemiVariance + * to calculate + */ + public SemiVariance(final Direction direction) { + this.varianceDirection = direction; + } + + + /** + * Constructs a SemiVariance with the specified <code>isBiasCorrected</code> + * property and the specified <code>Direction</code> property. 
+ * + * @param corrected setting for bias correction - true means + * bias will be corrected and is equivalent to using the argumentless + * constructor + * + * @param direction setting for the direction of the SemiVariance + * to calculate + */ + public SemiVariance(final boolean corrected, final Direction direction) { + this.biasCorrected = corrected; + this.varianceDirection = direction; + } + + + /** + * Copy constructor, creates a new {@code SemiVariance} identical + * to the {@code original} + * + * @param original the {@code SemiVariance} instance to copy + */ + public SemiVariance(final SemiVariance original) { + copy(original, this); + } + + + /** + * {@inheritDoc} + */ + @Override + public SemiVariance copy() { + SemiVariance result = new SemiVariance(); + copy(this, result); + return result; + } + + + /** + * Copies source to dest. + * <p>Neither source nor dest can be null.</p> + * + * @param source SemiVariance to copy + * @param dest SemiVariance to copy to + * @throws NullPointerException if either source or dest is null + */ + public static void copy(final SemiVariance source, SemiVariance dest) { + dest.setData(source.getDataRef()); + dest.biasCorrected = source.biasCorrected; + dest.varianceDirection = source.varianceDirection; + } + + + /** + * This method calculates {@link SemiVariance} for the entire array against the mean, using + * instance properties varianceDirection and biasCorrection. 
+ * + * @param values the input array + * @return the SemiVariance + * @throws IllegalArgumentException if values is null + * + */ + @Override + public double evaluate(final double[] values) { + if (values == null) { + throw new NullArgumentException(LocalizedFormats.INPUT_ARRAY); + } + return evaluate(values, 0, values.length); + } + + + /** + * <p>Returns the {@link SemiVariance} of the designated values against the mean, using + * instance properties varianceDirection and biasCorrection.</p> + * + * <p>Returns <code>NaN</code> if the array is empty and throws + * <code>IllegalArgumentException</code> if the array is null.</p> + * + * @param values the input array + * @param start index of the first array element to include + * @param length the number of elements to include + * @return the SemiVariance + * @throws IllegalArgumentException if the parameters are not valid + * + */ + @Override + public double evaluate(final double[] values, final int start, final int length) { + double m = (new Mean()).evaluate(values, start, length); + return evaluate(values, m, varianceDirection, biasCorrected, 0, values.length); + } + + + /** + * This method calculates {@link SemiVariance} for the entire array against the mean, using + * the current value of the biasCorrection instance property. 
+ * + * @param values the input array + * @param direction the {@link Direction} of the semivariance + * @return the SemiVariance + * @throws IllegalArgumentException if values is null + * + */ + public double evaluate(final double[] values, Direction direction) { + double m = (new Mean()).evaluate(values); + return evaluate (values, m, direction, biasCorrected, 0, values.length); + } + + /** + * <p>Returns the {@link SemiVariance} of the designated values against the cutoff, using + * instance properties variancDirection and biasCorrection.</p> + * + * <p>Returns <code>NaN</code> if the array is empty and throws + * <code>IllegalArgumentException</code> if the array is null.</p> + * + * @param values the input array + * @param cutoff the reference point + * @return the SemiVariance + * @throws IllegalArgumentException if values is null + */ + public double evaluate(final double[] values, final double cutoff) { + return evaluate(values, cutoff, varianceDirection, biasCorrected, 0, values.length); + } + + /** + * <p>Returns the {@link SemiVariance} of the designated values against the cutoff in the + * given direction, using the current value of the biasCorrection instance property.</p> + * + * <p>Returns <code>NaN</code> if the array is empty and throws + * <code>IllegalArgumentException</code> if the array is null.</p> + * + * @param values the input array + * @param cutoff the reference point + * @param direction the {@link Direction} of the semivariance + * @return the SemiVariance + * @throws IllegalArgumentException if values is null + */ + public double evaluate(final double[] values, final double cutoff, final Direction direction) { + return evaluate(values, cutoff, direction, biasCorrected, 0, values.length); + } + + + /** + * <p>Returns the {@link SemiVariance} of the designated values against the cutoff + * in the given direction with the provided bias correction.</p> + * + * <p>Returns <code>NaN</code> if the array is empty and throws + * 
<code>IllegalArgumentException</code> if the array is null.</p> + * + * @param values the input array + * @param cutoff the reference point + * @param direction the {@link Direction} of the semivariance + * @param corrected the BiasCorrection flag + * @param start index of the first array element to include + * @param length the number of elements to include + * @return the SemiVariance + * @throws IllegalArgumentException if the parameters are not valid + * + */ + public double evaluate (final double[] values, final double cutoff, final Direction direction, + final boolean corrected, final int start, final int length) { + + test(values, start, length); + if (values.length == 0) { + return Double.NaN; + } else { + if (values.length == 1) { + return 0.0; + } else { + final boolean booleanDirection = direction.getDirection(); + + double dev = 0.0; + double sumsq = 0.0; + for (int i = start; i < length; i++) { + if ((values[i] > cutoff) == booleanDirection) { + dev = values[i] - cutoff; + sumsq += dev * dev; + } + } + + if (corrected) { + return sumsq / (length - 1.0); + } else { + return sumsq / length; + } + } + } + } + + /** + * Returns true iff biasCorrected property is set to true. + * + * @return the value of biasCorrected. + */ + public boolean isBiasCorrected() { + return biasCorrected; + } + + /** + * Sets the biasCorrected property. + * + * @param biasCorrected new biasCorrected property value + */ + public void setBiasCorrected(boolean biasCorrected) { + this.biasCorrected = biasCorrected; + } + + /** + * Returns the varianceDirection property. 
+ * + * @return the varianceDirection + */ + public Direction getVarianceDirection () { + return varianceDirection; + } + + /** + * Sets the variance direction + * + * @param varianceDirection the direction of the semivariance + */ + public void setVarianceDirection(Direction varianceDirection) { + this.varianceDirection = varianceDirection; + } + + /** + * The direction of the semivariance - either upside or downside. The direction + * is represented by boolean, with true corresponding to UPSIDE semivariance. + */ + public enum Direction { + /** + * The UPSIDE Direction is used to specify that the observations above the + * cutoff point will be used to calculate SemiVariance + */ + UPSIDE (true), + + /** + * The DOWNSIDE Direction is used to specify that the observations below + * the cutoff point will be used to calculate SemiVariance + */ + DOWNSIDE (false); + + /** + * boolean value UPSIDE <-> true + */ + private boolean direction; + + /** + * Create a Direction with the given value. + * + * @param b boolean value representing the Direction. True corresponds to UPSIDE. + */ + Direction (boolean b) { + direction = b; + } + + /** + * Returns the value of this Direction. True corresponds to UPSIDE. + * + * @return true if direction is UPSIDE; false otherwise + */ + boolean getDirection () { + return direction; + } + } +} diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/moment/Skewness.java b/src/main/java/org/apache/commons/math/stat/descriptive/moment/Skewness.java new file mode 100644 index 0000000..d16f956 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/moment/Skewness.java @@ -0,0 +1,213 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.descriptive.moment; + +import java.io.Serializable; + +import org.apache.commons.math.stat.descriptive.AbstractStorelessUnivariateStatistic; +import org.apache.commons.math.util.FastMath; + +/** + * Computes the skewness of the available values. + * <p> + * We use the following (unbiased) formula to define skewness:</p> + * <p> + * skewness = [n / (n -1) (n - 2)] sum[(x_i - mean)^3] / std^3 </p> + * <p> + * where n is the number of values, mean is the {@link Mean} and std is the + * {@link StandardDeviation} </p> + * <p> + * <strong>Note that this implementation is not synchronized.</strong> If + * multiple threads access an instance of this class concurrently, and at least + * one of the threads invokes the <code>increment()</code> or + * <code>clear()</code> method, it must be synchronized externally. </p> + * + * @version $Revision: 1006299 $ $Date: 2010-10-10 16:47:17 +0200 (dim. 10 oct. 2010) $ + */ +public class Skewness extends AbstractStorelessUnivariateStatistic implements Serializable { + + /** Serializable version identifier */ + private static final long serialVersionUID = 7101857578996691352L; + + /** Third moment on which this statistic is based */ + protected ThirdMoment moment = null; + + /** + * Determines whether or not this statistic can be incremented or cleared. 
+ * <p> + * Statistics based on (constructed from) external moments cannot + * be incremented or cleared.</p> + */ + protected boolean incMoment; + + /** + * Constructs a Skewness + */ + public Skewness() { + incMoment = true; + moment = new ThirdMoment(); + } + + /** + * Constructs a Skewness with an external moment + * @param m3 external moment + */ + public Skewness(final ThirdMoment m3) { + incMoment = false; + this.moment = m3; + } + + /** + * Copy constructor, creates a new {@code Skewness} identical + * to the {@code original} + * + * @param original the {@code Skewness} instance to copy + */ + public Skewness(Skewness original) { + copy(original, this); + } + + /** + * {@inheritDoc} + */ + @Override + public void increment(final double d) { + if (incMoment) { + moment.increment(d); + } + } + + /** + * Returns the value of the statistic based on the values that have been added. + * <p> + * See {@link Skewness} for the definition used in the computation.</p> + * + * @return the skewness of the available values. + */ + @Override + public double getResult() { + + if (moment.n < 3) { + return Double.NaN; + } + double variance = moment.m2 / (moment.n - 1); + if (variance < 10E-20) { + return 0.0d; + } else { + double n0 = moment.getN(); + return (n0 * moment.m3) / + ((n0 - 1) * (n0 -2) * FastMath.sqrt(variance) * variance); + } + } + + /** + * {@inheritDoc} + */ + public long getN() { + return moment.getN(); + } + + /** + * {@inheritDoc} + */ + @Override + public void clear() { + if (incMoment) { + moment.clear(); + } + } + + /** + * Returns the Skewness of the entries in the specifed portion of the + * input array. 
+ * <p> + * See {@link Skewness} for the definition used in the computation.</p> + * <p> + * Throws <code>IllegalArgumentException</code> if the array is null.</p> + * + * @param values the input array + * @param begin the index of the first array element to include + * @param length the number of elements to include + * @return the skewness of the values or Double.NaN if length is less than + * 3 + * @throws IllegalArgumentException if the array is null or the array index + * parameters are not valid + */ + @Override + public double evaluate(final double[] values,final int begin, + final int length) { + + // Initialize the skewness + double skew = Double.NaN; + + if (test(values, begin, length) && length > 2 ){ + Mean mean = new Mean(); + // Get the mean and the standard deviation + double m = mean.evaluate(values, begin, length); + + // Calc the std, this is implemented here instead + // of using the standardDeviation method eliminate + // a duplicate pass to get the mean + double accum = 0.0; + double accum2 = 0.0; + for (int i = begin; i < begin + length; i++) { + final double d = values[i] - m; + accum += d * d; + accum2 += d; + } + final double variance = (accum - (accum2 * accum2 / length)) / (length - 1); + + double accum3 = 0.0; + for (int i = begin; i < begin + length; i++) { + final double d = values[i] - m; + accum3 += d * d * d; + } + accum3 /= variance * FastMath.sqrt(variance); + + // Get N + double n0 = length; + + // Calculate skewness + skew = (n0 / ((n0 - 1) * (n0 - 2))) * accum3; + } + return skew; + } + + /** + * {@inheritDoc} + */ + @Override + public Skewness copy() { + Skewness result = new Skewness(); + copy(this, result); + return result; + } + + /** + * Copies source to dest. 
+ * <p>Neither source nor dest can be null.</p> + * + * @param source Skewness to copy + * @param dest Skewness to copy to + * @throws NullPointerException if either source or dest is null + */ + public static void copy(Skewness source, Skewness dest) { + dest.setData(source.getDataRef()); + dest.moment = new ThirdMoment(source.moment.copy()); + dest.incMoment = source.incMoment; + } +} diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/moment/StandardDeviation.java b/src/main/java/org/apache/commons/math/stat/descriptive/moment/StandardDeviation.java new file mode 100644 index 0000000..837ae3b --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/moment/StandardDeviation.java @@ -0,0 +1,271 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.descriptive.moment; + +import java.io.Serializable; + +import org.apache.commons.math.stat.descriptive.AbstractStorelessUnivariateStatistic; +import org.apache.commons.math.util.FastMath; + +/** + * Computes the sample standard deviation. The standard deviation + * is the positive square root of the variance. This implementation wraps a + * {@link Variance} instance. 
The <code>isBiasCorrected</code> property of the + * wrapped Variance instance is exposed, so that this class can be used to + * compute both the "sample standard deviation" (the square root of the + * bias-corrected "sample variance") or the "population standard deviation" + * (the square root of the non-bias-corrected "population variance"). See + * {@link Variance} for more information. + * <p> + * <strong>Note that this implementation is not synchronized.</strong> If + * multiple threads access an instance of this class concurrently, and at least + * one of the threads invokes the <code>increment()</code> or + * <code>clear()</code> method, it must be synchronized externally.</p> + * + * @version $Revision: 1006299 $ $Date: 2010-10-10 16:47:17 +0200 (dim. 10 oct. 2010) $ + */ +public class StandardDeviation extends AbstractStorelessUnivariateStatistic + implements Serializable { + + /** Serializable version identifier */ + private static final long serialVersionUID = 5728716329662425188L; + + /** Wrapped Variance instance */ + private Variance variance = null; + + /** + * Constructs a StandardDeviation. Sets the underlying {@link Variance} + * instance's <code>isBiasCorrected</code> property to true. + */ + public StandardDeviation() { + variance = new Variance(); + } + + /** + * Constructs a StandardDeviation from an external second moment. + * + * @param m2 the external moment + */ + public StandardDeviation(final SecondMoment m2) { + variance = new Variance(m2); + } + + /** + * Copy constructor, creates a new {@code StandardDeviation} identical + * to the {@code original} + * + * @param original the {@code StandardDeviation} instance to copy + */ + public StandardDeviation(StandardDeviation original) { + copy(original, this); + } + + /** + * Contructs a StandardDeviation with the specified value for the + * <code>isBiasCorrected</code> property. 
If this property is set to + * <code>true</code>, the {@link Variance} used in computing results will + * use the bias-corrected, or "sample" formula. See {@link Variance} for + * details. + * + * @param isBiasCorrected whether or not the variance computation will use + * the bias-corrected formula + */ + public StandardDeviation(boolean isBiasCorrected) { + variance = new Variance(isBiasCorrected); + } + + /** + * Contructs a StandardDeviation with the specified value for the + * <code>isBiasCorrected</code> property and the supplied external moment. + * If <code>isBiasCorrected</code> is set to <code>true</code>, the + * {@link Variance} used in computing results will use the bias-corrected, + * or "sample" formula. See {@link Variance} for details. + * + * @param isBiasCorrected whether or not the variance computation will use + * the bias-corrected formula + * @param m2 the external moment + */ + public StandardDeviation(boolean isBiasCorrected, SecondMoment m2) { + variance = new Variance(isBiasCorrected, m2); + } + + /** + * {@inheritDoc} + */ + @Override + public void increment(final double d) { + variance.increment(d); + } + + /** + * {@inheritDoc} + */ + public long getN() { + return variance.getN(); + } + + /** + * {@inheritDoc} + */ + @Override + public double getResult() { + return FastMath.sqrt(variance.getResult()); + } + + /** + * {@inheritDoc} + */ + @Override + public void clear() { + variance.clear(); + } + + /** + * Returns the Standard Deviation of the entries in the input array, or + * <code>Double.NaN</code> if the array is empty. + * <p> + * Returns 0 for a single-value (i.e. 
length = 1) sample.</p> + * <p> + * Throws <code>IllegalArgumentException</code> if the array is null.</p> + * <p> + * Does not change the internal state of the statistic.</p> + * + * @param values the input array + * @return the standard deviation of the values or Double.NaN if length = 0 + * @throws IllegalArgumentException if the array is null + */ + @Override + public double evaluate(final double[] values) { + return FastMath.sqrt(variance.evaluate(values)); + } + + /** + * Returns the Standard Deviation of the entries in the specified portion of + * the input array, or <code>Double.NaN</code> if the designated subarray + * is empty. + * <p> + * Returns 0 for a single-value (i.e. length = 1) sample. </p> + * <p> + * Throws <code>IllegalArgumentException</code> if the array is null.</p> + * <p> + * Does not change the internal state of the statistic.</p> + * + * @param values the input array + * @param begin index of the first array element to include + * @param length the number of elements to include + * @return the standard deviation of the values or Double.NaN if length = 0 + * @throws IllegalArgumentException if the array is null or the array index + * parameters are not valid + */ + @Override + public double evaluate(final double[] values, final int begin, final int length) { + return FastMath.sqrt(variance.evaluate(values, begin, length)); + } + + /** + * Returns the Standard Deviation of the entries in the specified portion of + * the input array, using the precomputed mean value. Returns + * <code>Double.NaN</code> if the designated subarray is empty. + * <p> + * Returns 0 for a single-value (i.e. length = 1) sample.</p> + * <p> + * The formula used assumes that the supplied mean value is the arithmetic + * mean of the sample data, not a known population parameter. 
This method + * is supplied only to save computation when the mean has already been + * computed.</p> + * <p> + * Throws <code>IllegalArgumentException</code> if the array is null.</p> + * <p> + * Does not change the internal state of the statistic.</p> + * + * @param values the input array + * @param mean the precomputed mean value + * @param begin index of the first array element to include + * @param length the number of elements to include + * @return the standard deviation of the values or Double.NaN if length = 0 + * @throws IllegalArgumentException if the array is null or the array index + * parameters are not valid + */ + public double evaluate(final double[] values, final double mean, + final int begin, final int length) { + return FastMath.sqrt(variance.evaluate(values, mean, begin, length)); + } + + /** + * Returns the Standard Deviation of the entries in the input array, using + * the precomputed mean value. Returns + * <code>Double.NaN</code> if the designated subarray is empty. + * <p> + * Returns 0 for a single-value (i.e. length = 1) sample.</p> + * <p> + * The formula used assumes that the supplied mean value is the arithmetic + * mean of the sample data, not a known population parameter. This method + * is supplied only to save computation when the mean has already been + * computed.</p> + * <p> + * Throws <code>IllegalArgumentException</code> if the array is null.</p> + * <p> + * Does not change the internal state of the statistic.</p> + * + * @param values the input array + * @param mean the precomputed mean value + * @return the standard deviation of the values or Double.NaN if length = 0 + * @throws IllegalArgumentException if the array is null + */ + public double evaluate(final double[] values, final double mean) { + return FastMath.sqrt(variance.evaluate(values, mean)); + } + + /** + * @return Returns the isBiasCorrected. 
+ */ + public boolean isBiasCorrected() { + return variance.isBiasCorrected(); + } + + /** + * @param isBiasCorrected The isBiasCorrected to set. + */ + public void setBiasCorrected(boolean isBiasCorrected) { + variance.setBiasCorrected(isBiasCorrected); + } + + /** + * {@inheritDoc} + */ + @Override + public StandardDeviation copy() { + StandardDeviation result = new StandardDeviation(); + copy(this, result); + return result; + } + + + /** + * Copies source to dest. + * <p>Neither source nor dest can be null.</p> + * + * @param source StandardDeviation to copy + * @param dest StandardDeviation to copy to + * @throws NullPointerException if either source or dest is null + */ + public static void copy(StandardDeviation source, StandardDeviation dest) { + dest.setData(source.getDataRef()); + dest.variance = source.variance.copy(); + } + +} diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/moment/ThirdMoment.java b/src/main/java/org/apache/commons/math/stat/descriptive/moment/ThirdMoment.java new file mode 100644 index 0000000..5c50989 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/moment/ThirdMoment.java @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.math.stat.descriptive.moment; + +import java.io.Serializable; + + +/** + * Computes a statistic related to the Third Central Moment. Specifically, + * what is computed is the sum of cubed deviations from the sample mean. + * <p> + * The following recursive updating formula is used:</p> + * <p> + * Let <ul> + * <li> dev = (current obs - previous mean) </li> + * <li> m2 = previous value of {@link SecondMoment} </li> + * <li> n = number of observations (including current obs) </li> + * </ul> + * Then</p> + * <p> + * new value = old value - 3 * (dev/n) * m2 + (n - 1) * (n - 2) * (dev^3/n^2)</p> + * <p> + * Returns <code>Double.NaN</code> if no data values have been added and + * returns <code>0</code> if there is just one value in the data set.</p> + * <p> + * <strong>Note that this implementation is not synchronized.</strong> If + * multiple threads access an instance of this class concurrently, and at least + * one of the threads invokes the <code>increment()</code> or + * <code>clear()</code> method, it must be synchronized externally.</p> + * + * @version $Revision: 811685 $ $Date: 2009-09-05 19:36:48 +0200 (sam. 05 sept. 2009) $ + */ +public class ThirdMoment extends SecondMoment implements Serializable { + + /** Serializable version identifier */ + private static final long serialVersionUID = -7818711964045118679L; + + /** Sum of cubed deviations from the mean of the values that have been added */ + protected double m3; + + /** + * Square of deviation of most recently added value from previous first + * moment, normalized by previous sample size. Retained to prevent + * repeated computation in higher order moments. nDevSq = nDev * nDev. 
+ */ + protected double nDevSq; + + /** + * Create a ThirdMoment instance. Both <code>m3</code> and + * <code>nDevSq</code> start out as <code>Double.NaN</code>. + */ + public ThirdMoment() { + super(); + m3 = Double.NaN; + nDevSq = Double.NaN; + } + + /** + * Copy constructor, creates a new {@code ThirdMoment} identical + * to the {@code original} + * + * @param original the {@code ThirdMoment} instance to copy + */ + public ThirdMoment(ThirdMoment original) { + copy(original, this); + } + + /** + * {@inheritDoc} + * <p>Applies the recursive update given in the class description: the + * second moment is read before the superclass update and then combined + * with the refreshed <code>n</code>, <code>dev</code> and <code>nDev</code>.</p> + */ + @Override + public void increment(final double d) { + /* m3 is NaN after construction or clear(); zero the moments before the first observation */ if (n < 1) { + m3 = m2 = m1 = 0.0; + } + + /* must be captured before super.increment(d) runs: the formula uses the previous second moment */ double prevM2 = m2; + super.increment(d); + nDevSq = nDev * nDev; + /* widen n to double so the (n - 1) * (n - 2) product is computed in floating point */ double n0 = n; + m3 = m3 - 3.0 * nDev * prevM2 + (n0 - 1) * (n0 - 2) * nDevSq * dev; + } + + /** + * {@inheritDoc} + * <p>Returns the current value of <code>m3</code>.</p> + */ + @Override + public double getResult() { + return m3; + } + + /** + * {@inheritDoc} + * <p>Also resets <code>m3</code> and <code>nDevSq</code> to + * <code>Double.NaN</code>.</p> + */ + @Override + public void clear() { + super.clear(); + m3 = Double.NaN; + nDevSq = Double.NaN; + } + + /** + * {@inheritDoc} + */ + @Override + public ThirdMoment copy() { + ThirdMoment result = new ThirdMoment(); + copy(this, result); + return result; + } + + /** + * Copies source to dest. 
+ * <p>Neither source nor dest can be null.</p> + * <p>Delegates the inherited state to {@link SecondMoment#copy(SecondMoment, SecondMoment)} + * and then copies <code>m3</code> and <code>nDevSq</code>.</p> + * + * @param source ThirdMoment to copy + * @param dest ThirdMoment to copy to + * @throws NullPointerException if either source or dest is null + */ + public static void copy(ThirdMoment source, ThirdMoment dest) { + SecondMoment.copy(source, dest); + dest.m3 = source.m3; + dest.nDevSq = source.nDevSq; + } + +} diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/moment/Variance.java b/src/main/java/org/apache/commons/math/stat/descriptive/moment/Variance.java new file mode 100644 index 0000000..6ce6835 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/moment/Variance.java @@ -0,0 +1,610 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.descriptive.moment; + +import java.io.Serializable; + +import org.apache.commons.math.exception.NullArgumentException; +import org.apache.commons.math.exception.util.LocalizedFormats; +import org.apache.commons.math.stat.descriptive.WeightedEvaluation; +import org.apache.commons.math.stat.descriptive.AbstractStorelessUnivariateStatistic; + +/** + * Computes the variance of the available values. By default, the unbiased + * "sample variance" definitional formula is used: + * <p> + * variance = sum((x_i - mean)^2) / (n - 1) </p> + * <p> + * where mean is the {@link Mean} and <code>n</code> is the number + * of sample observations.</p> + * <p> + * The definitional formula does not have good numerical properties, so + * this implementation does not compute the statistic using the definitional + * formula. <ul> + * <li> The <code>getResult</code> method computes the variance using + * updating formulas based on West's algorithm, as described in + * <a href="http://doi.acm.org/10.1145/359146.359152"> Chan, T. F. and + * J. G. Lewis 1979, <i>Communications of the ACM</i>, + * vol. 22 no. 9, pp. 
526-531.</a></li> + * <li> The <code>evaluate</code> methods leverage the fact that they have the + * full array of values in memory to execute a two-pass algorithm. + * Specifically, these methods use the "corrected two-pass algorithm" from + * Chan, Golub, Levesque, <i>Algorithms for Computing the Sample Variance</i>, + * American Statistician, vol. 37, no. 3 (1983) pp. 242-247.</li></ul> + * Note that adding values using <code>increment</code> or + * <code>incrementAll</code> and then executing <code>getResult</code> will + * sometimes give a different, less accurate, result than executing + * <code>evaluate</code> with the full array of values. The former approach + * should only be used when the full array of values is not available.</p> + * <p> + * The "population variance" ( sum((x_i - mean)^2) / n ) can also + * be computed using this statistic. The <code>isBiasCorrected</code> + * property determines whether the "population" or "sample" value is + * returned by the <code>evaluate</code> and <code>getResult</code> methods. + * To compute population variances, set this property to <code>false.</code> + * </p> + * <p> + * <strong>Note that this implementation is not synchronized.</strong> If + * multiple threads access an instance of this class concurrently, and at least + * one of the threads invokes the <code>increment()</code> or + * <code>clear()</code> method, it must be synchronized externally.</p> + * + * @version $Revision: 1006299 $ $Date: 2010-10-10 16:47:17 +0200 (dim. 10 oct. 
2010) $ + */ +public class Variance extends AbstractStorelessUnivariateStatistic implements Serializable, WeightedEvaluation { + + /** Serializable version identifier */ + private static final long serialVersionUID = -9111962718267217978L; + + /** SecondMoment is used in incremental calculation of Variance */ + protected SecondMoment moment = null; + + /** + * Flag that determines whether this Variance should also increment + * (and clear) the second moment. It is false when this Variance is + * constructed with an external SecondMoment as a parameter. + */ + protected boolean incMoment = true; + + /** + * Determines whether or not bias correction is applied when computing the + * value of the statistic. True means that bias is corrected. See + * {@link Variance} for details on the formula. + */ + private boolean isBiasCorrected = true; + + /** + * Constructs a Variance with default (true) <code>isBiasCorrected</code> + * property. + */ + public Variance() { + moment = new SecondMoment(); + } + + /** + * Constructs a Variance based on an external second moment. + * <p>The supplied moment is used as-is: {@link #increment(double)} and + * {@link #clear()} on this instance leave it untouched.</p> + * + * @param m2 the SecondMoment (Third or Fourth moments work + * here as well.) + */ + public Variance(final SecondMoment m2) { + incMoment = false; + this.moment = m2; + } + + /** + * Constructs a Variance with the specified <code>isBiasCorrected</code> + * property + * + * @param isBiasCorrected setting for bias correction - true means + * bias will be corrected and is equivalent to using the argumentless + * constructor + */ + public Variance(boolean isBiasCorrected) { + moment = new SecondMoment(); + this.isBiasCorrected = isBiasCorrected; + } + + /** + * Constructs a Variance with the specified <code>isBiasCorrected</code> + * property and the supplied external second moment. 
+ * + * @param isBiasCorrected setting for bias correction - true means + * bias will be corrected + * @param m2 the SecondMoment (Third or Fourth moments work + * here as well.) 
+ */ + public Variance(boolean isBiasCorrected, SecondMoment m2) { + incMoment = false; + this.moment = m2; + this.isBiasCorrected = isBiasCorrected; + } + + /** + * Copy constructor, creates a new {@code Variance} identical + * to the {@code original} + * + * @param original the {@code Variance} instance to copy + */ + public Variance(Variance original) { + copy(original, this); + } + + /** + * {@inheritDoc} + * <p>If all values are available, it is more accurate to use + * {@link #evaluate(double[])} rather than adding values one at a time + * using this method and then executing {@link #getResult}, since + * <code>evaluate</code> leverages the fact that it has the full + * list of values together to execute a two-pass algorithm. + * See {@link Variance}.</p> + * <p>This call does nothing when the instance was constructed with an + * external {@link SecondMoment}; that moment is left untouched.</p> + */ + @Override + public void increment(final double d) { + if (incMoment) { + moment.increment(d); + } + } + + /** + * {@inheritDoc} + * <p>Returns <code>Double.NaN</code> when the moment holds no values and + * 0 when it holds exactly one. Otherwise returns the moment's + * <code>m2</code> divided by <code>n - 1</code> if bias correction is on, + * or by <code>n</code> if it is off.</p> + */ + @Override + public double getResult() { + if (moment.n == 0) { + return Double.NaN; + } else if (moment.n == 1) { + return 0d; + } else { + if (isBiasCorrected) { + return moment.m2 / (moment.n - 1d); + } else { + return moment.m2 / (moment.n); + } + } + } + + /** + * {@inheritDoc} + */ + public long getN() { + return moment.getN(); + } + + /** + * {@inheritDoc} + * <p>Does nothing when the instance was constructed with an external + * {@link SecondMoment}.</p> + */ + @Override + public void clear() { + if (incMoment) { + moment.clear(); + } + } + + /** + * Returns the variance of the entries in the input array, or + * <code>Double.NaN</code> if the array is empty. 
+ * <p> + * See {@link Variance} for details on the computing algorithm.</p> + * <p> + * Returns 0 for a single-value (i.e. 
length = 1) sample.</p> + * <p> + * Throws <code>IllegalArgumentException</code> if the array is null.</p> + * <p> + * Does not change the internal state of the statistic.</p> + * + * @param values the input array + * @return the variance of the values or Double.NaN if length = 0 + * @throws IllegalArgumentException if the array is null + */ + @Override + public double evaluate(final double[] values) { + if (values == null) { + throw new NullArgumentException(LocalizedFormats.INPUT_ARRAY); + } + return evaluate(values, 0, values.length); + } + + /** + * Returns the variance of the entries in the specified portion of + * the input array, or <code>Double.NaN</code> if the designated subarray + * is empty. + * <p> + * See {@link Variance} for details on the computing algorithm.</p> + * <p> + * Returns 0 for a single-value (i.e. length = 1) sample.</p> + * <p> + * Does not change the internal state of the statistic.</p> + * <p> + * Throws <code>IllegalArgumentException</code> if the array is null.</p> + * + * @param values the input array + * @param begin index of the first array element to include + * @param length the number of elements to include + * @return the variance of the values or Double.NaN if length = 0 + * @throws IllegalArgumentException if the array is null or the array index + * parameters are not valid + */ + @Override + public double evaluate(final double[] values, final int begin, final int length) { + + double var = Double.NaN; + + if (test(values, begin, length)) { + /* No clear() here: the two-pass computation below never reads the wrapped moment, so values accumulated through increment() are deliberately preserved, as the contract above promises. 
*/ + if (length == 1) { + var = 0.0; + } else if (length > 1) { + Mean mean = new Mean(); + double m = mean.evaluate(values, begin, length); + var = evaluate(values, m, begin, length); + } + } + return var; + } + + /** + * <p>Returns the weighted variance of the entries in the specified portion of + * the input array, or <code>Double.NaN</code> if the designated subarray + * is empty.</p> + * <p> + * Uses the formula <pre> + * Σ(weights[i]*(values[i] - 
weightedMean)<sup>2</sup>)/(Σ(weights[i]) - 1) + * </pre> + * where weightedMean is the weighted mean</p> + * <p> + * This formula will not return the same result as the unweighted variance when all + * weights are equal, unless all weights are equal to 1. The formula assumes that + * weights are to be treated as "expansion values," as will be the case if for example + * the weights represent frequency counts. To normalize weights so that the denominator + * in the variance computation equals the length of the input vector minus one, use <pre> + * <code>evaluate(values, MathUtils.normalizeArray(weights, values.length)); </code> + * </pre> + * <p> + * Returns 0 for a single-value (i.e. length = 1) sample.</p> + * <p> + * Throws <code>IllegalArgumentException</code> if any of the following are true: + * <ul><li>the values array is null</li> + * <li>the weights array is null</li> + * <li>the weights array does not have the same length as the values array</li> + * <li>the weights array contains one or more infinite values</li> + * <li>the weights array contains one or more NaN values</li> + * <li>the weights array contains negative values</li> + * <li>the start and length arguments do not determine a valid array</li> + * </ul></p> + * <p> + * Does not change the internal state of the statistic.</p> + * <p> + * Throws <code>IllegalArgumentException</code> if either array is null.</p> + * + * @param values the input array + * @param weights the weights array + * @param begin index of the first array element to include + * @param length the number of elements to include + * @return the weighted variance of the values or Double.NaN if length = 0 + * @throws IllegalArgumentException if the parameters are not valid + * @since 2.1 + */ + public double evaluate(final double[] values, final double[] weights, + final int begin, final int length) { + + double var = Double.NaN; + + if (test(values, weights, begin, length)) { + /* No clear() here: nothing below reads the wrapped moment, so accumulated 
state is deliberately preserved, as the contract above promises. */ + if (length == 1) { + var = 0.0; + } else if 
(length > 1) { + Mean mean = new Mean(); + double m = mean.evaluate(values, weights, begin, length); + var = evaluate(values, weights, m, begin, length); + } + } + return var; + } + + /** + * <p> + * Returns the weighted variance of the entries in the input array.</p> + * <p> + * Uses the formula <pre> + * Σ(weights[i]*(values[i] - weightedMean)<sup>2</sup>)/(Σ(weights[i]) - 1) + * </pre> + * where weightedMean is the weighted mean</p> + * <p> + * This formula will not return the same result as the unweighted variance when all + * weights are equal, unless all weights are equal to 1. The formula assumes that + * weights are to be treated as "expansion values," as will be the case if for example + * the weights represent frequency counts. To normalize weights so that the denominator + * in the variance computation equals the length of the input vector minus one, use <pre> + * <code>evaluate(values, MathUtils.normalizeArray(weights, values.length)); </code> + * </pre> + * <p> + * Returns 0 for a single-value (i.e. 
length = 1) sample.</p> + * <p> + * Throws <code>IllegalArgumentException</code> if any of the following are true: + * <ul><li>the values array is null</li> + * <li>the weights array is null</li> + * <li>the weights array does not have the same length as the values array</li> + * <li>the weights array contains one or more infinite values</li> + * <li>the weights array contains one or more NaN values</li> + * <li>the weights array contains negative values</li> + * </ul></p> + * <p> + * Does not change the internal state of the statistic.</p> + * <p> + * Throws <code>IllegalArgumentException</code> if either array is null.</p> + * + * @param values the input array + * @param weights the weights array + * @return the weighted variance of the values + * @throws IllegalArgumentException if the parameters are not valid + * @since 2.1 + */ + public double evaluate(final double[] values, final double[] weights) { + /* Reject a null array explicitly, as evaluate(double[]) does; otherwise values.length below fails with a NullPointerException before any validation runs. */ + if (values == null) { + throw new NullArgumentException(LocalizedFormats.INPUT_ARRAY); + } + return evaluate(values, weights, 0, values.length); + } + + /** + * Returns the variance of the entries in the specified portion of + * the input array, using the precomputed mean value. Returns + * <code>Double.NaN</code> if the designated subarray is empty. + * <p> + * See {@link Variance} for details on the computing algorithm.</p> + * <p> + * The formula used assumes that the supplied mean value is the arithmetic + * mean of the sample data, not a known population parameter. This method + * is supplied only to save computation when the mean has already been + * computed.</p> + * <p> + * Returns 0 for a single-value (i.e. 
length = 1) sample.</p> + * <p> + * Throws <code>IllegalArgumentException</code> if the array is null.</p> + * <p> + * Does not change the internal state of the statistic.</p> + * + * @param values the input array + * @param mean the precomputed mean value + * @param begin index of the first array element to include + * @param length the number of elements to include + * @return the variance of the values or Double.NaN if length = 0 + * @throws IllegalArgumentException if the array is null or the array index + * parameters are not valid + */ + public double evaluate(final double[] values, final double mean, + final int begin, final int length) { + + double var = Double.NaN; + + if (test(values, begin, length)) { + if (length == 1) { + var = 0.0; + } else if (length > 1) { + /* accum collects the squared deviations from the supplied mean; accum2 collects the plain deviations */ double accum = 0.0; + double dev = 0.0; + double accum2 = 0.0; + for (int i = begin; i < begin + length; i++) { + dev = values[i] - mean; + accum += dev * dev; + accum2 += dev; + } + /* widen to double so the divisions below are floating point */ double len = length; + /* accum2 is zero when mean is the exact sample mean; subtracting accum2^2/len corrects for any error in it */ if (isBiasCorrected) { + var = (accum - (accum2 * accum2 / len)) / (len - 1.0); + } else { + var = (accum - (accum2 * accum2 / len)) / len; + } + } + } + return var; + } + + /** + * Returns the variance of the entries in the input array, using the + * precomputed mean value. Returns <code>Double.NaN</code> if the array + * is empty. + * <p> + * See {@link Variance} for details on the computing algorithm.</p> + * <p> + * If <code>isBiasCorrected</code> is <code>true</code> the formula used + * assumes that the supplied mean value is the arithmetic mean of the + * sample data, not a known population parameter. 
If the mean is a known + * population parameter, or if the "population" version of the variance is + * desired, set <code>isBiasCorrected</code> to <code>false</code> before + * invoking this method.</p> + * <p> + * Returns 0 for a single-value (i.e. 
length = 1) sample.</p> + * <p> + * Throws <code>IllegalArgumentException</code> if the array is null.</p> + * <p> + * Does not change the internal state of the statistic.</p> + * + * @param values the input array + * @param mean the precomputed mean value + * @return the variance of the values or Double.NaN if the array is empty + * @throws IllegalArgumentException if the array is null + */ + public double evaluate(final double[] values, final double mean) { + /* Reject a null array explicitly, as evaluate(double[]) does; otherwise values.length below fails with a NullPointerException instead of the documented exception. */ + if (values == null) { + throw new NullArgumentException(LocalizedFormats.INPUT_ARRAY); + } + return evaluate(values, mean, 0, values.length); + } + + /** + * Returns the weighted variance of the entries in the specified portion of + * the input array, using the precomputed weighted mean value. Returns + * <code>Double.NaN</code> if the designated subarray is empty. + * <p> + * Uses the formula <pre> + * Σ(weights[i]*(values[i] - mean)<sup>2</sup>)/(Σ(weights[i]) - 1) + * </pre></p> + * <p> + * The formula used assumes that the supplied mean value is the weighted arithmetic + * mean of the sample data, not a known population parameter. This method + * is supplied only to save computation when the mean has already been + * computed.</p> + * <p> + * This formula will not return the same result as the unweighted variance when all + * weights are equal, unless all weights are equal to 1. The formula assumes that + * weights are to be treated as "expansion values," as will be the case if for example + * the weights represent frequency counts. 
To normalize weights so that the denominator + * in the variance computation equals the length of the input vector minus one, use <pre> + * <code>evaluate(values, MathUtils.normalizeArray(weights, values.length), mean); </code> + * </pre> + * <p> + * Returns 0 for a single-value (i.e. 
length = 1) sample.</p> + * <p> + * Throws <code>IllegalArgumentException</code> if any of the following are true: + * <ul><li>the values array is null</li> + * <li>the weights array is null</li> + * <li>the weights array does not have the same length as the values array</li> + * <li>the weights array contains one or more infinite values</li> + * <li>the weights array contains one or more NaN values</li> + * <li>the weights array contains negative values</li> + * <li>the start and length arguments do not determine a valid array</li> + * </ul></p> + * <p> + * Does not change the internal state of the statistic.</p> + * + * @param values the input array + * @param weights the weights array + * @param mean the precomputed weighted mean value + * @param begin index of the first array element to include + * @param length the number of elements to include + * @return the variance of the values or Double.NaN if length = 0 + * @throws IllegalArgumentException if the parameters are not valid + * @since 2.1 + */ + public double evaluate(final double[] values, final double[] weights, + final double mean, final int begin, final int length) { + + double var = Double.NaN; + + if (test(values, weights, begin, length)) { + if (length == 1) { + var = 0.0; + } else if (length > 1) { + /* accum: weighted sum of squared deviations; accum2: weighted sum of deviations, used as a correction term */ + double accum = 0.0; + double dev = 0.0; + double accum2 = 0.0; + for (int i = begin; i < begin + length; i++) { + dev = values[i] - mean; + accum += weights[i] * (dev * dev); + accum2 += weights[i] * dev; + } + + /* Sum only the weights of the selected sub-array. Summing the whole weights array would inflate the denominator whenever begin/length select a strict subset. 
*/ + double sumWts = 0; + for (int i = begin; i < begin + length; i++) { + sumWts += weights[i]; + } + + if (isBiasCorrected) { + var = (accum - (accum2 * accum2 / sumWts)) / (sumWts - 1.0); + } else { + var = (accum - (accum2 * accum2 / sumWts)) / sumWts; + } + } + } + return var; + } + + /** + * <p>Returns the weighted variance of the values in the input array, using + * the precomputed weighted mean value.</p> + * <p> + * Uses the formula <pre> + * Σ(weights[i]*(values[i] - 
mean)<sup>2</sup>)/(Σ(weights[i]) - 1) + * </pre></p> + * <p> + * The formula used assumes that the supplied mean value is the weighted arithmetic + * mean of the sample data, not a known population parameter. This method + * is supplied only to save computation when the mean has already been + * computed.</p> + * <p> + * This formula will not return the same result as the unweighted variance when all + * weights are equal, unless all weights are equal to 1. The formula assumes that + * weights are to be treated as "expansion values," as will be the case if for example + * the weights represent frequency counts. To normalize weights so that the denominator + * in the variance computation equals the length of the input vector minus one, use <pre> + * <code>evaluate(values, MathUtils.normalizeArray(weights, values.length), mean); </code> + * </pre> + * <p> + * Returns 0 for a single-value (i.e. length = 1) sample.</p> + * <p> + * Throws <code>IllegalArgumentException</code> if any of the following are true: + * <ul><li>the values array is null</li> + * <li>the weights array is null</li> + * <li>the weights array does not have the same length as the values array</li> + * <li>the weights array contains one or more infinite values</li> + * <li>the weights array contains one or more NaN values</li> + * <li>the weights array contains negative values</li> + * </ul></p> + * <p> + * Does not change the internal state of the statistic.</p> + * + * @param values the input array + * @param weights the weights array + * @param mean the precomputed weighted mean value + * @return the variance of the values or Double.NaN if length = 0 + * @throws IllegalArgumentException if the parameters are not valid + * @since 2.1 + */ + public double evaluate(final double[] values, final double[] weights, final double mean) { + /* Reject a null array explicitly, as evaluate(double[]) does; otherwise values.length below fails with a NullPointerException before any validation runs. 
*/ + if (values == null) { + throw new NullArgumentException(LocalizedFormats.INPUT_ARRAY); + } + return evaluate(values, weights, mean, 0, values.length); + } + + /** + * @return Returns the isBiasCorrected. 
+ */ + public boolean isBiasCorrected() { + return isBiasCorrected; + } + + /** + * @param biasCorrected The isBiasCorrected to set. + */ + public void setBiasCorrected(boolean biasCorrected) { + this.isBiasCorrected = biasCorrected; + } + + /** + * {@inheritDoc} + */ + @Override + public Variance copy() { + Variance result = new Variance(); + copy(this, result); + return result; + } + + /** + * Copies source to dest. + * <p>Neither source nor dest can be null.</p> + * <p>dest receives its own copy of the source's moment, together with the + * source's <code>isBiasCorrected</code> and <code>incMoment</code> settings. + * If <code>incMoment</code> is false, dest will therefore not update that + * copied moment on {@link #increment(double)}.</p> + * + * @param source Variance to copy + * @param dest Variance to copy to + * @throws NullArgumentException if either source or dest is null + */ + public static void copy(Variance source, Variance dest) { + if (source == null || + dest == null) { + throw new NullArgumentException(); + } + dest.setData(source.getDataRef()); + dest.moment = source.moment.copy(); + dest.isBiasCorrected = source.isBiasCorrected; + dest.incMoment = source.incMoment; + } +} diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/moment/VectorialCovariance.java b/src/main/java/org/apache/commons/math/stat/descriptive/moment/VectorialCovariance.java new file mode 100644 index 0000000..71afc68 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/moment/VectorialCovariance.java @@ -0,0 +1,152 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.descriptive.moment; + +import java.io.Serializable; +import java.util.Arrays; + +import org.apache.commons.math.DimensionMismatchException; +import org.apache.commons.math.linear.MatrixUtils; +import org.apache.commons.math.linear.RealMatrix; + +/** + * Returns the covariance matrix of the available vectors. + * @since 1.2 + * @version $Revision: 922714 $ $Date: 2010-03-14 02:35:14 +0100 (dim. 14 mars 2010) $ + */ +public class VectorialCovariance implements Serializable { + + /** Serializable version identifier */ + private static final long serialVersionUID = 4118372414238930270L; + + /** Sums for each component. */ + private final double[] sums; + + /** Sums of products for each component. */ + private final double[] productsSums; + + /** Indicator for bias correction. */ + private final boolean isBiasCorrected; + + /** Number of vectors in the sample. */ + private long n; + + /** Constructs a VectorialCovariance. + * @param dimension vectors dimension + * @param isBiasCorrected if true, computed the unbiased sample covariance, + * otherwise computes the biased population covariance + */ + public VectorialCovariance(int dimension, boolean isBiasCorrected) { + sums = new double[dimension]; + productsSums = new double[dimension * (dimension + 1) / 2]; + n = 0; + this.isBiasCorrected = isBiasCorrected; + } + + /** + * Add a new vector to the sample. 
+ * @param v vector to add + * @exception DimensionMismatchException if the vector does not have the right dimension + */ + public void increment(double[] v) throws DimensionMismatchException { + if (v.length != sums.length) { + throw new DimensionMismatchException(v.length, sums.length); + } + int k = 0; + for (int i = 0; i < v.length; ++i) { + sums[i] += v[i]; + for (int j = 0; j <= i; ++j) { + productsSums[k++] += v[i] * v[j]; + } + } + n++; + } + + /** + * Get the covariance matrix. + * @return covariance matrix + */ + public RealMatrix getResult() { + + int dimension = sums.length; + RealMatrix result = MatrixUtils.createRealMatrix(dimension, dimension); + + if (n > 1) { + double c = 1.0 / (n * (isBiasCorrected ? (n - 1) : n)); + int k = 0; + for (int i = 0; i < dimension; ++i) { + for (int j = 0; j <= i; ++j) { + double e = c * (n * productsSums[k++] - sums[i] * sums[j]); + result.setEntry(i, j, e); + result.setEntry(j, i, e); + } + } + } + + return result; + + } + + /** + * Get the number of vectors in the sample. + * @return number of vectors in the sample + */ + public long getN() { + return n; + } + + /** + * Clears the internal state of the Statistic + */ + public void clear() { + n = 0; + Arrays.fill(sums, 0.0); + Arrays.fill(productsSums, 0.0); + } + + /** {@inheritDoc} */ + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + (isBiasCorrected ? 
1231 : 1237); + result = prime * result + (int) (n ^ (n >>> 32)); + result = prime * result + Arrays.hashCode(productsSums); + result = prime * result + Arrays.hashCode(sums); + return result; + } + + /** {@inheritDoc} */ + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (!(obj instanceof VectorialCovariance)) + return false; + VectorialCovariance other = (VectorialCovariance) obj; + if (isBiasCorrected != other.isBiasCorrected) + return false; + if (n != other.n) + return false; + if (!Arrays.equals(productsSums, other.productsSums)) + return false; + if (!Arrays.equals(sums, other.sums)) + return false; + return true; + } + +} diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/moment/VectorialMean.java b/src/main/java/org/apache/commons/math/stat/descriptive/moment/VectorialMean.java new file mode 100644 index 0000000..ef57657 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/moment/VectorialMean.java @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.math.stat.descriptive.moment; + +import java.io.Serializable; +import java.util.Arrays; + +import org.apache.commons.math.DimensionMismatchException; + +/** + * Returns the arithmetic mean of the available vectors. + * @since 1.2 + * @version $Revision: 922714 $ $Date: 2010-03-14 02:35:14 +0100 (dim. 14 mars 2010) $ + */ +public class VectorialMean implements Serializable { + + /** Serializable version identifier */ + private static final long serialVersionUID = 8223009086481006892L; + + /** Means for each component. */ + private final Mean[] means; + + /** Constructs a VectorialMean. + * @param dimension vectors dimension + */ + public VectorialMean(int dimension) { + means = new Mean[dimension]; + for (int i = 0; i < dimension; ++i) { + means[i] = new Mean(); + } + } + + /** + * Add a new vector to the sample. + * @param v vector to add + * @exception DimensionMismatchException if the vector does not have the right dimension + */ + public void increment(double[] v) throws DimensionMismatchException { + if (v.length != means.length) { + throw new DimensionMismatchException(v.length, means.length); + } + for (int i = 0; i < v.length; ++i) { + means[i].increment(v[i]); + } + } + + /** + * Get the mean vector. + * @return mean vector + */ + public double[] getResult() { + double[] result = new double[means.length]; + for (int i = 0; i < result.length; ++i) { + result[i] = means[i].getResult(); + } + return result; + } + + /** + * Get the number of vectors in the sample. + * @return number of vectors in the sample + */ + public long getN() { + return (means.length == 0) ? 
0 : means[0].getN(); + } + + /** {@inheritDoc} */ + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + Arrays.hashCode(means); + return result; + } + + /** {@inheritDoc} */ + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (!(obj instanceof VectorialMean)) + return false; + VectorialMean other = (VectorialMean) obj; + if (!Arrays.equals(means, other.means)) + return false; + return true; + } + +} diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/moment/package.html b/src/main/java/org/apache/commons/math/stat/descriptive/moment/package.html new file mode 100644 index 0000000..e024095 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/moment/package.html @@ -0,0 +1,20 @@ +<html> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + --> + <!-- $Revision: 480440 $ $Date: 2006-11-29 08:14:12 +0100 (mer. 29 nov. 
2006) $ --> + <body>Summary statistics based on moments.</body> +</html> diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/package.html b/src/main/java/org/apache/commons/math/stat/descriptive/package.html new file mode 100644 index 0000000..981fda4 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/package.html @@ -0,0 +1,41 @@ +<html> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + --> + <!-- $Revision: 811685 $ $Date: 2009-09-05 19:36:48 +0200 (sam. 05 sept. 2009) $ --> + <body> + Generic univariate summary statistic objects. 
+ + <h3>UnivariateStatistic API Usage Examples:</h3> + <h4>UnivariateStatistic:</h4> + <code>/* evaluation approach */<br/> double[] values = new double[] { 1, 2, + 3, 4, 5 };<br/> <span style="font-weight: bold;">UnivariateStatistic stat + = new Mean();</span><br/> System.out.println("mean = " + <span + style="font-weight: bold;">stat.evaluate(values)</span>);<br/> </code> + <h4>StorelessUnivariateStatistic:</h4> + <code>/* incremental approach */<br/> double[] values = new double[] { 1, 2, + 3, 4, 5 };<br/> <span style="font-weight: bold;"> + StorelessUnivariateStatistic stat = new Mean();</span><br/> + System.out.println("mean before adding a value is NaN = " + <span + style="font-weight: bold;">stat.getResult()</span>);<br/> for (int i = 0; + i &lt; values.length; i++) {<br/> <span + style="font-weight: bold;">stat.increment(values[i]);</span><br/> + System.out.println("current mean = " + <span style="font-weight: bold;"> + stat.getResult()</span>);<br/> }<br/> <span style="font-weight: bold;"> + stat.clear();</span><br/> System.out.println("mean after clear is NaN = " + + <span style="font-weight: bold;">stat.getResult()</span>);</code> + </body> +</html> diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/rank/Max.java b/src/main/java/org/apache/commons/math/stat/descriptive/rank/Max.java new file mode 100644 index 0000000..1b15750 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/rank/Max.java @@ -0,0 +1,163 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.descriptive.rank; + +import java.io.Serializable; + +import org.apache.commons.math.stat.descriptive.AbstractStorelessUnivariateStatistic; + +/** + * Returns the maximum of the available values. + * <p> + * <ul> + * <li>The result is <code>NaN</code> iff all values are <code>NaN</code> + * (i.e. <code>NaN</code> values have no impact on the value of the statistic).</li> + * <li>If any of the values equals <code>Double.POSITIVE_INFINITY</code>, + * the result is <code>Double.POSITIVE_INFINITY.</code></li> + * </ul></p> +* <p> + * <strong>Note that this implementation is not synchronized.</strong> If + * multiple threads access an instance of this class concurrently, and at least + * one of the threads invokes the <code>increment()</code> or + * <code>clear()</code> method, it must be synchronized externally.</p> + * + * @version $Revision: 1006299 $ $Date: 2010-10-10 16:47:17 +0200 (dim. 10 oct. 
2010) $ + */ +public class Max extends AbstractStorelessUnivariateStatistic implements Serializable { + + /** Serializable version identifier */ + private static final long serialVersionUID = -5593383832225844641L; + + /** Number of values that have been added */ + private long n; + + /** Current value of the statistic */ + private double value; + + /** + * Create a Max instance + */ + public Max() { + n = 0; + value = Double.NaN; + } + + /** + * Copy constructor, creates a new {@code Max} identical + * to the {@code original} + * + * @param original the {@code Max} instance to copy + */ + public Max(Max original) { + copy(original, this); + } + + /** + * {@inheritDoc} + */ + @Override + public void increment(final double d) { + if (d > value || Double.isNaN(value)) { + value = d; + } + n++; + } + + /** + * {@inheritDoc} + */ + @Override + public void clear() { + value = Double.NaN; + n = 0; + } + + /** + * {@inheritDoc} + */ + @Override + public double getResult() { + return value; + } + + /** + * {@inheritDoc} + */ + public long getN() { + return n; + } + + /** + * Returns the maximum of the entries in the specified portion of + * the input array, or <code>Double.NaN</code> if the designated subarray + * is empty. + * <p> + * Throws <code>IllegalArgumentException</code> if the array is null or + * the array index parameters are not valid.</p> + * <p> + * <ul> + * <li>The result is <code>NaN</code> iff all values are <code>NaN</code> + * (i.e. 
<code>NaN</code> values have no impact on the value of the statistic).</li> + * <li>If any of the values equals <code>Double.POSITIVE_INFINITY</code>, + * the result is <code>Double.POSITIVE_INFINITY.</code></li> + * </ul></p> + * + * @param values the input array + * @param begin index of the first array element to include + * @param length the number of elements to include + * @return the maximum of the values or Double.NaN if length = 0 + * @throws IllegalArgumentException if the array is null or the array index + * parameters are not valid + */ + @Override + public double evaluate(final double[] values, final int begin, final int length) { + double max = Double.NaN; + if (test(values, begin, length)) { + max = values[begin]; + for (int i = begin; i < begin + length; i++) { + if (!Double.isNaN(values[i])) { + max = (max > values[i]) ? max : values[i]; + } + } + } + return max; + } + + /** + * {@inheritDoc} + */ + @Override + public Max copy() { + Max result = new Max(); + copy(this, result); + return result; + } + + /** + * Copies source to dest. + * <p>Neither source nor dest can be null.</p> + * + * @param source Max to copy + * @param dest Max to copy to + * @throws NullPointerException if either source or dest is null + */ + public static void copy(Max source, Max dest) { + dest.setData(source.getDataRef()); + dest.n = source.n; + dest.value = source.value; + } +} diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/rank/Median.java b/src/main/java/org/apache/commons/math/stat/descriptive/rank/Median.java new file mode 100644 index 0000000..6e13b13 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/rank/Median.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.descriptive.rank; + +import java.io.Serializable; + + +/** + * Returns the median of the available values. This is the same as the 50th percentile. + * See {@link Percentile} for a description of the algorithm used. + * <p> + * <strong>Note that this implementation is not synchronized.</strong> If + * multiple threads access an instance of this class concurrently, and at least + * one of the threads invokes the <code>increment()</code> or + * <code>clear()</code> method, it must be synchronized externally.</p> + * + * @version $Revision: 811685 $ $Date: 2009-09-05 19:36:48 +0200 (sam. 05 sept. 2009) $ + */ +public class Median extends Percentile implements Serializable { + + /** Serializable version identifier */ + private static final long serialVersionUID = -3961477041290915687L; + + /** + * Default constructor. 
+ */ + public Median() { + super(50.0); + } + + /** + * Copy constructor, creates a new {@code Median} identical + * to the {@code original} + * + * @param original the {@code Median} instance to copy + */ + public Median(Median original) { + super(original); + } + +} diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/rank/Min.java b/src/main/java/org/apache/commons/math/stat/descriptive/rank/Min.java new file mode 100644 index 0000000..1c264c6 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/rank/Min.java @@ -0,0 +1,163 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.descriptive.rank; + +import java.io.Serializable; + +import org.apache.commons.math.stat.descriptive.AbstractStorelessUnivariateStatistic; + +/** + * Returns the minimum of the available values. + * <p> + * <ul> + * <li>The result is <code>NaN</code> iff all values are <code>NaN</code> + * (i.e. 
<code>NaN</code> values have no impact on the value of the statistic).</li> + * <li>If any of the values equals <code>Double.NEGATIVE_INFINITY</code>, + * the result is <code>Double.NEGATIVE_INFINITY.</code></li> + * </ul></p> + * <p> + * <strong>Note that this implementation is not synchronized.</strong> If + * multiple threads access an instance of this class concurrently, and at least + * one of the threads invokes the <code>increment()</code> or + * <code>clear()</code> method, it must be synchronized externally.</p> + * + * @version $Revision: 1006299 $ $Date: 2010-10-10 16:47:17 +0200 (dim. 10 oct. 2010) $ + */ +public class Min extends AbstractStorelessUnivariateStatistic implements Serializable { + + /** Serializable version identifier */ + private static final long serialVersionUID = -2941995784909003131L; + + /**Number of values that have been added */ + private long n; + + /**Current value of the statistic */ + private double value; + + /** + * Create a Min instance + */ + public Min() { + n = 0; + value = Double.NaN; + } + + /** + * Copy constructor, creates a new {@code Min} identical + * to the {@code original} + * + * @param original the {@code Min} instance to copy + */ + public Min(Min original) { + copy(original, this); + } + + /** + * {@inheritDoc} + */ + @Override + public void increment(final double d) { + if (d < value || Double.isNaN(value)) { + value = d; + } + n++; + } + + /** + * {@inheritDoc} + */ + @Override + public void clear() { + value = Double.NaN; + n = 0; + } + + /** + * {@inheritDoc} + */ + @Override + public double getResult() { + return value; + } + + /** + * {@inheritDoc} + */ + public long getN() { + return n; + } + + /** + * Returns the minimum of the entries in the specified portion of + * the input array, or <code>Double.NaN</code> if the designated subarray + * is empty. 
+ * <p> + * Throws <code>IllegalArgumentException</code> if the array is null or + * the array index parameters are not valid.</p> + * <p> + * <ul> + * <li>The result is <code>NaN</code> iff all values are <code>NaN</code> + * (i.e. <code>NaN</code> values have no impact on the value of the statistic).</li> + * <li>If any of the values equals <code>Double.NEGATIVE_INFINITY</code>, + * the result is <code>Double.NEGATIVE_INFINITY.</code></li> + * </ul> </p> + * + * @param values the input array + * @param begin index of the first array element to include + * @param length the number of elements to include + * @return the minimum of the values or Double.NaN if length = 0 + * @throws IllegalArgumentException if the array is null or the array index + * parameters are not valid + */ + @Override + public double evaluate(final double[] values,final int begin, final int length) { + double min = Double.NaN; + if (test(values, begin, length)) { + min = values[begin]; + for (int i = begin; i < begin + length; i++) { + if (!Double.isNaN(values[i])) { + min = (min < values[i]) ? min : values[i]; + } + } + } + return min; + } + + /** + * {@inheritDoc} + */ + @Override + public Min copy() { + Min result = new Min(); + copy(this, result); + return result; + } + + /** + * Copies source to dest. 
+ * <p>Neither source nor dest can be null.</p> + * + * @param source Min to copy + * @param dest Min to copy to + * @throws NullPointerException if either source or dest is null + */ + public static void copy(Min source, Min dest) { + dest.setData(source.getDataRef()); + dest.n = source.n; + dest.value = source.value; + } +} diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/rank/Percentile.java b/src/main/java/org/apache/commons/math/stat/descriptive/rank/Percentile.java new file mode 100644 index 0000000..0c8a90f --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/rank/Percentile.java @@ -0,0 +1,497 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.descriptive.rank; + +import java.io.Serializable; +import java.util.Arrays; + +import org.apache.commons.math.MathRuntimeException; +import org.apache.commons.math.exception.util.LocalizedFormats; +import org.apache.commons.math.stat.descriptive.AbstractUnivariateStatistic; +import org.apache.commons.math.util.FastMath; + +/** + * Provides percentile computation. + * <p> + * There are several commonly used methods for estimating percentiles (a.k.a. + * quantiles) based on sample data. 
For large samples, the different methods + * agree closely, but when sample sizes are small, different methods will give + * significantly different results. The algorithm implemented here works as follows: + * <ol> + * <li>Let <code>n</code> be the length of the (sorted) array and + * <code>0 < p <= 100</code> be the desired percentile.</li> + * <li>If <code> n = 1 </code> return the unique array element (regardless of + * the value of <code>p</code>); otherwise </li> + * <li>Compute the estimated percentile position + * <code> pos = p * (n + 1) / 100</code> and the difference, <code>d</code> + * between <code>pos</code> and <code>floor(pos)</code> (i.e. the fractional + * part of <code>pos</code>). If <code>pos >= n</code> return the largest + * element in the array; otherwise</li> + * <li>Let <code>lower</code> be the element in position + * <code>floor(pos)</code> in the array and let <code>upper</code> be the + * next element in the array. Return <code>lower + d * (upper - lower)</code> + * </li> + * </ol></p> + * <p> + * To compute percentiles, the data must be at least partially ordered. Input + * arrays are copied and recursively partitioned using an ordering definition. + * The ordering used by <code>Arrays.sort(double[])</code> is the one determined + * by {@link java.lang.Double#compareTo(Double)}. This ordering makes + * <code>Double.NaN</code> larger than any other value (including + * <code>Double.POSITIVE_INFINITY</code>). 
Therefore, for example, the median + * (50th percentile) of + * <code>{0, 1, 2, 3, 4, Double.NaN}</code> evaluates to <code>2.5.</code></p> + * <p> + * Since percentile estimation usually involves interpolation between array + * elements, arrays containing <code>NaN</code> or infinite values will often + * result in <code>NaN</code> or infinite values returned.</p> + * <p> + * Since 2.2, Percentile implementation uses only selection instead of complete + * sorting and caches selection algorithm state between calls to the various + * {@code evaluate} methods when several percentiles are to be computed on the same data. + * This greatly improves efficiency, both for single percentile and multiple + * percentiles computations. However, it also induces a need to be sure the data + * at one call to {@code evaluate} is the same as the data with the cached algorithm + * state from the previous calls. Percentile does this by checking the array reference + * itself and a checksum of its content by default. If the user already knows he calls + * {@code evaluate} on an immutable array, he can save the checking time by calling the + * {@code evaluate} methods that do <em>not</em> take an array argument (such as {@code evaluate(double)}, which evaluates the stored data). + * </p> + * <p> + * <strong>Note that this implementation is not synchronized.</strong> If + * multiple threads access an instance of this class concurrently, and at least + * one of the threads invokes the <code>increment()</code> or + * <code>clear()</code> method, it must be synchronized externally.</p> + * + * @version $Revision: 1006299 $ $Date: 2010-10-10 16:47:17 +0200 (dim. 10 oct. 2010) $ + */ +public class Percentile extends AbstractUnivariateStatistic implements Serializable { + + /** Serializable version identifier */ + private static final long serialVersionUID = -8091216485095130416L; + + /** Minimum size under which we use a simple insertion sort rather than Hoare's select. 
/** Minimum size under which we use a simple insertion sort rather than Hoare's select. */
private static final int MIN_SELECT_SIZE = 15;

/** Maximum number of partitioning pivots cached (each level double the number of pivots). */
private static final int MAX_CACHED_LEVELS = 10;

/**
 * Percentile computed by the {@code evaluate} overloads that take no
 * quantile argument.
 */
private double quantile = 0.0;

/** Cached pivots; {@code null} when no data array is stored. */
private int[] cachedPivots;

/**
 * Constructs a Percentile with a default quantile
 * value of 50.0.
 */
public Percentile() {
    this(50.0);
}

/**
 * Constructs a Percentile with the specific quantile value.
 * The quantile is applied through {@code setQuantile} and the pivot cache
 * starts out empty.
 *
 * @param p the quantile
 * @throws IllegalArgumentException if p is not greater than 0 and less
 * than or equal to 100
 */
public Percentile(final double p) {
    setQuantile(p);
    cachedPivots = null;
}

/**
 * Copy constructor, creates a new {@code Percentile} identical
 * to the {@code original}
 *
 * @param original the {@code Percentile} instance to copy
 */
public Percentile(Percentile original) {
    copy(original, this);
}

/**
 * {@inheritDoc}
 * <p>
 * The pivot cache is reset first: dropped for a null array, otherwise
 * reallocated with every slot set to -1.</p>
 */
@Override
public void setData(final double[] values) {
    if (values != null) {
        final int[] freshPivots = new int[(0x1 << MAX_CACHED_LEVELS) - 1];
        Arrays.fill(freshPivots, -1);
        cachedPivots = freshPivots;
    } else {
        cachedPivots = null;
    }
    super.setData(values);
}

/**
 * {@inheritDoc}
 * <p>
 * The pivot cache is reset first: dropped for a null array, otherwise
 * reallocated with every slot set to -1.</p>
 */
@Override
public void setData(final double[] values, final int begin, final int length) {
    if (values != null) {
        final int[] freshPivots = new int[(0x1 << MAX_CACHED_LEVELS) - 1];
        Arrays.fill(freshPivots, -1);
        cachedPivots = freshPivots;
    } else {
        cachedPivots = null;
    }
    super.setData(values, begin, length);
}
+ * <p> + * The stored array is the one which was set by previous calls to + * </p> + * @param p the percentile value to compute + * @return the value of the statistic applied to the stored data + */ + public double evaluate(final double p) { + return evaluate(getDataRef(), p); + } + + /** + * Returns an estimate of the <code>p</code>th percentile of the values + * in the <code>values</code> array. + * <p> + * Calls to this method do not modify the internal <code>quantile</code> + * state of this statistic.</p> + * <p> + * <ul> + * <li>Returns <code>Double.NaN</code> if <code>values</code> has length + * <code>0</code></li> + * <li>Returns (for any value of <code>p</code>) <code>values[0]</code> + * if <code>values</code> has length <code>1</code></li> + * <li>Throws <code>IllegalArgumentException</code> if <code>values</code> + * is null or p is not a valid quantile value (p must be greater than 0 + * and less than or equal to 100) </li> + * </ul></p> + * <p> + * See {@link Percentile} for a description of the percentile estimation + * algorithm used.</p> + * + * @param values input array of values + * @param p the percentile value to compute + * @return the percentile value or Double.NaN if the array is empty + * @throws IllegalArgumentException if <code>values</code> is null + * or p is invalid + */ + public double evaluate(final double[] values, final double p) { + test(values, 0, 0); + return evaluate(values, 0, values.length, p); + } + + /** + * Returns an estimate of the <code>quantile</code>th percentile of the + * designated values in the <code>values</code> array. The quantile + * estimated is determined by the <code>quantile</code> property. 
+ * <p> + * <ul> + * <li>Returns <code>Double.NaN</code> if <code>length = 0</code></li> + * <li>Returns (for any value of <code>quantile</code>) + * <code>values[begin]</code> if <code>length = 1 </code></li> + * <li>Throws <code>IllegalArgumentException</code> if <code>values</code> + * is null, or <code>start</code> or <code>length</code> + * is invalid</li> + * </ul></p> + * <p> + * See {@link Percentile} for a description of the percentile estimation + * algorithm used.</p> + * + * @param values the input array + * @param start index of the first array element to include + * @param length the number of elements to include + * @return the percentile value + * @throws IllegalArgumentException if the parameters are not valid + * + */ + @Override + public double evaluate( final double[] values, final int start, final int length) { + return evaluate(values, start, length, quantile); + } + + /** + * Returns an estimate of the <code>p</code>th percentile of the values + * in the <code>values</code> array, starting with the element in (0-based) + * position <code>begin</code> in the array and including <code>length</code> + * values. 
+ * <p> + * Calls to this method do not modify the internal <code>quantile</code> + * state of this statistic.</p> + * <p> + * <ul> + * <li>Returns <code>Double.NaN</code> if <code>length = 0</code></li> + * <li>Returns (for any value of <code>p</code>) <code>values[begin]</code> + * if <code>length = 1 </code></li> + * <li>Throws <code>IllegalArgumentException</code> if <code>values</code> + * is null , <code>begin</code> or <code>length</code> is invalid, or + * <code>p</code> is not a valid quantile value (p must be greater than 0 + * and less than or equal to 100)</li> + * </ul></p> + * <p> + * See {@link Percentile} for a description of the percentile estimation + * algorithm used.</p> + * + * @param values array of input values + * @param p the percentile to compute + * @param begin the first (0-based) element to include in the computation + * @param length the number of array elements to include + * @return the percentile value + * @throws IllegalArgumentException if the parameters are not valid or the + * input array is null + */ + public double evaluate(final double[] values, final int begin, + final int length, final double p) { + + test(values, begin, length); + + if ((p > 100) || (p <= 0)) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.OUT_OF_BOUNDS_QUANTILE_VALUE, p); + } + if (length == 0) { + return Double.NaN; + } + if (length == 1) { + return values[begin]; // always return single value for n = 1 + } + double n = length; + double pos = p * (n + 1) / 100; + double fpos = FastMath.floor(pos); + int intPos = (int) fpos; + double dif = pos - fpos; + double[] work; + int[] pivotsHeap; + if (values == getDataRef()) { + work = getDataRef(); + pivotsHeap = cachedPivots; + } else { + work = new double[length]; + System.arraycopy(values, begin, work, 0, length); + pivotsHeap = new int[(0x1 << MAX_CACHED_LEVELS) - 1]; + Arrays.fill(pivotsHeap, -1); + } + + if (pos < 1) { + return select(work, pivotsHeap, 0); + } + if (pos >= 
n) { + return select(work, pivotsHeap, length - 1); + } + double lower = select(work, pivotsHeap, intPos - 1); + double upper = select(work, pivotsHeap, intPos); + return lower + dif * (upper - lower); + } + + /** + * Select the k<sup>th</sup> smallest element from work array + * @param work work array (will be reorganized during the call) + * @param pivotsHeap set of pivot index corresponding to elements that + * are already at their sorted location, stored as an implicit heap + * (i.e. a sorted binary tree stored in a flat array, where the + * children of a node at index n are at indices 2n+1 for the left + * child and 2n+2 for the right child, with 0-based indices) + * @param k index of the desired element + * @return k<sup>th</sup> smallest element + */ + private double select(final double[] work, final int[] pivotsHeap, final int k) { + + int begin = 0; + int end = work.length; + int node = 0; + + while (end - begin > MIN_SELECT_SIZE) { + + final int pivot; + if ((node < pivotsHeap.length) && (pivotsHeap[node] >= 0)) { + // the pivot has already been found in a previous call + // and the array has already been partitioned around it + pivot = pivotsHeap[node]; + } else { + // select a pivot and partition work array around it + pivot = partition(work, begin, end, medianOf3(work, begin, end)); + if (node < pivotsHeap.length) { + pivotsHeap[node] = pivot; + } + } + + if (k == pivot) { + // the pivot was exactly the element we wanted + return work[k]; + } else if (k < pivot) { + // the element is in the left partition + end = pivot; + node = Math.min(2 * node + 1, pivotsHeap.length); // the min is here to avoid integer overflow + } else { + // the element is in the right partition + begin = pivot + 1; + node = Math.min(2 * node + 2, pivotsHeap.length); // the min is here to avoid integer overflow + } + + } + + // the element is somewhere in the small sub-array + // sort the sub-array using insertion sort + insertionSort(work, begin, end); + return work[k]; + + } + 
+ /** Select a pivot index as the median of three + * @param work data array + * @param begin index of the first element of the slice + * @param end index after the last element of the slice + * @return the index of the median element chosen between the + * first, the middle and the last element of the array slice + */ + int medianOf3(final double[] work, final int begin, final int end) { + + final int inclusiveEnd = end - 1; + final int middle = begin + (inclusiveEnd - begin) / 2; + final double wBegin = work[begin]; + final double wMiddle = work[middle]; + final double wEnd = work[inclusiveEnd]; + + if (wBegin < wMiddle) { + if (wMiddle < wEnd) { + return middle; + } else { + return (wBegin < wEnd) ? inclusiveEnd : begin; + } + } else { + if (wBegin < wEnd) { + return begin; + } else { + return (wMiddle < wEnd) ? inclusiveEnd : middle; + } + } + + } + + /** + * Partition an array slice around a pivot + * <p> + * Partitioning exchanges array elements such that all elements + * smaller than pivot are before it and all elements larger than + * pivot are after it + * </p> + * @param work data array + * @param begin index of the first element of the slice + * @param end index after the last element of the slice + * @param pivot initial index of the pivot + * @return index of the pivot after partition + */ + private int partition(final double[] work, final int begin, final int end, final int pivot) { + + final double value = work[pivot]; + work[pivot] = work[begin]; + + int i = begin + 1; + int j = end - 1; + while (i < j) { + while ((i < j) && (work[j] >= value)) { + --j; + } + while ((i < j) && (work[i] <= value)) { + ++i; + } + + if (i < j) { + final double tmp = work[i]; + work[i++] = work[j]; + work[j--] = tmp; + } + } + + if ((i >= end) || (work[i] > value)) { + --i; + } + work[begin] = work[i]; + work[i] = value; + return i; + + } + + /** + * Sort in place a (small) array slice using insertion sort + * @param work array to sort + * @param begin index of the 
first element of the slice to sort + * @param end index after the last element of the slice to sort + */ + private void insertionSort(final double[] work, final int begin, final int end) { + for (int j = begin + 1; j < end; j++) { + final double saved = work[j]; + int i = j - 1; + while ((i >= begin) && (saved < work[i])) { + work[i + 1] = work[i]; + i--; + } + work[i + 1] = saved; + } + } + + /** + * Returns the value of the quantile field (determines what percentile is + * computed when evaluate() is called with no quantile argument). + * + * @return quantile + */ + public double getQuantile() { + return quantile; + } + + /** + * Sets the value of the quantile field (determines what percentile is + * computed when evaluate() is called with no quantile argument). + * + * @param p a value between 0 < p <= 100 + * @throws IllegalArgumentException if p is not greater than 0 and less + * than or equal to 100 + */ + public void setQuantile(final double p) { + if (p <= 0 || p > 100) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.OUT_OF_BOUNDS_QUANTILE_VALUE, p); + } + quantile = p; + } + + /** + * {@inheritDoc} + */ + @Override + public Percentile copy() { + Percentile result = new Percentile(); + copy(this, result); + return result; + } + + /** + * Copies source to dest. 
+ * <p>Neither source nor dest can be null.</p> + * + * @param source Percentile to copy + * @param dest Percentile to copy to + * @throws NullPointerException if either source or dest is null + */ + public static void copy(Percentile source, Percentile dest) { + dest.setData(source.getDataRef()); + if (source.cachedPivots != null) { + System.arraycopy(source.cachedPivots, 0, dest.cachedPivots, 0, source.cachedPivots.length); + } + dest.quantile = source.quantile; + } + +} diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/rank/package.html b/src/main/java/org/apache/commons/math/stat/descriptive/rank/package.html new file mode 100644 index 0000000..c69107b --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/rank/package.html @@ -0,0 +1,20 @@ +<html> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + --> + <!-- $Revision: 480440 $ $Date: 2006-11-29 08:14:12 +0100 (mer. 29 nov. 
2006) $ --> + <body>Summary statistics based on ranks.</body> +</html> diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/summary/Product.java b/src/main/java/org/apache/commons/math/stat/descriptive/summary/Product.java new file mode 100644 index 0000000..c7d1d76 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/summary/Product.java @@ -0,0 +1,224 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.descriptive.summary; + +import java.io.Serializable; + +import org.apache.commons.math.stat.descriptive.AbstractStorelessUnivariateStatistic; +import org.apache.commons.math.stat.descriptive.WeightedEvaluation; +import org.apache.commons.math.util.FastMath; + +/** + * Returns the product of the available values. 
+ * <p> + * If there are no values in the dataset, or any of the values are + * <code>NaN</code>, then <code>NaN</code> is returned.</p> + * <p> + * <strong>Note that this implementation is not synchronized.</strong> If + * multiple threads access an instance of this class concurrently, and at least + * one of the threads invokes the <code>increment()</code> or + * <code>clear()</code> method, it must be synchronized externally.</p> + * + * @version $Revision: 1006299 $ $Date: 2010-10-10 16:47:17 +0200 (dim. 10 oct. 2010) $ + */ +public class Product extends AbstractStorelessUnivariateStatistic implements Serializable, WeightedEvaluation { + + /** Serializable version identifier */ + private static final long serialVersionUID = 2824226005990582538L; + + /**The number of values that have been added */ + private long n; + + /** + * The current Running Product. + */ + private double value; + + /** + * Create a Product instance + */ + public Product() { + n = 0; + value = Double.NaN; + } + + /** + * Copy constructor, creates a new {@code Product} identical + * to the {@code original} + * + * @param original the {@code Product} instance to copy + */ + public Product(Product original) { + copy(original, this); + } + + /** + * {@inheritDoc} + */ + @Override + public void increment(final double d) { + if (n == 0) { + value = d; + } else { + value *= d; + } + n++; + } + + /** + * {@inheritDoc} + */ + @Override + public double getResult() { + return value; + } + + /** + * {@inheritDoc} + */ + public long getN() { + return n; + } + + /** + * {@inheritDoc} + */ + @Override + public void clear() { + value = Double.NaN; + n = 0; + } + + /** + * Returns the product of the entries in the specified portion of + * the input array, or <code>Double.NaN</code> if the designated subarray + * is empty. 
+ * <p> + * Throws <code>IllegalArgumentException</code> if the array is null.</p> + * + * @param values the input array + * @param begin index of the first array element to include + * @param length the number of elements to include + * @return the product of the values or Double.NaN if length = 0 + * @throws IllegalArgumentException if the array is null or the array index + * parameters are not valid + */ + @Override + public double evaluate(final double[] values, final int begin, final int length) { + double product = Double.NaN; + if (test(values, begin, length)) { + product = 1.0; + for (int i = begin; i < begin + length; i++) { + product *= values[i]; + } + } + return product; + } + + /** + * <p>Returns the weighted product of the entries in the specified portion of + * the input array, or <code>Double.NaN</code> if the designated subarray + * is empty.</p> + * + * <p>Throws <code>IllegalArgumentException</code> if any of the following are true: + * <ul><li>the values array is null</li> + * <li>the weights array is null</li> + * <li>the weights array does not have the same length as the values array</li> + * <li>the weights array contains one or more infinite values</li> + * <li>the weights array contains one or more NaN values</li> + * <li>the weights array contains negative values</li> + * <li>the start and length arguments do not determine a valid array</li> + * </ul></p> + * + * <p>Uses the formula, <pre> + * weighted product = ∏values[i]<sup>weights[i]</sup> + * </pre> + * that is, the weights are applied as exponents when computing the weighted product.</p> + * + * @param values the input array + * @param weights the weights array + * @param begin index of the first array element to include + * @param length the number of elements to include + * @return the product of the values or Double.NaN if length = 0 + * @throws IllegalArgumentException if the parameters are not valid + * @since 2.1 + */ + public double evaluate(final double[] values, final 
double[] weights, + final int begin, final int length) { + double product = Double.NaN; + if (test(values, weights, begin, length)) { + product = 1.0; + for (int i = begin; i < begin + length; i++) { + product *= FastMath.pow(values[i], weights[i]); + } + } + return product; + } + + /** + * <p>Returns the weighted product of the entries in the input array.</p> + * + * <p>Throws <code>IllegalArgumentException</code> if any of the following are true: + * <ul><li>the values array is null</li> + * <li>the weights array is null</li> + * <li>the weights array does not have the same length as the values array</li> + * <li>the weights array contains one or more infinite values</li> + * <li>the weights array contains one or more NaN values</li> + * <li>the weights array contains negative values</li> + * </ul></p> + * + * <p>Uses the formula, <pre> + * weighted product = ∏values[i]<sup>weights[i]</sup> + * </pre> + * that is, the weights are applied as exponents when computing the weighted product.</p> + * + * @param values the input array + * @param weights the weights array + * @return the product of the values or Double.NaN if length = 0 + * @throws IllegalArgumentException if the parameters are not valid + * @since 2.1 + */ + public double evaluate(final double[] values, final double[] weights) { + return evaluate(values, weights, 0, values.length); + } + + + /** + * {@inheritDoc} + */ + @Override + public Product copy() { + Product result = new Product(); + copy(this, result); + return result; + } + + /** + * Copies source to dest. 
+ * <p>Neither source nor dest can be null.</p> + * + * @param source Product to copy + * @param dest Product to copy to + * @throws NullPointerException if either source or dest is null + */ + public static void copy(Product source, Product dest) { + dest.setData(source.getDataRef()); + dest.n = source.n; + dest.value = source.value; + } + +} diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/summary/Sum.java b/src/main/java/org/apache/commons/math/stat/descriptive/summary/Sum.java new file mode 100644 index 0000000..7188ea8 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/summary/Sum.java @@ -0,0 +1,220 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.descriptive.summary; + +import java.io.Serializable; + +import org.apache.commons.math.stat.descriptive.AbstractStorelessUnivariateStatistic; + + +/** + * Returns the sum of the available values. 
+ * <p> + * If there are no values in the dataset, or any of the values are + * <code>NaN</code>, then <code>NaN</code> is returned.</p> + * <p> + * <strong>Note that this implementation is not synchronized.</strong> If + * multiple threads access an instance of this class concurrently, and at least + * one of the threads invokes the <code>increment()</code> or + * <code>clear()</code> method, it must be synchronized externally.</p> + * + * @version $Revision: 1006299 $ $Date: 2010-10-10 16:47:17 +0200 (dim. 10 oct. 2010) $ + */ +public class Sum extends AbstractStorelessUnivariateStatistic implements Serializable { + + /** Serializable version identifier */ + private static final long serialVersionUID = -8231831954703408316L; + + /** */ + private long n; + + /** + * The currently running sum. + */ + private double value; + + /** + * Create a Sum instance + */ + public Sum() { + n = 0; + value = Double.NaN; + } + + /** + * Copy constructor, creates a new {@code Sum} identical + * to the {@code original} + * + * @param original the {@code Sum} instance to copy + */ + public Sum(Sum original) { + copy(original, this); + } + + /** + * {@inheritDoc} + */ + @Override + public void increment(final double d) { + if (n == 0) { + value = d; + } else { + value += d; + } + n++; + } + + /** + * {@inheritDoc} + */ + @Override + public double getResult() { + return value; + } + + /** + * {@inheritDoc} + */ + public long getN() { + return n; + } + + /** + * {@inheritDoc} + */ + @Override + public void clear() { + value = Double.NaN; + n = 0; + } + + /** + * The sum of the entries in the specified portion of + * the input array, or <code>Double.NaN</code> if the designated subarray + * is empty. 
+ * <p> + * Throws <code>IllegalArgumentException</code> if the array is null.</p> + * + * @param values the input array + * @param begin index of the first array element to include + * @param length the number of elements to include + * @return the sum of the values or Double.NaN if length = 0 + * @throws IllegalArgumentException if the array is null or the array index + * parameters are not valid + */ + @Override + public double evaluate(final double[] values, final int begin, final int length) { + double sum = Double.NaN; + if (test(values, begin, length)) { + sum = 0.0; + for (int i = begin; i < begin + length; i++) { + sum += values[i]; + } + } + return sum; + } + + /** + * The weighted sum of the entries in the specified portion of + * the input array, or <code>Double.NaN</code> if the designated subarray + * is empty. + * <p> + * Throws <code>IllegalArgumentException</code> if any of the following are true: + * <ul><li>the values array is null</li> + * <li>the weights array is null</li> + * <li>the weights array does not have the same length as the values array</li> + * <li>the weights array contains one or more infinite values</li> + * <li>the weights array contains one or more NaN values</li> + * <li>the weights array contains negative values</li> + * <li>the start and length arguments do not determine a valid array</li> + * </ul></p> + * <p> + * Uses the formula, <pre> + * weighted sum = Σ(values[i] * weights[i]) + * </pre></p> + * + * @param values the input array + * @param weights the weights array + * @param begin index of the first array element to include + * @param length the number of elements to include + * @return the sum of the values or Double.NaN if length = 0 + * @throws IllegalArgumentException if the parameters are not valid + * @since 2.1 + */ + public double evaluate(final double[] values, final double[] weights, + final int begin, final int length) { + double sum = Double.NaN; + if (test(values, weights, begin, length)) { + sum = 0.0; + 
for (int i = begin; i < begin + length; i++) { + sum += values[i] * weights[i]; + } + } + return sum; + } + + /** + * The weighted sum of the entries in the the input array. + * <p> + * Throws <code>IllegalArgumentException</code> if any of the following are true: + * <ul><li>the values array is null</li> + * <li>the weights array is null</li> + * <li>the weights array does not have the same length as the values array</li> + * <li>the weights array contains one or more infinite values</li> + * <li>the weights array contains one or more NaN values</li> + * <li>the weights array contains negative values</li> + * </ul></p> + * <p> + * Uses the formula, <pre> + * weighted sum = Σ(values[i] * weights[i]) + * </pre></p> + * + * @param values the input array + * @param weights the weights array + * @return the sum of the values or Double.NaN if length = 0 + * @throws IllegalArgumentException if the parameters are not valid + * @since 2.1 + */ + public double evaluate(final double[] values, final double[] weights) { + return evaluate(values, weights, 0, values.length); + } + + /** + * {@inheritDoc} + */ + @Override + public Sum copy() { + Sum result = new Sum(); + copy(this, result); + return result; + } + + /** + * Copies source to dest. 
+ * <p>Neither source nor dest can be null.</p> + * + * @param source Sum to copy + * @param dest Sum to copy to + * @throws NullPointerException if either source or dest is null + */ + public static void copy(Sum source, Sum dest) { + dest.setData(source.getDataRef()); + dest.n = source.n; + dest.value = source.value; + } + +} diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/summary/SumOfLogs.java b/src/main/java/org/apache/commons/math/stat/descriptive/summary/SumOfLogs.java new file mode 100644 index 0000000..331d5d2 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/summary/SumOfLogs.java @@ -0,0 +1,165 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.descriptive.summary; + +import java.io.Serializable; + +import org.apache.commons.math.stat.descriptive.AbstractStorelessUnivariateStatistic; +import org.apache.commons.math.util.FastMath; + +/** + * Returns the sum of the natural logs for this collection of values. + * <p> + * Uses {@link java.lang.Math#log(double)} to compute the logs. 
Therefore, + * <ul> + * <li>If any of values are < 0, the result is <code>NaN.</code></li> + * <li>If all values are non-negative and less than + * <code>Double.POSITIVE_INFINITY</code>, but at least one value is 0, the + * result is <code>Double.NEGATIVE_INFINITY.</code></li> + * <li>If both <code>Double.POSITIVE_INFINITY</code> and + * <code>Double.NEGATIVE_INFINITY</code> are among the values, the result is + * <code>NaN.</code></li> + * </ul></p> + * <p> + * <strong>Note that this implementation is not synchronized.</strong> If + * multiple threads access an instance of this class concurrently, and at least + * one of the threads invokes the <code>increment()</code> or + * <code>clear()</code> method, it must be synchronized externally.</p> + * + * @version $Revision: 1006299 $ $Date: 2010-10-10 16:47:17 +0200 (dim. 10 oct. 2010) $ + */ +public class SumOfLogs extends AbstractStorelessUnivariateStatistic implements Serializable { + + /** Serializable version identifier */ + private static final long serialVersionUID = -370076995648386763L; + + /**Number of values that have been added */ + private int n; + + /** + * The currently running value + */ + private double value; + + /** + * Create a SumOfLogs instance + */ + public SumOfLogs() { + value = 0d; + n = 0; + } + + /** + * Copy constructor, creates a new {@code SumOfLogs} identical + * to the {@code original} + * + * @param original the {@code SumOfLogs} instance to copy + */ + public SumOfLogs(SumOfLogs original) { + copy(original, this); + } + + /** + * {@inheritDoc} + */ + @Override + public void increment(final double d) { + value += FastMath.log(d); + n++; + } + + /** + * {@inheritDoc} + */ + @Override + public double getResult() { + if (n > 0) { + return value; + } else { + return Double.NaN; + } + } + + /** + * {@inheritDoc} + */ + public long getN() { + return n; + } + + /** + * {@inheritDoc} + */ + @Override + public void clear() { + value = 0d; + n = 0; + } + + /** + * Returns the sum of the 
natural logs of the entries in the specified portion of + * the input array, or <code>Double.NaN</code> if the designated subarray + * is empty. + * <p> + * Throws <code>IllegalArgumentException</code> if the array is null.</p> + * <p> + * See {@link SumOfLogs}.</p> + * + * @param values the input array + * @param begin index of the first array element to include + * @param length the number of elements to include + * @return the sum of the natural logs of the values or Double.NaN if + * length = 0 + * @throws IllegalArgumentException if the array is null or the array index + * parameters are not valid + */ + @Override + public double evaluate(final double[] values, final int begin, final int length) { + double sumLog = Double.NaN; + if (test(values, begin, length)) { + sumLog = 0.0; + for (int i = begin; i < begin + length; i++) { + sumLog += FastMath.log(values[i]); + } + } + return sumLog; + } + + /** + * {@inheritDoc} + */ + @Override + public SumOfLogs copy() { + SumOfLogs result = new SumOfLogs(); + copy(this, result); + return result; + } + + /** + * Copies source to dest. + * <p>Neither source nor dest can be null.</p> + * + * @param source SumOfLogs to copy + * @param dest SumOfLogs to copy to + * @throws NullPointerException if either source or dest is null + */ + public static void copy(SumOfLogs source, SumOfLogs dest) { + dest.setData(source.getDataRef()); + dest.n = source.n; + dest.value = source.value; + } +} diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/summary/SumOfSquares.java b/src/main/java/org/apache/commons/math/stat/descriptive/summary/SumOfSquares.java new file mode 100644 index 0000000..a632bf6 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/summary/SumOfSquares.java @@ -0,0 +1,154 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.descriptive.summary; + +import java.io.Serializable; + +import org.apache.commons.math.stat.descriptive.AbstractStorelessUnivariateStatistic; + +/** + * Returns the sum of the squares of the available values. + * <p> + * If there are no values in the dataset, or any of the values are + * <code>NaN</code>, then <code>NaN</code> is returned.</p> + * <p> + * <strong>Note that this implementation is not synchronized.</strong> If + * multiple threads access an instance of this class concurrently, and at least + * one of the threads invokes the <code>increment()</code> or + * <code>clear()</code> method, it must be synchronized externally.</p> + * + * @version $Revision: 1006299 $ $Date: 2010-10-10 16:47:17 +0200 (dim. 10 oct. 
2010) $ + */ +public class SumOfSquares extends AbstractStorelessUnivariateStatistic implements Serializable { + + /** Serializable version identifier */ + private static final long serialVersionUID = 1460986908574398008L; + + /** */ + private long n; + + /** + * The currently running sumSq + */ + private double value; + + /** + * Create a SumOfSquares instance + */ + public SumOfSquares() { + n = 0; + value = Double.NaN; + } + + /** + * Copy constructor, creates a new {@code SumOfSquares} identical + * to the {@code original} + * + * @param original the {@code SumOfSquares} instance to copy + */ + public SumOfSquares(SumOfSquares original) { + copy(original, this); + } + + /** + * {@inheritDoc} + */ + @Override + public void increment(final double d) { + if (n == 0) { + value = d * d; + } else { + value += d * d; + } + n++; + } + + /** + * {@inheritDoc} + */ + @Override + public double getResult() { + return value; + } + + /** + * {@inheritDoc} + */ + public long getN() { + return n; + } + + /** + * {@inheritDoc} + */ + @Override + public void clear() { + value = Double.NaN; + n = 0; + } + + /** + * Returns the sum of the squares of the entries in the specified portion of + * the input array, or <code>Double.NaN</code> if the designated subarray + * is empty. 
+ * <p> + * Throws <code>IllegalArgumentException</code> if the array is null.</p> + * + * @param values the input array + * @param begin index of the first array element to include + * @param length the number of elements to include + * @return the sum of the squares of the values or Double.NaN if length = 0 + * @throws IllegalArgumentException if the array is null or the array index + * parameters are not valid + */ + @Override + public double evaluate(final double[] values,final int begin, final int length) { + double sumSq = Double.NaN; + if (test(values, begin, length)) { + sumSq = 0.0; + for (int i = begin; i < begin + length; i++) { + sumSq += values[i] * values[i]; + } + } + return sumSq; + } + + /** + * {@inheritDoc} + */ + @Override + public SumOfSquares copy() { + SumOfSquares result = new SumOfSquares(); + copy(this, result); + return result; + } + + /** + * Copies source to dest. + * <p>Neither source nor dest can be null.</p> + * + * @param source SumOfSquares to copy + * @param dest SumOfSquares to copy to + * @throws NullPointerException if either source or dest is null + */ + public static void copy(SumOfSquares source, SumOfSquares dest) { + dest.setData(source.getDataRef()); + dest.n = source.n; + dest.value = source.value; + } + +} diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/summary/package.html b/src/main/java/org/apache/commons/math/stat/descriptive/summary/package.html new file mode 100644 index 0000000..db7f731 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/descriptive/summary/package.html @@ -0,0 +1,20 @@ +<html> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + --> + <!-- $Revision: 480440 $ $Date: 2006-11-29 08:14:12 +0100 (mer. 29 nov. 2006) $ --> + <body>Other summary statistics.</body> +</html> diff --git a/src/main/java/org/apache/commons/math/stat/inference/ChiSquareTest.java b/src/main/java/org/apache/commons/math/stat/inference/ChiSquareTest.java new file mode 100644 index 0000000..6a3ecac --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/inference/ChiSquareTest.java @@ -0,0 +1,222 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.inference; + +import org.apache.commons.math.MathException; + +/** + * An interface for Chi-Square tests. + * <p>This interface handles only known distributions. 
If the distribution is + * unknown and should be provided by a sample, then the {@link UnknownDistributionChiSquareTest + * UnknownDistributionChiSquareTest} extended interface should be used instead.</p> + * @version $Revision: 811685 $ $Date: 2009-09-05 19:36:48 +0200 (sam. 05 sept. 2009) $ + */ +public interface ChiSquareTest { + + /** + * Computes the <a href="http://www.itl.nist.gov/div898/handbook/eda/section3/eda35f.htm"> + * Chi-Square statistic</a> comparing <code>observed</code> and <code>expected</code> + * frequency counts. + * <p> + * This statistic can be used to perform a Chi-Square test evaluating the null hypothesis that + * the observed counts follow the expected distribution.</p> + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>Expected counts must all be positive. + * </li> + * <li>Observed counts must all be >= 0. + * </li> + * <li>The observed and expected arrays must have the same length and + * their common length must be at least 2. + * </li></ul></p><p> + * If any of the preconditions are not met, an + * <code>IllegalArgumentException</code> is thrown.</p> + * + * @param observed array of observed frequency counts + * @param expected array of expected frequency counts + * @return chiSquare statistic + * @throws IllegalArgumentException if preconditions are not met + */ + double chiSquare(double[] expected, long[] observed) + throws IllegalArgumentException; + + /** + * Returns the <i>observed significance level</i>, or <a href= + * "http://www.cas.lancs.ac.uk/glossary_v1.1/hyptest.html#pvalue"> + * p-value</a>, associated with a + * <a href="http://www.itl.nist.gov/div898/handbook/eda/section3/eda35f.htm"> + * Chi-square goodness of fit test</a> comparing the <code>observed</code> + * frequency counts to those in the <code>expected</code> array. 
+ * <p> + * The number returned is the smallest significance level at which one can reject + * the null hypothesis that the observed counts conform to the frequency distribution + * described by the expected counts.</p> + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>Expected counts must all be positive. + * </li> + * <li>Observed counts must all be >= 0. + * </li> + * <li>The observed and expected arrays must have the same length and + * their common length must be at least 2. + * </li></ul></p><p> + * If any of the preconditions are not met, an + * <code>IllegalArgumentException</code> is thrown.</p> + * + * @param observed array of observed frequency counts + * @param expected array of expected frequency counts + * @return p-value + * @throws IllegalArgumentException if preconditions are not met + * @throws MathException if an error occurs computing the p-value + */ + double chiSquareTest(double[] expected, long[] observed) + throws IllegalArgumentException, MathException; + + /** + * Performs a <a href="http://www.itl.nist.gov/div898/handbook/eda/section3/eda35f.htm"> + * Chi-square goodness of fit test</a> evaluating the null hypothesis that the observed counts + * conform to the frequency distribution described by the expected counts, with + * significance level <code>alpha</code>. Returns true iff the null hypothesis can be rejected + * with 100 * (1 - alpha) percent confidence. + * <p> + * <strong>Example:</strong><br> + * To test the hypothesis that <code>observed</code> follows + * <code>expected</code> at the 99% level, use </p><p> + * <code>chiSquareTest(expected, observed, 0.01) </code></p> + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>Expected counts must all be positive. + * </li> + * <li>Observed counts must all be >= 0. + * </li> + * <li>The observed and expected arrays must have the same length and + * their common length must be at least 2. 
+ * <li> <code> 0 < alpha <= 0.5 </code> + * </li></ul></p><p> + * If any of the preconditions are not met, an + * <code>IllegalArgumentException</code> is thrown.</p> + * + * @param observed array of observed frequency counts + * @param expected array of expected frequency counts + * @param alpha significance level of the test + * @return true iff null hypothesis can be rejected with confidence + * 1 - alpha + * @throws IllegalArgumentException if preconditions are not met + * @throws MathException if an error occurs performing the test + */ + boolean chiSquareTest(double[] expected, long[] observed, double alpha) + throws IllegalArgumentException, MathException; + + /** + * Computes the Chi-Square statistic associated with a + * <a href="http://www.itl.nist.gov/div898/handbook/prc/section4/prc45.htm"> + * chi-square test of independence</a> based on the input <code>counts</code> + * array, viewed as a two-way table. + * <p> + * The rows of the 2-way table are + * <code>count[0], ... , count[count.length - 1] </code></p> + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>All counts must be >= 0. + * </li> + * <li>The count array must be rectangular (i.e. all count[i] subarrays + * must have the same length). + * </li> + * <li>The 2-way table represented by <code>counts</code> must have at + * least 2 columns and at least 2 rows. 
+ * </li> + * </li></ul></p><p> + * If any of the preconditions are not met, an + * <code>IllegalArgumentException</code> is thrown.</p> + * + * @param counts array representation of 2-way table + * @return chiSquare statistic + * @throws IllegalArgumentException if preconditions are not met + */ + double chiSquare(long[][] counts) + throws IllegalArgumentException; + + /** + * Returns the <i>observed significance level</i>, or <a href= + * "http://www.cas.lancs.ac.uk/glossary_v1.1/hyptest.html#pvalue"> + * p-value</a>, associated with a + * <a href="http://www.itl.nist.gov/div898/handbook/prc/section4/prc45.htm"> + * chi-square test of independence</a> based on the input <code>counts</code> + * array, viewed as a two-way table. + * <p> + * The rows of the 2-way table are + * <code>count[0], ... , count[count.length - 1] </code></p> + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>All counts must be >= 0. + * </li> + * <li>The count array must be rectangular (i.e. all count[i] subarrays must have the same length). + * </li> + * <li>The 2-way table represented by <code>counts</code> must have at least 2 columns and + * at least 2 rows. + * </li> + * </li></ul></p><p> + * If any of the preconditions are not met, an + * <code>IllegalArgumentException</code> is thrown.</p> + * + * @param counts array representation of 2-way table + * @return p-value + * @throws IllegalArgumentException if preconditions are not met + * @throws MathException if an error occurs computing the p-value + */ + double chiSquareTest(long[][] counts) + throws IllegalArgumentException, MathException; + + /** + * Performs a <a href="http://www.itl.nist.gov/div898/handbook/prc/section4/prc45.htm"> + * chi-square test of independence</a> evaluating the null hypothesis that the classifications + * represented by the counts in the columns of the input 2-way table are independent of the rows, + * with significance level <code>alpha</code>. 
Returns true iff the null hypothesis can be rejected + * with 100 * (1 - alpha) percent confidence. + * <p> + * The rows of the 2-way table are + * <code>count[0], ... , count[count.length - 1] </code></p> + * <p> + * <strong>Example:</strong><br> + * To test the null hypothesis that the counts in + * <code>count[0], ... , count[count.length - 1] </code> + * all correspond to the same underlying probability distribution at the 99% level, use </p><p> + * <code>chiSquareTest(counts, 0.01) </code></p> + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>All counts must be >= 0. + * </li> + * <li>The count array must be rectangular (i.e. all count[i] subarrays must have the same length). + * </li> + * <li>The 2-way table represented by <code>counts</code> must have at least 2 columns and + * at least 2 rows. + * </li> + * </li></ul></p><p> + * If any of the preconditions are not met, an + * <code>IllegalArgumentException</code> is thrown.</p> + * + * @param counts array representation of 2-way table + * @param alpha significance level of the test + * @return true iff null hypothesis can be rejected with confidence + * 1 - alpha + * @throws IllegalArgumentException if preconditions are not met + * @throws MathException if an error occurs performing the test + */ + boolean chiSquareTest(long[][] counts, double alpha) + throws IllegalArgumentException, MathException; + +} diff --git a/src/main/java/org/apache/commons/math/stat/inference/ChiSquareTestImpl.java b/src/main/java/org/apache/commons/math/stat/inference/ChiSquareTestImpl.java new file mode 100644 index 0000000..abb32a5 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/inference/ChiSquareTestImpl.java @@ -0,0 +1,424 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.inference; + +import org.apache.commons.math.MathException; +import org.apache.commons.math.MathRuntimeException; +import org.apache.commons.math.distribution.ChiSquaredDistribution; +import org.apache.commons.math.distribution.ChiSquaredDistributionImpl; +import org.apache.commons.math.exception.util.LocalizedFormats; +import org.apache.commons.math.util.FastMath; + +/** + * Implements Chi-Square test statistics defined in the + * {@link UnknownDistributionChiSquareTest} interface. + * + * @version $Revision: 990655 $ $Date: 2010-08-29 23:49:40 +0200 (dim. 29 août 2010) $ + */ +public class ChiSquareTestImpl implements UnknownDistributionChiSquareTest { + + /** Distribution used to compute inference statistics. */ + private ChiSquaredDistribution distribution; + + /** + * Construct a ChiSquareTestImpl + */ + public ChiSquareTestImpl() { + this(new ChiSquaredDistributionImpl(1.0)); + } + + /** + * Create a test instance using the given distribution for computing + * inference statistics. + * @param x distribution used to compute inference statistics. 
+ * @since 1.2 + */ + public ChiSquareTestImpl(ChiSquaredDistribution x) { + super(); + setDistribution(x); + } + /** + * {@inheritDoc} + * <p><strong>Note: </strong>This implementation rescales the + * <code>expected</code> array if necessary to ensure that the sum of the + * expected and observed counts are equal.</p> + * + * @param observed array of observed frequency counts + * @param expected array of expected frequency counts + * @return chi-square test statistic + * @throws IllegalArgumentException if preconditions are not met + * or length is less than 2 + */ + public double chiSquare(double[] expected, long[] observed) + throws IllegalArgumentException { + if (expected.length < 2) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.INSUFFICIENT_DIMENSION, expected.length, 2); + } + if (expected.length != observed.length) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.DIMENSIONS_MISMATCH_SIMPLE, expected.length, observed.length); + } + checkPositive(expected); + checkNonNegative(observed); + double sumExpected = 0d; + double sumObserved = 0d; + for (int i = 0; i < observed.length; i++) { + sumExpected += expected[i]; + sumObserved += observed[i]; + } + double ratio = 1.0d; + boolean rescale = false; + if (FastMath.abs(sumExpected - sumObserved) > 10E-6) { + ratio = sumObserved / sumExpected; + rescale = true; + } + double sumSq = 0.0d; + for (int i = 0; i < observed.length; i++) { + if (rescale) { + final double dev = observed[i] - ratio * expected[i]; + sumSq += dev * dev / (ratio * expected[i]); + } else { + final double dev = observed[i] - expected[i]; + sumSq += dev * dev / expected[i]; + } + } + return sumSq; + } + + /** + * {@inheritDoc} + * <p><strong>Note: </strong>This implementation rescales the + * <code>expected</code> array if necessary to ensure that the sum of the + * expected and observed counts are equal.</p> + * + * @param observed array of observed frequency counts + * 
@param expected array of expected frequency counts + * @return p-value + * @throws IllegalArgumentException if preconditions are not met + * @throws MathException if an error occurs computing the p-value + */ + public double chiSquareTest(double[] expected, long[] observed) + throws IllegalArgumentException, MathException { + distribution.setDegreesOfFreedom(expected.length - 1.0); + return 1.0 - distribution.cumulativeProbability( + chiSquare(expected, observed)); + } + + /** + * {@inheritDoc} + * <p><strong>Note: </strong>This implementation rescales the + * <code>expected</code> array if necessary to ensure that the sum of the + * expected and observed counts are equal.</p> + * + * @param observed array of observed frequency counts + * @param expected array of expected frequency counts + * @param alpha significance level of the test + * @return true iff null hypothesis can be rejected with confidence + * 1 - alpha + * @throws IllegalArgumentException if preconditions are not met + * @throws MathException if an error occurs performing the test + */ + public boolean chiSquareTest(double[] expected, long[] observed, + double alpha) throws IllegalArgumentException, MathException { + if ((alpha <= 0) || (alpha > 0.5)) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL, + alpha, 0, 0.5); + } + return chiSquareTest(expected, observed) < alpha; + } + + /** + * @param counts array representation of 2-way table + * @return chi-square test statistic + * @throws IllegalArgumentException if preconditions are not met + */ + public double chiSquare(long[][] counts) throws IllegalArgumentException { + + checkArray(counts); + int nRows = counts.length; + int nCols = counts[0].length; + + // compute row, column and total sums + double[] rowSum = new double[nRows]; + double[] colSum = new double[nCols]; + double total = 0.0d; + for (int row = 0; row < nRows; row++) { + for (int col = 0; col < nCols; col++) { + 
rowSum[row] += counts[row][col]; + colSum[col] += counts[row][col]; + total += counts[row][col]; + } + } + + // compute expected counts and chi-square + double sumSq = 0.0d; + double expected = 0.0d; + for (int row = 0; row < nRows; row++) { + for (int col = 0; col < nCols; col++) { + expected = (rowSum[row] * colSum[col]) / total; + sumSq += ((counts[row][col] - expected) * + (counts[row][col] - expected)) / expected; + } + } + return sumSq; + } + + /** + * @param counts array representation of 2-way table + * @return p-value + * @throws IllegalArgumentException if preconditions are not met + * @throws MathException if an error occurs computing the p-value + */ + public double chiSquareTest(long[][] counts) + throws IllegalArgumentException, MathException { + checkArray(counts); + double df = ((double) counts.length -1) * ((double) counts[0].length - 1); + distribution.setDegreesOfFreedom(df); + return 1 - distribution.cumulativeProbability(chiSquare(counts)); + } + + /** + * @param counts array representation of 2-way table + * @param alpha significance level of the test + * @return true iff null hypothesis can be rejected with confidence + * 1 - alpha + * @throws IllegalArgumentException if preconditions are not met + * @throws MathException if an error occurs performing the test + */ + public boolean chiSquareTest(long[][] counts, double alpha) + throws IllegalArgumentException, MathException { + if ((alpha <= 0) || (alpha > 0.5)) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL, + alpha, 0.0, 0.5); + } + return chiSquareTest(counts) < alpha; + } + + /** + * @param observed1 array of observed frequency counts of the first data set + * @param observed2 array of observed frequency counts of the second data set + * @return chi-square test statistic + * @throws IllegalArgumentException if preconditions are not met + * @since 1.2 + */ + public double chiSquareDataSetsComparison(long[] observed1, long[] 
observed2) + throws IllegalArgumentException { + + // Make sure lengths are same + if (observed1.length < 2) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.INSUFFICIENT_DIMENSION, observed1.length, 2); + } + if (observed1.length != observed2.length) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.DIMENSIONS_MISMATCH_SIMPLE, + observed1.length, observed2.length); + } + + // Ensure non-negative counts + checkNonNegative(observed1); + checkNonNegative(observed2); + + // Compute and compare count sums + long countSum1 = 0; + long countSum2 = 0; + boolean unequalCounts = false; + double weight = 0.0; + for (int i = 0; i < observed1.length; i++) { + countSum1 += observed1[i]; + countSum2 += observed2[i]; + } + // Ensure neither sample is uniformly 0 + if (countSum1 == 0) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.OBSERVED_COUNTS_ALL_ZERO, 1); + } + if (countSum2 == 0) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.OBSERVED_COUNTS_ALL_ZERO, 2); + } + // Compare and compute weight only if different + unequalCounts = countSum1 != countSum2; + if (unequalCounts) { + weight = FastMath.sqrt((double) countSum1 / (double) countSum2); + } + // Compute ChiSquare statistic + double sumSq = 0.0d; + double dev = 0.0d; + double obs1 = 0.0d; + double obs2 = 0.0d; + for (int i = 0; i < observed1.length; i++) { + if (observed1[i] == 0 && observed2[i] == 0) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.OBSERVED_COUNTS_BOTTH_ZERO_FOR_ENTRY, i); + } else { + obs1 = observed1[i]; + obs2 = observed2[i]; + if (unequalCounts) { // apply weights + dev = obs1/weight - obs2 * weight; + } else { + dev = obs1 - obs2; + } + sumSq += (dev * dev) / (obs1 + obs2); + } + } + return sumSq; + } + + /** + * @param observed1 array of observed frequency counts of the first data set + * @param observed2 array of observed frequency 
counts of the second data set + * @return p-value + * @throws IllegalArgumentException if preconditions are not met + * @throws MathException if an error occurs computing the p-value + * @since 1.2 + */ + public double chiSquareTestDataSetsComparison(long[] observed1, long[] observed2) + throws IllegalArgumentException, MathException { + distribution.setDegreesOfFreedom((double) observed1.length - 1); + return 1 - distribution.cumulativeProbability( + chiSquareDataSetsComparison(observed1, observed2)); + } + + /** + * @param observed1 array of observed frequency counts of the first data set + * @param observed2 array of observed frequency counts of the second data set + * @param alpha significance level of the test + * @return true iff null hypothesis can be rejected with confidence + * 1 - alpha + * @throws IllegalArgumentException if preconditions are not met + * @throws MathException if an error occurs performing the test + * @since 1.2 + */ + public boolean chiSquareTestDataSetsComparison(long[] observed1, long[] observed2, + double alpha) throws IllegalArgumentException, MathException { + if ((alpha <= 0) || (alpha > 0.5)) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL, + alpha, 0.0, 0.5); + } + return chiSquareTestDataSetsComparison(observed1, observed2) < alpha; + } + + /** + * Checks to make sure that the input long[][] array is rectangular, + * has at least 2 rows and 2 columns, and has all non-negative entries, + * throwing IllegalArgumentException if any of these checks fail. 
+ * + * @param in input 2-way table to check + * @throws IllegalArgumentException if the array is not valid + */ + private void checkArray(long[][] in) throws IllegalArgumentException { + + if (in.length < 2) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.INSUFFICIENT_DIMENSION, in.length, 2); + } + + if (in[0].length < 2) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.INSUFFICIENT_DIMENSION, in[0].length, 2); + } + + checkRectangular(in); + checkNonNegative(in); + + } + + //--------------------- Private array methods -- should find a utility home for these + + /** + * Throws IllegalArgumentException if the input array is not rectangular. + * + * @param in array to be tested + * @throws NullPointerException if input array is null + * @throws IllegalArgumentException if input array is not rectangular + */ + private void checkRectangular(long[][] in) { + for (int i = 1; i < in.length; i++) { + if (in[i].length != in[0].length) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.DIFFERENT_ROWS_LENGTHS, + in[i].length, in[0].length); + } + } + } + + /** + * Check all entries of the input array are > 0. + * + * @param in array to be tested + * @exception IllegalArgumentException if one entry is not positive + */ + private void checkPositive(double[] in) throws IllegalArgumentException { + for (int i = 0; i < in.length; i++) { + if (in[i] <= 0) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.NOT_POSITIVE_ELEMENT_AT_INDEX, + i, in[i]); + } + } + } + + /** + * Check all entries of the input array are >= 0. 
+ * + * @param in array to be tested + * @exception IllegalArgumentException if one entry is negative + */ + private void checkNonNegative(long[] in) throws IllegalArgumentException { + for (int i = 0; i < in.length; i++) { + if (in[i] < 0) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.NEGATIVE_ELEMENT_AT_INDEX, + i, in[i]); + } + } + } + + /** + * Check all entries of the input array are >= 0. + * + * @param in array to be tested + * @exception IllegalArgumentException if one entry is negative + */ + private void checkNonNegative(long[][] in) throws IllegalArgumentException { + for (int i = 0; i < in.length; i ++) { + for (int j = 0; j < in[i].length; j++) { + if (in[i][j] < 0) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.NEGATIVE_ELEMENT_AT_2D_INDEX, + i, j, in[i][j]); + } + } + } + } + + /** + * Modify the distribution used to compute inference statistics. + * + * @param value + * the new distribution + * @since 1.2 + */ + public void setDistribution(ChiSquaredDistribution value) { + distribution = value; + } +} diff --git a/src/main/java/org/apache/commons/math/stat/inference/OneWayAnova.java b/src/main/java/org/apache/commons/math/stat/inference/OneWayAnova.java new file mode 100644 index 0000000..a2cde47 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/inference/OneWayAnova.java @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.inference; + +import org.apache.commons.math.MathException; +import java.util.Collection; + +/** + * An interface for one-way ANOVA (analysis of variance). + * + * <p> Tests for differences between two or more categories of univariate data + * (for example, the body mass index of accountants, lawyers, doctors and + * computer programmers). When two categories are given, this is equivalent to + * the {@link org.apache.commons.math.stat.inference.TTest}. + * </p> + * + * @since 1.2 + * @version $Revision: 811786 $ $Date: 2009-09-06 11:36:08 +0200 (dim. 06 sept. 2009) $ + */ +public interface OneWayAnova { + + /** + * Computes the ANOVA F-value for a collection of <code>double[]</code> + * arrays. + * + * <p><strong>Preconditions</strong>: <ul> + * <li>The categoryData <code>Collection</code> must contain + * <code>double[]</code> arrays.</li> + * <li> There must be at least two <code>double[]</code> arrays in the + * <code>categoryData</code> collection and each of these arrays must + * contain at least two values.</li></ul></p> + * + * @param categoryData <code>Collection</code> of <code>double[]</code> + * arrays each containing data for one category + * @return Fvalue + * @throws IllegalArgumentException if the preconditions are not met + * @throws MathException if the statistic can not be computed due to a + * convergence or other numerical error. 
+ */ + double anovaFValue(Collection<double[]> categoryData) + throws IllegalArgumentException, MathException; + + /** + * Computes the ANOVA P-value for a collection of <code>double[]</code> + * arrays. + * + * <p><strong>Preconditions</strong>: <ul> + * <li>The categoryData <code>Collection</code> must contain + * <code>double[]</code> arrays.</li> + * <li> There must be at least two <code>double[]</code> arrays in the + * <code>categoryData</code> collection and each of these arrays must + * contain at least two values.</li></ul></p> + * + * @param categoryData <code>Collection</code> of <code>double[]</code> + * arrays each containing data for one category + * @return Pvalue + * @throws IllegalArgumentException if the preconditions are not met + * @throws MathException if the statistic can not be computed due to a + * convergence or other numerical error. + */ + double anovaPValue(Collection<double[]> categoryData) + throws IllegalArgumentException, MathException; + + /** + * Performs an ANOVA test, evaluating the null hypothesis that there + * is no difference among the means of the data categories. + * + * <p><strong>Preconditions</strong>: <ul> + * <li>The categoryData <code>Collection</code> must contain + * <code>double[]</code> arrays.</li> + * <li> There must be at least two <code>double[]</code> arrays in the + * <code>categoryData</code> collection and each of these arrays must + * contain at least two values.</li> + * <li>alpha must be strictly greater than 0 and less than or equal to 0.5. + * </li></ul></p> + * + * @param categoryData <code>Collection</code> of <code>double[]</code> + * arrays each containing data for one category + * @param alpha significance level of the test + * @return true if the null hypothesis can be rejected with + * confidence 1 - alpha + * @throws IllegalArgumentException if the preconditions are not met + * @throws MathException if the statistic can not be computed due to a + * convergence or other numerical error. 
+ */ + boolean anovaTest(Collection<double[]> categoryData, double alpha) + throws IllegalArgumentException, MathException; + +} diff --git a/src/main/java/org/apache/commons/math/stat/inference/OneWayAnovaImpl.java b/src/main/java/org/apache/commons/math/stat/inference/OneWayAnovaImpl.java new file mode 100644 index 0000000..a47d0cf --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/inference/OneWayAnovaImpl.java @@ -0,0 +1,210 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.inference; + +import java.util.Collection; + +import org.apache.commons.math.MathException; +import org.apache.commons.math.MathRuntimeException; +import org.apache.commons.math.distribution.FDistribution; +import org.apache.commons.math.distribution.FDistributionImpl; +import org.apache.commons.math.exception.util.LocalizedFormats; +import org.apache.commons.math.stat.descriptive.summary.Sum; +import org.apache.commons.math.stat.descriptive.summary.SumOfSquares; + + +/** + * Implements one-way ANOVA statistics defined in the {@link OneWayAnova} + * interface. 
+ * + * <p>Uses the + * {@link org.apache.commons.math.distribution.FDistribution + * commons-math F Distribution implementation} to estimate exact p-values.</p> + * + * <p>This implementation is based on a description at + * http://faculty.vassar.edu/lowry/ch13pt1.html</p> + * <pre> + * Abbreviations: bg = between groups, + * wg = within groups, + * ss = sum squared deviations + * </pre> + * + * @since 1.2 + * @version $Revision: 983921 $ $Date: 2010-08-10 12:46:06 +0200 (mar. 10 août 2010) $ + */ +public class OneWayAnovaImpl implements OneWayAnova { + + /** + * Default constructor. + */ + public OneWayAnovaImpl() { + } + + /** + * {@inheritDoc}<p> + * This implementation computes the F statistic using the definitional + * formula<pre> + * F = msbg/mswg</pre> + * where<pre> + * msbg = between group mean square + * mswg = within group mean square</pre> + * are as defined <a href="http://faculty.vassar.edu/lowry/ch13pt1.html"> + * here</a></p> + */ + public double anovaFValue(Collection<double[]> categoryData) + throws IllegalArgumentException, MathException { + AnovaStats a = anovaStats(categoryData); + return a.F; + } + + /** + * {@inheritDoc}<p> + * This implementation uses the + * {@link org.apache.commons.math.distribution.FDistribution + * commons-math F Distribution implementation} to estimate the exact + * p-value, using the formula<pre> + * p = 1 - cumulativeProbability(F)</pre> + * where <code>F</code> is the F value and <code>cumulativeProbability</code> + * is the commons-math implementation of the F distribution.</p> + */ + public double anovaPValue(Collection<double[]> categoryData) + throws IllegalArgumentException, MathException { + AnovaStats a = anovaStats(categoryData); + FDistribution fdist = new FDistributionImpl(a.dfbg, a.dfwg); + return 1.0 - fdist.cumulativeProbability(a.F); + } + + /** + * {@inheritDoc}<p> + * This implementation uses the + * {@link org.apache.commons.math.distribution.FDistribution + * commons-math F Distribution 
implementation} to estimate the exact + * p-value, using the formula<pre> + * p = 1 - cumulativeProbability(F)</pre> + * where <code>F</code> is the F value and <code>cumulativeProbability</code> + * is the commons-math implementation of the F distribution.</p> + * <p>True is returned iff the estimated p-value is less than alpha.</p> + */ + public boolean anovaTest(Collection<double[]> categoryData, double alpha) + throws IllegalArgumentException, MathException { + if ((alpha <= 0) || (alpha > 0.5)) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL, + alpha, 0, 0.5); + } + return anovaPValue(categoryData) < alpha; + } + + + /** + * This method actually does the calculations (except P-value). + * + * @param categoryData <code>Collection</code> of <code>double[]</code> + * arrays each containing data for one category + * @return computed AnovaStats + * @throws IllegalArgumentException if categoryData does not meet + * preconditions specified in the interface definition + * @throws MathException if an error occurs computing the Anova stats + */ + private AnovaStats anovaStats(Collection<double[]> categoryData) + throws IllegalArgumentException, MathException { + + // check if we have enough categories + if (categoryData.size() < 2) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.TWO_OR_MORE_CATEGORIES_REQUIRED, + categoryData.size()); + } + + // check if each category has enough data and all is double[] + for (double[] array : categoryData) { + if (array.length <= 1) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.TWO_OR_MORE_VALUES_IN_CATEGORY_REQUIRED, + array.length); + } + } + + int dfwg = 0; + double sswg = 0; + Sum totsum = new Sum(); + SumOfSquares totsumsq = new SumOfSquares(); + int totnum = 0; + + for (double[] data : categoryData) { + + Sum sum = new Sum(); + SumOfSquares sumsq = new SumOfSquares(); + int num = 0; + + for (int 
i = 0; i < data.length; i++) { + double val = data[i]; + + // within category + num++; + sum.increment(val); + sumsq.increment(val); + + // for all categories + totnum++; + totsum.increment(val); + totsumsq.increment(val); + } + dfwg += num - 1; + double ss = sumsq.getResult() - sum.getResult() * sum.getResult() / num; + sswg += ss; + } + double sst = totsumsq.getResult() - totsum.getResult() * + totsum.getResult()/totnum; + double ssbg = sst - sswg; + int dfbg = categoryData.size() - 1; + double msbg = ssbg/dfbg; + double mswg = sswg/dfwg; + double F = msbg/mswg; + + return new AnovaStats(dfbg, dfwg, F); + } + + /** + Convenience class to pass dfbg,dfwg,F values around within AnovaImpl. + No get/set methods provided. + */ + private static class AnovaStats { + + /** Degrees of freedom in numerator (between groups). */ + private int dfbg; + + /** Degrees of freedom in denominator (within groups). */ + private int dfwg; + + /** Statistic. */ + private double F; + + /** + * Constructor + * @param dfbg degrees of freedom in numerator (between groups) + * @param dfwg degrees of freedom in denominator (within groups) + * @param F statistic + */ + private AnovaStats(int dfbg, int dfwg, double F) { + this.dfbg = dfbg; + this.dfwg = dfwg; + this.F = F; + } + } + +} diff --git a/src/main/java/org/apache/commons/math/stat/inference/TTest.java b/src/main/java/org/apache/commons/math/stat/inference/TTest.java new file mode 100644 index 0000000..0ccb0c0 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/inference/TTest.java @@ -0,0 +1,771 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.inference; + +import org.apache.commons.math.MathException; +import org.apache.commons.math.stat.descriptive.StatisticalSummary; + +/** + * An interface for Student's t-tests. + * <p> + * Tests can be:<ul> + * <li>One-sample or two-sample</li> + * <li>One-sided or two-sided</li> + * <li>Paired or unpaired (for two-sample tests)</li> + * <li>Homoscedastic (equal variance assumption) or heteroscedastic + * (for two sample tests)</li> + * <li>Fixed significance level (boolean-valued) or returning p-values. + * </li></ul></p> + * <p> + * Test statistics are available for all tests. Methods including "Test" in + * their names perform tests, all other methods return t-statistics. Among + * the "Test" methods, <code>double-</code>valued methods return p-values; + * <code>boolean-</code>valued methods perform fixed significance level tests. + * Significance levels are always specified as numbers between 0 and 0.5 + * (e.g. tests at the 95% level use <code>alpha=0.05</code>).</p> + * <p> + * Input to tests can be either <code>double[]</code> arrays or + * {@link StatisticalSummary} instances.</p> + * + * + * @version $Revision: 811786 $ $Date: 2009-09-06 11:36:08 +0200 (dim. 06 sept. 2009) $ + */ +public interface TTest { + /** + * Computes a paired, 2-sample t-statistic based on the data in the input + * arrays. 
The t-statistic returned is equivalent to what would be returned by + * computing the one-sample t-statistic {@link #t(double, double[])}, with + * <code>mu = 0</code> and the sample array consisting of the (signed) + * differences between corresponding entries in <code>sample1</code> and + * <code>sample2.</code> + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>The input arrays must have the same length and their common length + * must be at least 2. + * </li></ul></p> + * + * @param sample1 array of sample data values + * @param sample2 array of sample data values + * @return t statistic + * @throws IllegalArgumentException if the precondition is not met + * @throws MathException if the statistic can not be computed due to a + * convergence or other numerical error. + */ + double pairedT(double[] sample1, double[] sample2) + throws IllegalArgumentException, MathException; + /** + * Returns the <i>observed significance level</i>, or + * <i> p-value</i>, associated with a paired, two-sample, two-tailed t-test + * based on the data in the input arrays. + * <p> + * The number returned is the smallest significance level + * at which one can reject the null hypothesis that the mean of the paired + * differences is 0 in favor of the two-sided alternative that the mean paired + * difference is not equal to 0. 
For a one-sided test, divide the returned + * value by 2.</p> + * <p> + * This test is equivalent to a one-sample t-test computed using + * {@link #tTest(double, double[])} with <code>mu = 0</code> and the sample + * array consisting of the signed differences between corresponding elements of + * <code>sample1</code> and <code>sample2.</code></p> + * <p> + * <strong>Usage Note:</strong><br> + * The validity of the p-value depends on the assumptions of the parametric + * t-test procedure, as discussed + * <a href="http://www.basic.nwu.edu/statguidefiles/ttest_unpaired_ass_viol.html"> + * here</a></p> + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>The input array lengths must be the same and their common length must + * be at least 2. + * </li></ul></p> + * + * @param sample1 array of sample data values + * @param sample2 array of sample data values + * @return p-value for t-test + * @throws IllegalArgumentException if the precondition is not met + * @throws MathException if an error occurs computing the p-value + */ + double pairedTTest(double[] sample1, double[] sample2) + throws IllegalArgumentException, MathException; + /** + * Performs a paired t-test evaluating the null hypothesis that the + * mean of the paired differences between <code>sample1</code> and + * <code>sample2</code> is 0 in favor of the two-sided alternative that the + * mean paired difference is not equal to 0, with significance level + * <code>alpha</code>. + * <p> + * Returns <code>true</code> iff the null hypothesis can be rejected with + * confidence <code>1 - alpha</code>. 
To perform a 1-sided test, use + * <code>alpha * 2</code></p> + * <p> + * <strong>Usage Note:</strong><br> + * The validity of the test depends on the assumptions of the parametric + * t-test procedure, as discussed + * <a href="http://www.basic.nwu.edu/statguidefiles/ttest_unpaired_ass_viol.html"> + * here</a></p> + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>The input array lengths must be the same and their common length + * must be at least 2. + * </li> + * <li> <code> 0 < alpha < 0.5 </code> + * </li></ul></p> + * + * @param sample1 array of sample data values + * @param sample2 array of sample data values + * @param alpha significance level of the test + * @return true if the null hypothesis can be rejected with + * confidence 1 - alpha + * @throws IllegalArgumentException if the preconditions are not met + * @throws MathException if an error occurs performing the test + */ + boolean pairedTTest( + double[] sample1, + double[] sample2, + double alpha) + throws IllegalArgumentException, MathException; + /** + * Computes a <a href="http://www.itl.nist.gov/div898/handbook/prc/section2/prc22.htm#formula"> + * t statistic </a> given observed values and a comparison constant. + * <p> + * This statistic can be used to perform a one sample t-test for the mean. + * </p><p> + * <strong>Preconditions</strong>: <ul> + * <li>The observed array length must be at least 2. + * </li></ul></p> + * + * @param mu comparison constant + * @param observed array of values + * @return t statistic + * @throws IllegalArgumentException if input array length is less than 2 + */ + double t(double mu, double[] observed) + throws IllegalArgumentException; + /** + * Computes a <a href="http://www.itl.nist.gov/div898/handbook/prc/section2/prc22.htm#formula"> + * t statistic </a> to use in comparing the mean of the dataset described by + * <code>sampleStats</code> to <code>mu</code>. + * <p> + * This statistic can be used to perform a one sample t-test for the mean. 
+ * </p><p> + * <strong>Preconditions</strong>: <ul> + * <li><code>sampleStats.getN() >= 2</code>. + * </li></ul></p> + * + * @param mu comparison constant + * @param sampleStats StatisticalSummary holding sample summary statistics + * @return t statistic + * @throws IllegalArgumentException if the precondition is not met + */ + double t(double mu, StatisticalSummary sampleStats) + throws IllegalArgumentException; + /** + * Computes a 2-sample t statistic, under the hypothesis of equal + * subpopulation variances. To compute a t-statistic without the + * equal variances hypothesis, use {@link #t(double[], double[])}. + * <p> + * This statistic can be used to perform a (homoscedastic) two-sample + * t-test to compare sample means.</p> + * <p> + * The t-statistic is</p> + * <p> + * <code> t = (m1 - m2) / (sqrt(1/n1 +1/n2) sqrt(var))</code> + * </p><p> + * where <strong><code>n1</code></strong> is the size of first sample; + * <strong><code> n2</code></strong> is the size of second sample; + * <strong><code> m1</code></strong> is the mean of first sample; + * <strong><code> m2</code></strong> is the mean of second sample; + * and <strong><code>var</code></strong> is the pooled variance estimate + * (a weighted average of the two sample variances): + * </p><p> + * <code>var = ((n1 - 1)var1 + (n2 - 1)var2) / ((n1-1) + (n2-1))</code> + * </p><p> + * with <strong><code>var1</code></strong> the variance of the first sample and + * <strong><code>var2</code></strong> the variance of the second sample. + * </p><p> + * <strong>Preconditions</strong>: <ul> + * <li>The observed array lengths must both be at least 2. 
+ * </li></ul></p> + * + * @param sample1 array of sample data values + * @param sample2 array of sample data values + * @return t statistic + * @throws IllegalArgumentException if the precondition is not met + */ + double homoscedasticT(double[] sample1, double[] sample2) + throws IllegalArgumentException; + /** + * Computes a 2-sample t statistic, without the hypothesis of equal + * subpopulation variances. To compute a t-statistic assuming equal + * variances, use {@link #homoscedasticT(double[], double[])}. + * <p> + * This statistic can be used to perform a two-sample t-test to compare + * sample means.</p> + * <p> + * The t-statistic is</p> + * <p> + * <code> t = (m1 - m2) / sqrt(var1/n1 + var2/n2)</code> + * </p><p> + * where <strong><code>n1</code></strong> is the size of the first sample; + * <strong><code> n2</code></strong> is the size of the second sample; + * <strong><code> m1</code></strong> is the mean of the first sample; + * <strong><code> m2</code></strong> is the mean of the second sample; + * <strong><code> var1</code></strong> is the variance of the first sample; + * <strong><code> var2</code></strong> is the variance of the second sample. + * </p><p> + * <strong>Preconditions</strong>: <ul> + * <li>The observed array lengths must both be at least 2. + * </li></ul></p> + * + * @param sample1 array of sample data values + * @param sample2 array of sample data values + * @return t statistic + * @throws IllegalArgumentException if the precondition is not met + */ + double t(double[] sample1, double[] sample2) + throws IllegalArgumentException; + /** + * Computes a 2-sample t statistic, comparing the means of the datasets + * described by two {@link StatisticalSummary} instances, without the + * assumption of equal subpopulation variances. Use + * {@link #homoscedasticT(StatisticalSummary, StatisticalSummary)} to + * compute a t-statistic under the equal variances assumption. 
+ * <p> + * This statistic can be used to perform a two-sample t-test to compare + * sample means.</p> + * <p> + * The returned t-statistic is</p> + * <p> + * <code> t = (m1 - m2) / sqrt(var1/n1 + var2/n2)</code> + * </p><p> + * where <strong><code>n1</code></strong> is the size of the first sample; + * <strong><code> n2</code></strong> is the size of the second sample; + * <strong><code> m1</code></strong> is the mean of the first sample; + * <strong><code> m2</code></strong> is the mean of the second sample; + * <strong><code> var1</code></strong> is the variance of the first sample; + * <strong><code> var2</code></strong> is the variance of the second sample. + * </p><p> + * <strong>Preconditions</strong>: <ul> + * <li>The datasets described by the two StatisticalSummary instances must each contain + * at least 2 observations. + * </li></ul></p> + * + * @param sampleStats1 StatisticalSummary describing data from the first sample + * @param sampleStats2 StatisticalSummary describing data from the second sample + * @return t statistic + * @throws IllegalArgumentException if the precondition is not met + */ + double t( + StatisticalSummary sampleStats1, + StatisticalSummary sampleStats2) + throws IllegalArgumentException; + /** + * Computes a 2-sample t statistic, comparing the means of the datasets + * described by two {@link StatisticalSummary} instances, under the + * assumption of equal subpopulation variances. To compute a t-statistic + * without the equal variances assumption, use + * {@link #t(StatisticalSummary, StatisticalSummary)}. 
+ * <p> + * This statistic can be used to perform a (homoscedastic) two-sample + * t-test to compare sample means.</p> + * <p> + * The t-statistic returned is</p> + * <p> + * <code> t = (m1 - m2) / (sqrt(1/n1 +1/n2) sqrt(var))</code> + * </p><p> + * where <strong><code>n1</code></strong> is the size of first sample; + * <strong><code> n2</code></strong> is the size of second sample; + * <strong><code> m1</code></strong> is the mean of first sample; + * <strong><code> m2</code></strong> is the mean of second sample; + * and <strong><code>var</code></strong> is the pooled variance estimate: + * </p><p> + * <code>var = ((n1 - 1)var1 + (n2 - 1)var2) / ((n1-1) + (n2-1))</code> + * </p><p> + * with <strong><code>var1</code></strong> the variance of the first sample and + * <strong><code>var2</code></strong> the variance of the second sample. + * </p><p> + * <strong>Preconditions</strong>: <ul> + * <li>The datasets described by the two StatisticalSummary instances must each contain + * at least 2 observations. + * </li></ul></p> + * + * @param sampleStats1 StatisticalSummary describing data from the first sample + * @param sampleStats2 StatisticalSummary describing data from the second sample + * @return t statistic + * @throws IllegalArgumentException if the precondition is not met + */ + double homoscedasticT( + StatisticalSummary sampleStats1, + StatisticalSummary sampleStats2) + throws IllegalArgumentException; + /** + * Returns the <i>observed significance level</i>, or + * <i>p-value</i>, associated with a one-sample, two-tailed t-test + * comparing the mean of the input array with the constant <code>mu</code>. + * <p> + * The number returned is the smallest significance level + * at which one can reject the null hypothesis that the mean equals + * <code>mu</code> in favor of the two-sided alternative that the mean + * is different from <code>mu</code>. 
For a one-sided test, divide the + * returned value by 2.</p> + * <p> + * <strong>Usage Note:</strong><br> + * The validity of the test depends on the assumptions of the parametric + * t-test procedure, as discussed + * <a href="http://www.basic.nwu.edu/statguidefiles/ttest_unpaired_ass_viol.html">here</a> + * </p><p> + * <strong>Preconditions</strong>: <ul> + * <li>The observed array length must be at least 2. + * </li></ul></p> + * + * @param mu constant value to compare sample mean against + * @param sample array of sample data values + * @return p-value + * @throws IllegalArgumentException if the precondition is not met + * @throws MathException if an error occurs computing the p-value + */ + double tTest(double mu, double[] sample) + throws IllegalArgumentException, MathException; + /** + * Performs a <a href="http://www.itl.nist.gov/div898/handbook/eda/section3/eda353.htm"> + * two-sided t-test</a> evaluating the null hypothesis that the mean of the population from + * which <code>sample</code> is drawn equals <code>mu</code>. + * <p> + * Returns <code>true</code> iff the null hypothesis can be + * rejected with confidence <code>1 - alpha</code>. 
To + * perform a 1-sided test, use <code>alpha * 2</code></p> + * <p> + * <strong>Examples:</strong><br><ol> + * <li>To test the (2-sided) hypothesis <code>sample mean = mu </code> at + * the 95% level, use <br><code>tTest(mu, sample, 0.05) </code> + * </li> + * <li>To test the (one-sided) hypothesis <code> sample mean < mu </code> + * at the 99% level, first verify that the measured sample mean is less + * than <code>mu</code> and then use + * <br><code>tTest(mu, sample, 0.02) </code> + * </li></ol></p> + * <p> + * <strong>Usage Note:</strong><br> + * The validity of the test depends on the assumptions of the one-sample + * parametric t-test procedure, as discussed + * <a href="http://www.basic.nwu.edu/statguidefiles/sg_glos.html#one-sample">here</a> + * </p><p> + * <strong>Preconditions</strong>: <ul> + * <li>The observed array length must be at least 2. + * </li></ul></p> + * + * @param mu constant value to compare sample mean against + * @param sample array of sample data values + * @param alpha significance level of the test + * @return true if the null hypothesis can be rejected with confidence 1 - alpha + * @throws IllegalArgumentException if the precondition is not met + * @throws MathException if an error occurs performing the test + */ + boolean tTest(double mu, double[] sample, double alpha) + throws IllegalArgumentException, MathException; + /** + * Returns the <i>observed significance level</i>, or + * <i>p-value</i>, associated with a one-sample, two-tailed t-test + * comparing the mean of the dataset described by <code>sampleStats</code> + * with the constant <code>mu</code>. + * <p> + * The number returned is the smallest significance level + * at which one can reject the null hypothesis that the mean equals + * <code>mu</code> in favor of the two-sided alternative that the mean + * is different from <code>mu</code>. 
For a one-sided test, divide the + * returned value by 2.</p> + * <p> + * <strong>Usage Note:</strong><br> + * The validity of the test depends on the assumptions of the parametric + * t-test procedure, as discussed + * <a href="http://www.basic.nwu.edu/statguidefiles/ttest_unpaired_ass_viol.html"> + * here</a></p> + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>The sample must contain at least 2 observations. + * </li></ul></p> + * + * @param mu constant value to compare sample mean against + * @param sampleStats StatisticalSummary describing sample data + * @return p-value + * @throws IllegalArgumentException if the precondition is not met + * @throws MathException if an error occurs computing the p-value + */ + double tTest(double mu, StatisticalSummary sampleStats) + throws IllegalArgumentException, MathException; + /** + * Performs a <a href="http://www.itl.nist.gov/div898/handbook/eda/section3/eda353.htm"> + * two-sided t-test</a> evaluating the null hypothesis that the mean of the + * population from which the dataset described by <code>stats</code> is + * drawn equals <code>mu</code>. + * <p> + * Returns <code>true</code> iff the null hypothesis can be rejected with + * confidence <code>1 - alpha</code>. 
To perform a 1-sided test, use + * <code>alpha * 2.</code></p> + * <p> + * <strong>Examples:</strong><br><ol> + * <li>To test the (2-sided) hypothesis <code>sample mean = mu </code> at + * the 95% level, use <br><code>tTest(mu, sampleStats, 0.05) </code> + * </li> + * <li>To test the (one-sided) hypothesis <code> sample mean < mu </code> + * at the 99% level, first verify that the measured sample mean is less + * than <code>mu</code> and then use + * <br><code>tTest(mu, sampleStats, 0.02) </code> + * </li></ol></p> + * <p> + * <strong>Usage Note:</strong><br> + * The validity of the test depends on the assumptions of the one-sample + * parametric t-test procedure, as discussed + * <a href="http://www.basic.nwu.edu/statguidefiles/sg_glos.html#one-sample">here</a> + * </p><p> + * <strong>Preconditions</strong>: <ul> + * <li>The sample must include at least 2 observations. + * </li></ul></p> + * + * @param mu constant value to compare sample mean against + * @param sampleStats StatisticalSummary describing sample data values + * @param alpha significance level of the test + * @return true if the null hypothesis can be rejected with confidence 1 - alpha + * @throws IllegalArgumentException if the precondition is not met + * @throws MathException if an error occurs performing the test + */ + boolean tTest( + double mu, + StatisticalSummary sampleStats, + double alpha) + throws IllegalArgumentException, MathException; + /** + * Returns the <i>observed significance level</i>, or + * <i>p-value</i>, associated with a two-sample, two-tailed t-test + * comparing the means of the input arrays. + * <p> + * The number returned is the smallest significance level + * at which one can reject the null hypothesis that the two means are + * equal in favor of the two-sided alternative that they are different. 
+ * For a one-sided test, divide the returned value by 2.</p> + * <p> + * The test does not assume that the underlying population variances are + * equal and it uses approximated degrees of freedom computed from the + * sample data to compute the p-value. The t-statistic used is as defined in + * {@link #t(double[], double[])} and the Welch-Satterthwaite approximation + * to the degrees of freedom is used, + * as described + * <a href="http://www.itl.nist.gov/div898/handbook/prc/section3/prc31.htm"> + * here.</a> To perform the test under the assumption of equal subpopulation + * variances, use {@link #homoscedasticTTest(double[], double[])}.</p> + * <p> + * <strong>Usage Note:</strong><br> + * The validity of the p-value depends on the assumptions of the parametric + * t-test procedure, as discussed + * <a href="http://www.basic.nwu.edu/statguidefiles/ttest_unpaired_ass_viol.html"> + * here</a></p> + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>The observed array lengths must both be at least 2. + * </li></ul></p> + * + * @param sample1 array of sample data values + * @param sample2 array of sample data values + * @return p-value for t-test + * @throws IllegalArgumentException if the precondition is not met + * @throws MathException if an error occurs computing the p-value + */ + double tTest(double[] sample1, double[] sample2) + throws IllegalArgumentException, MathException; + /** + * Returns the <i>observed significance level</i>, or + * <i>p-value</i>, associated with a two-sample, two-tailed t-test + * comparing the means of the input arrays, under the assumption that + * the two samples are drawn from subpopulations with equal variances. 
+ * To perform the test without the equal variances assumption, use + * {@link #tTest(double[], double[])}.</p> + * <p> + * The number returned is the smallest significance level + * at which one can reject the null hypothesis that the two means are + * equal in favor of the two-sided alternative that they are different. + * For a one-sided test, divide the returned value by 2.</p> + * <p> + * A pooled variance estimate is used to compute the t-statistic. See + * {@link #homoscedasticT(double[], double[])}. The sum of the sample sizes + * minus 2 is used as the degrees of freedom.</p> + * <p> + * <strong>Usage Note:</strong><br> + * The validity of the p-value depends on the assumptions of the parametric + * t-test procedure, as discussed + * <a href="http://www.basic.nwu.edu/statguidefiles/ttest_unpaired_ass_viol.html"> + * here</a></p> + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>The observed array lengths must both be at least 2. + * </li></ul></p> + * + * @param sample1 array of sample data values + * @param sample2 array of sample data values + * @return p-value for t-test + * @throws IllegalArgumentException if the precondition is not met + * @throws MathException if an error occurs computing the p-value + */ + double homoscedasticTTest( + double[] sample1, + double[] sample2) + throws IllegalArgumentException, MathException; + /** + * Performs a + * <a href="http://www.itl.nist.gov/div898/handbook/eda/section3/eda353.htm"> + * two-sided t-test</a> evaluating the null hypothesis that <code>sample1</code> + * and <code>sample2</code> are drawn from populations with the same mean, + * with significance level <code>alpha</code>. This test does not assume + * that the subpopulation variances are equal. To perform the test assuming + * equal variances, use + * {@link #homoscedasticTTest(double[], double[], double)}. 
+ * <p> + * Returns <code>true</code> iff the null hypothesis that the means are + * equal can be rejected with confidence <code>1 - alpha</code>. To + * perform a 1-sided test, use <code>alpha * 2</code></p> + * <p> + * See {@link #t(double[], double[])} for the formula used to compute the + * t-statistic. Degrees of freedom are approximated using the + * <a href="http://www.itl.nist.gov/div898/handbook/prc/section3/prc31.htm"> + * Welch-Satterthwaite approximation.</a></p> + * <p> + * <strong>Examples:</strong><br><ol> + * <li>To test the (2-sided) hypothesis <code>mean 1 = mean 2 </code> at + * the 95% level, use + * <br><code>tTest(sample1, sample2, 0.05). </code> + * </li> + * <li>To test the (one-sided) hypothesis <code> mean 1 < mean 2 </code>, + * at the 99% level, first verify that the measured mean of <code>sample 1</code> + * is less than the mean of <code>sample 2</code> and then use + * <br><code>tTest(sample1, sample2, 0.02) </code> + * </li></ol></p> + * <p> + * <strong>Usage Note:</strong><br> + * The validity of the test depends on the assumptions of the parametric + * t-test procedure, as discussed + * <a href="http://www.basic.nwu.edu/statguidefiles/ttest_unpaired_ass_viol.html"> + * here</a></p> + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>The observed array lengths must both be at least 2. 
+ * </li> + * <li> <code> 0 < alpha < 0.5 </code> + * </li></ul></p> + * + * @param sample1 array of sample data values + * @param sample2 array of sample data values + * @param alpha significance level of the test + * @return true if the null hypothesis can be rejected with + * confidence 1 - alpha + * @throws IllegalArgumentException if the preconditions are not met + * @throws MathException if an error occurs performing the test + */ + boolean tTest( + double[] sample1, + double[] sample2, + double alpha) + throws IllegalArgumentException, MathException; + /** + * Performs a + * <a href="http://www.itl.nist.gov/div898/handbook/eda/section3/eda353.htm"> + * two-sided t-test</a> evaluating the null hypothesis that <code>sample1</code> + * and <code>sample2</code> are drawn from populations with the same mean, + * with significance level <code>alpha</code>, assuming that the + * subpopulation variances are equal. Use + * {@link #tTest(double[], double[], double)} to perform the test without + * the assumption of equal variances. + * <p> + * Returns <code>true</code> iff the null hypothesis that the means are + * equal can be rejected with confidence <code>1 - alpha</code>. To + * perform a 1-sided test, use <code>alpha * 2.</code> To perform the test + * without the assumption of equal subpopulation variances, use + * {@link #tTest(double[], double[], double)}.</p> + * <p> + * A pooled variance estimate is used to compute the t-statistic. See + * {@link #t(double[], double[])} for the formula. The sum of the sample + * sizes minus 2 is used as the degrees of freedom.</p> + * <p> + * <strong>Examples:</strong><br><ol> + * <li>To test the (2-sided) hypothesis <code>mean 1 = mean 2 </code> at + * the 95% level, use <br><code>tTest(sample1, sample2, 0.05). 
</code> + * </li> + * <li>To test the (one-sided) hypothesis <code> mean 1 < mean 2, </code> + * at the 99% level, first verify that the measured mean of + * <code>sample 1</code> is less than the mean of <code>sample 2</code> + * and then use + * <br><code>tTest(sample1, sample2, 0.02) </code> + * </li></ol></p> + * <p> + * <strong>Usage Note:</strong><br> + * The validity of the test depends on the assumptions of the parametric + * t-test procedure, as discussed + * <a href="http://www.basic.nwu.edu/statguidefiles/ttest_unpaired_ass_viol.html"> + * here</a></p> + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>The observed array lengths must both be at least 2. + * </li> + * <li> <code> 0 < alpha < 0.5 </code> + * </li></ul></p> + * + * @param sample1 array of sample data values + * @param sample2 array of sample data values + * @param alpha significance level of the test + * @return true if the null hypothesis can be rejected with + * confidence 1 - alpha + * @throws IllegalArgumentException if the preconditions are not met + * @throws MathException if an error occurs performing the test + */ + boolean homoscedasticTTest( + double[] sample1, + double[] sample2, + double alpha) + throws IllegalArgumentException, MathException; + /** + * Returns the <i>observed significance level</i>, or + * <i>p-value</i>, associated with a two-sample, two-tailed t-test + * comparing the means of the datasets described by two StatisticalSummary + * instances. + * <p> + * The number returned is the smallest significance level + * at which one can reject the null hypothesis that the two means are + * equal in favor of the two-sided alternative that they are different. + * For a one-sided test, divide the returned value by 2.</p> + * <p> + * The test does not assume that the underlying population variances are + * equal and it uses approximated degrees of freedom computed from the + * sample data to compute the p-value. 
To perform the test assuming + * equal variances, use + * {@link #homoscedasticTTest(StatisticalSummary, StatisticalSummary)}.</p> + * <p> + * <strong>Usage Note:</strong><br> + * The validity of the p-value depends on the assumptions of the parametric + * t-test procedure, as discussed + * <a href="http://www.basic.nwu.edu/statguidefiles/ttest_unpaired_ass_viol.html"> + * here</a></p> + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>The datasets described by the two Univariates must each contain + * at least 2 observations. + * </li></ul></p> + * + * @param sampleStats1 StatisticalSummary describing data from the first sample + * @param sampleStats2 StatisticalSummary describing data from the second sample + * @return p-value for t-test + * @throws IllegalArgumentException if the precondition is not met + * @throws MathException if an error occurs computing the p-value + */ + double tTest( + StatisticalSummary sampleStats1, + StatisticalSummary sampleStats2) + throws IllegalArgumentException, MathException; + /** + * Returns the <i>observed significance level</i>, or + * <i>p-value</i>, associated with a two-sample, two-tailed t-test + * comparing the means of the datasets described by two StatisticalSummary + * instances, under the hypothesis of equal subpopulation variances. To + * perform a test without the equal variances assumption, use + * {@link #tTest(StatisticalSummary, StatisticalSummary)}. + * <p> + * The number returned is the smallest significance level + * at which one can reject the null hypothesis that the two means are + * equal in favor of the two-sided alternative that they are different. + * For a one-sided test, divide the returned value by 2.</p> + * <p> + * See {@link #homoscedasticT(double[], double[])} for the formula used to + * compute the t-statistic. 
The sum of the sample sizes minus 2 is used as + * the degrees of freedom.</p> + * <p> + * <strong>Usage Note:</strong><br> + * The validity of the p-value depends on the assumptions of the parametric + * t-test procedure, as discussed + * <a href="http://www.basic.nwu.edu/statguidefiles/ttest_unpaired_ass_viol.html">here</a> + * </p><p> + * <strong>Preconditions</strong>: <ul> + * <li>The datasets described by the two Univariates must each contain + * at least 2 observations. + * </li></ul></p> + * + * @param sampleStats1 StatisticalSummary describing data from the first sample + * @param sampleStats2 StatisticalSummary describing data from the second sample + * @return p-value for t-test + * @throws IllegalArgumentException if the precondition is not met + * @throws MathException if an error occurs computing the p-value + */ + double homoscedasticTTest( + StatisticalSummary sampleStats1, + StatisticalSummary sampleStats2) + throws IllegalArgumentException, MathException; + /** + * Performs a + * <a href="http://www.itl.nist.gov/div898/handbook/eda/section3/eda353.htm"> + * two-sided t-test</a> evaluating the null hypothesis that + * <code>sampleStats1</code> and <code>sampleStats2</code> describe + * datasets drawn from populations with the same mean, with significance + * level <code>alpha</code>. This test does not assume that the + * subpopulation variances are equal. To perform the test under the equal + * variances assumption, use + * {@link #homoscedasticTTest(StatisticalSummary, StatisticalSummary)}. + * <p> + * Returns <code>true</code> iff the null hypothesis that the means are + * equal can be rejected with confidence <code>1 - alpha</code>. To + * perform a 1-sided test, use <code>alpha * 2</code></p> + * <p> + * See {@link #t(double[], double[])} for the formula used to compute the + * t-statistic. 
Degrees of freedom are approximated using the + * <a href="http://www.itl.nist.gov/div898/handbook/prc/section3/prc31.htm"> + * Welch-Satterthwaite approximation.</a></p> + * <p> + * <strong>Examples:</strong><br><ol> + * <li>To test the (2-sided) hypothesis <code>mean 1 = mean 2 </code> at + * the 95% level, use + * <br><code>tTest(sampleStats1, sampleStats2, 0.05) </code> + * </li> + * <li>To test the (one-sided) hypothesis <code> mean 1 &lt; mean 2 </code> + * at the 99% level, first verify that the measured mean of + * <code>sample 1</code> is less than the mean of <code>sample 2</code> + * and then use + * <br><code>tTest(sampleStats1, sampleStats2, 0.02) </code> + * </li></ol></p> + * <p> + * <strong>Usage Note:</strong><br> + * The validity of the test depends on the assumptions of the parametric + * t-test procedure, as discussed + * <a href="http://www.basic.nwu.edu/statguidefiles/ttest_unpaired_ass_viol.html"> + * here</a></p> + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>The datasets described by the two Univariates must each contain + * at least 2 observations. 
+ * </li> + * <li> <code> 0 < alpha < 0.5 </code> + * </li></ul></p> + * + * @param sampleStats1 StatisticalSummary describing sample data values + * @param sampleStats2 StatisticalSummary describing sample data values + * @param alpha significance level of the test + * @return true if the null hypothesis can be rejected with + * confidence 1 - alpha + * @throws IllegalArgumentException if the preconditions are not met + * @throws MathException if an error occurs performing the test + */ + boolean tTest( + StatisticalSummary sampleStats1, + StatisticalSummary sampleStats2, + double alpha) + throws IllegalArgumentException, MathException; +} diff --git a/src/main/java/org/apache/commons/math/stat/inference/TTestImpl.java b/src/main/java/org/apache/commons/math/stat/inference/TTestImpl.java new file mode 100644 index 0000000..d4d1a12 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/inference/TTestImpl.java @@ -0,0 +1,1069 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.math.stat.inference; + +import org.apache.commons.math.MathException; +import org.apache.commons.math.MathRuntimeException; +import org.apache.commons.math.distribution.TDistribution; +import org.apache.commons.math.distribution.TDistributionImpl; +import org.apache.commons.math.exception.util.LocalizedFormats; +import org.apache.commons.math.stat.StatUtils; +import org.apache.commons.math.stat.descriptive.StatisticalSummary; +import org.apache.commons.math.util.FastMath; + +/** + * Implements t-test statistics defined in the {@link TTest} interface. + * <p> + * Uses commons-math {@link org.apache.commons.math.distribution.TDistributionImpl} + * implementation to estimate exact p-values.</p> + * + * @version $Revision: 1042336 $ $Date: 2010-12-05 13:40:48 +0100 (dim. 05 déc. 2010) $ + */ +public class TTestImpl implements TTest { + + /** Distribution used to compute inference statistics. + * @deprecated in 2.2 (to be removed in 3.0). + */ + @Deprecated + private TDistribution distribution; + + /** + * Default constructor. + */ + public TTestImpl() { + this(new TDistributionImpl(1.0)); + } + + /** + * Create a test instance using the given distribution for computing + * inference statistics. + * @param t distribution used to compute inference statistics. + * @since 1.2 + * @deprecated in 2.2 (to be removed in 3.0). + */ + @Deprecated + public TTestImpl(TDistribution t) { + super(); + setDistribution(t); + } + + /** + * Computes a paired, 2-sample t-statistic based on the data in the input + * arrays. 
The t-statistic returned is equivalent to what would be returned by + * computing the one-sample t-statistic {@link #t(double, double[])}, with + * <code>mu = 0</code> and the sample array consisting of the (signed) + * differences between corresponding entries in <code>sample1</code> and + * <code>sample2.</code> + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>The input arrays must have the same length and their common length + * must be at least 2. + * </li></ul></p> + * + * @param sample1 array of sample data values + * @param sample2 array of sample data values + * @return t statistic + * @throws IllegalArgumentException if the precondition is not met + * @throws MathException if the statistic can not be computed do to a + * convergence or other numerical error. + */ + public double pairedT(double[] sample1, double[] sample2) + throws IllegalArgumentException, MathException { + checkSampleData(sample1); + checkSampleData(sample2); + double meanDifference = StatUtils.meanDifference(sample1, sample2); + return t(meanDifference, 0, + StatUtils.varianceDifference(sample1, sample2, meanDifference), + sample1.length); + } + + /** + * Returns the <i>observed significance level</i>, or + * <i> p-value</i>, associated with a paired, two-sample, two-tailed t-test + * based on the data in the input arrays. + * <p> + * The number returned is the smallest significance level + * at which one can reject the null hypothesis that the mean of the paired + * differences is 0 in favor of the two-sided alternative that the mean paired + * difference is not equal to 0. 
For a one-sided test, divide the returned + * value by 2.</p> + * <p> + * This test is equivalent to a one-sample t-test computed using + * {@link #tTest(double, double[])} with <code>mu = 0</code> and the sample + * array consisting of the signed differences between corresponding elements of + * <code>sample1</code> and <code>sample2.</code></p> + * <p> + * <strong>Usage Note:</strong><br> + * The validity of the p-value depends on the assumptions of the parametric + * t-test procedure, as discussed + * <a href="http://www.basic.nwu.edu/statguidefiles/ttest_unpaired_ass_viol.html"> + * here</a></p> + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>The input array lengths must be the same and their common length must + * be at least 2. + * </li></ul></p> + * + * @param sample1 array of sample data values + * @param sample2 array of sample data values + * @return p-value for t-test + * @throws IllegalArgumentException if the precondition is not met + * @throws MathException if an error occurs computing the p-value + */ + public double pairedTTest(double[] sample1, double[] sample2) + throws IllegalArgumentException, MathException { + double meanDifference = StatUtils.meanDifference(sample1, sample2); + return tTest(meanDifference, 0, + StatUtils.varianceDifference(sample1, sample2, meanDifference), + sample1.length); + } + + /** + * Performs a paired t-test evaluating the null hypothesis that the + * mean of the paired differences between <code>sample1</code> and + * <code>sample2</code> is 0 in favor of the two-sided alternative that the + * mean paired difference is not equal to 0, with significance level + * <code>alpha</code>. + * <p> + * Returns <code>true</code> iff the null hypothesis can be rejected with + * confidence <code>1 - alpha</code>. 
To perform a 1-sided test, use + * <code>alpha * 2</code></p> + * <p> + * <strong>Usage Note:</strong><br> + * The validity of the test depends on the assumptions of the parametric + * t-test procedure, as discussed + * <a href="http://www.basic.nwu.edu/statguidefiles/ttest_unpaired_ass_viol.html"> + * here</a></p> + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>The input array lengths must be the same and their common length + * must be at least 2. + * </li> + * <li> <code> 0 < alpha < 0.5 </code> + * </li></ul></p> + * + * @param sample1 array of sample data values + * @param sample2 array of sample data values + * @param alpha significance level of the test + * @return true if the null hypothesis can be rejected with + * confidence 1 - alpha + * @throws IllegalArgumentException if the preconditions are not met + * @throws MathException if an error occurs performing the test + */ + public boolean pairedTTest(double[] sample1, double[] sample2, double alpha) + throws IllegalArgumentException, MathException { + checkSignificanceLevel(alpha); + return pairedTTest(sample1, sample2) < alpha; + } + + /** + * Computes a <a href="http://www.itl.nist.gov/div898/handbook/prc/section2/prc22.htm#formula"> + * t statistic </a> given observed values and a comparison constant. + * <p> + * This statistic can be used to perform a one sample t-test for the mean. + * </p><p> + * <strong>Preconditions</strong>: <ul> + * <li>The observed array length must be at least 2. 
+ * </li></ul></p> + * + * @param mu comparison constant + * @param observed array of values + * @return t statistic + * @throws IllegalArgumentException if input array length is less than 2 + */ + public double t(double mu, double[] observed) + throws IllegalArgumentException { + checkSampleData(observed); + return t(StatUtils.mean(observed), mu, StatUtils.variance(observed), + observed.length); + } + + /** + * Computes a <a href="http://www.itl.nist.gov/div898/handbook/prc/section2/prc22.htm#formula"> + * t statistic </a> to use in comparing the mean of the dataset described by + * <code>sampleStats</code> to <code>mu</code>. + * <p> + * This statistic can be used to perform a one sample t-test for the mean. + * </p><p> + * <strong>Preconditions</strong>: <ul> + * <li><code>sampleStats.getN() &gt;= 2</code>. + * </li></ul></p> + * + * @param mu comparison constant + * @param sampleStats StatisticalSummary holding sample summary statistics + * @return t statistic + * @throws IllegalArgumentException if the precondition is not met + */ + public double t(double mu, StatisticalSummary sampleStats) + throws IllegalArgumentException { + checkSampleData(sampleStats); + return t(sampleStats.getMean(), mu, sampleStats.getVariance(), + sampleStats.getN()); + } + + /** + * Computes a 2-sample t statistic, under the hypothesis of equal + * subpopulation variances. To compute a t-statistic without the + * equal variances hypothesis, use {@link #t(double[], double[])}. 
+ * <p> + * This statistic can be used to perform a (homoscedastic) two-sample + * t-test to compare sample means.</p> + * <p> + * The t-statistic is</p> + * <p> + * <code> t = (m1 - m2) / (sqrt(1/n1 +1/n2) sqrt(var))</code> + * </p><p> + * where <strong><code>n1</code></strong> is the size of first sample; + * <strong><code> n2</code></strong> is the size of second sample; + * <strong><code> m1</code></strong> is the mean of first sample; + * <strong><code> m2</code></strong> is the mean of second sample; + * and <strong><code>var</code></strong> is the pooled variance estimate: + * </p><p> + * <code>var = ((n1 - 1)var1 + (n2 - 1)var2) / ((n1-1) + (n2-1))</code> + * </p><p> + * with <strong><code>var1</code></strong> the variance of the first sample and + * <strong><code>var2</code></strong> the variance of the second sample. + * </p><p> + * <strong>Preconditions</strong>: <ul> + * <li>The observed array lengths must both be at least 2. + * </li></ul></p> + * + * @param sample1 array of sample data values + * @param sample2 array of sample data values + * @return t statistic + * @throws IllegalArgumentException if the precondition is not met + */ + public double homoscedasticT(double[] sample1, double[] sample2) + throws IllegalArgumentException { + checkSampleData(sample1); + checkSampleData(sample2); + return homoscedasticT(StatUtils.mean(sample1), StatUtils.mean(sample2), + StatUtils.variance(sample1), StatUtils.variance(sample2), + sample1.length, sample2.length); + } + + /** + * Computes a 2-sample t statistic, without the hypothesis of equal + * subpopulation variances. To compute a t-statistic assuming equal + * variances, use {@link #homoscedasticT(double[], double[])}. 
+ * <p> + * This statistic can be used to perform a two-sample t-test to compare + * sample means.</p> + * <p> + * The t-statistic is</p> + * <p> + * <code> t = (m1 - m2) / sqrt(var1/n1 + var2/n2)</code> + * </p><p> + * where <strong><code>n1</code></strong> is the size of the first sample; + * <strong><code> n2</code></strong> is the size of the second sample; + * <strong><code> m1</code></strong> is the mean of the first sample; + * <strong><code> m2</code></strong> is the mean of the second sample; + * <strong><code> var1</code></strong> is the variance of the first sample; + * <strong><code> var2</code></strong> is the variance of the second sample. + * </p><p> + * <strong>Preconditions</strong>: <ul> + * <li>The observed array lengths must both be at least 2. + * </li></ul></p> + * + * @param sample1 array of sample data values + * @param sample2 array of sample data values + * @return t statistic + * @throws IllegalArgumentException if the precondition is not met + */ + public double t(double[] sample1, double[] sample2) + throws IllegalArgumentException { + checkSampleData(sample1); + checkSampleData(sample2); + return t(StatUtils.mean(sample1), StatUtils.mean(sample2), + StatUtils.variance(sample1), StatUtils.variance(sample2), + sample1.length, sample2.length); + } + + /** + * Computes a 2-sample t statistic, comparing the means of the datasets + * described by two {@link StatisticalSummary} instances, without the + * assumption of equal subpopulation variances. Use + * {@link #homoscedasticT(StatisticalSummary, StatisticalSummary)} to + * compute a t-statistic under the equal variances assumption. 
+ * <p> + * This statistic can be used to perform a two-sample t-test to compare + * sample means.</p> + * <p> + * The returned t-statisitc is</p> + * <p> + * <code> t = (m1 - m2) / sqrt(var1/n1 + var2/n2)</code> + * </p><p> + * where <strong><code>n1</code></strong> is the size of the first sample; + * <strong><code> n2</code></strong> is the size of the second sample; + * <strong><code> m1</code></strong> is the mean of the first sample; + * <strong><code> m2</code></strong> is the mean of the second sample + * <strong><code> var1</code></strong> is the variance of the first sample; + * <strong><code> var2</code></strong> is the variance of the second sample + * </p><p> + * <strong>Preconditions</strong>: <ul> + * <li>The datasets described by the two Univariates must each contain + * at least 2 observations. + * </li></ul></p> + * + * @param sampleStats1 StatisticalSummary describing data from the first sample + * @param sampleStats2 StatisticalSummary describing data from the second sample + * @return t statistic + * @throws IllegalArgumentException if the precondition is not met + */ + public double t(StatisticalSummary sampleStats1, + StatisticalSummary sampleStats2) + throws IllegalArgumentException { + checkSampleData(sampleStats1); + checkSampleData(sampleStats2); + return t(sampleStats1.getMean(), sampleStats2.getMean(), + sampleStats1.getVariance(), sampleStats2.getVariance(), + sampleStats1.getN(), sampleStats2.getN()); + } + + /** + * Computes a 2-sample t statistic, comparing the means of the datasets + * described by two {@link StatisticalSummary} instances, under the + * assumption of equal subpopulation variances. To compute a t-statistic + * without the equal variances assumption, use + * {@link #t(StatisticalSummary, StatisticalSummary)}. 
+ * <p> + * This statistic can be used to perform a (homoscedastic) two-sample + * t-test to compare sample means.</p> + * <p> + * The t-statistic returned is</p> + * <p> + * <code> t = (m1 - m2) / (sqrt(1/n1 +1/n2) sqrt(var))</code> + * </p><p> + * where <strong><code>n1</code></strong> is the size of first sample; + * <strong><code> n2</code></strong> is the size of second sample; + * <strong><code> m1</code></strong> is the mean of first sample; + * <strong><code> m2</code></strong> is the mean of second sample + * and <strong><code>var</code></strong> is the pooled variance estimate: + * </p><p> + * <code>var = ((n1 - 1)var1 + (n2 - 1)var2) / ((n1-1) + (n2-1))</code> + * </p><p> + * with <strong><code>var1</code></strong> the variance of the first sample and + * <strong><code>var2</code></strong> the variance of the second sample. + * </p><p> + * <strong>Preconditions</strong>: <ul> + * <li>The datasets described by the two StatisticalSummary instances must each contain + * at least 2 observations. + * </li></ul></p> + * + * @param sampleStats1 StatisticalSummary describing data from the first sample + * @param sampleStats2 StatisticalSummary describing data from the second sample + * @return t statistic + * @throws IllegalArgumentException if the precondition is not met + */ + public double homoscedasticT(StatisticalSummary sampleStats1, + StatisticalSummary sampleStats2) + throws IllegalArgumentException { + checkSampleData(sampleStats1); + checkSampleData(sampleStats2); + return homoscedasticT(sampleStats1.getMean(), sampleStats2.getMean(), + sampleStats1.getVariance(), sampleStats2.getVariance(), + sampleStats1.getN(), sampleStats2.getN()); + } + + /** + * Returns the <i>observed significance level</i>, or + * <i>p-value</i>, associated with a one-sample, two-tailed t-test + * comparing the mean of the input array with the constant <code>mu</code>. 
+ * <p> + * The number returned is the smallest significance level + * at which one can reject the null hypothesis that the mean equals + * <code>mu</code> in favor of the two-sided alternative that the mean + * is different from <code>mu</code>. For a one-sided test, divide the + * returned value by 2.</p> + * <p> + * <strong>Usage Note:</strong><br> + * The validity of the test depends on the assumptions of the parametric + * t-test procedure, as discussed + * <a href="http://www.basic.nwu.edu/statguidefiles/ttest_unpaired_ass_viol.html">here</a> + * </p><p> + * <strong>Preconditions</strong>: <ul> + * <li>The observed array length must be at least 2. + * </li></ul></p> + * + * @param mu constant value to compare sample mean against + * @param sample array of sample data values + * @return p-value + * @throws IllegalArgumentException if the precondition is not met + * @throws MathException if an error occurs computing the p-value + */ + public double tTest(double mu, double[] sample) + throws IllegalArgumentException, MathException { + checkSampleData(sample); + return tTest( StatUtils.mean(sample), mu, StatUtils.variance(sample), + sample.length); + } + + /** + * Performs a <a href="http://www.itl.nist.gov/div898/handbook/eda/section3/eda353.htm"> + * two-sided t-test</a> evaluating the null hypothesis that the mean of the population from + * which <code>sample</code> is drawn equals <code>mu</code>. + * <p> + * Returns <code>true</code> iff the null hypothesis can be + * rejected with confidence <code>1 - alpha</code>. 
To + * perform a 1-sided test, use <code>alpha * 2</code> + * </p><p> + * <strong>Examples:</strong><br><ol> + * <li>To test the (2-sided) hypothesis <code>sample mean = mu </code> at + * the 95% level, use <br><code>tTest(mu, sample, 0.05) </code> + * </li> + * <li>To test the (one-sided) hypothesis <code> sample mean &lt; mu </code> + * at the 99% level, first verify that the measured sample mean is less + * than <code>mu</code> and then use + * <br><code>tTest(mu, sample, 0.02) </code> + * </li></ol></p> + * <p> + * <strong>Usage Note:</strong><br> + * The validity of the test depends on the assumptions of the one-sample + * parametric t-test procedure, as discussed + * <a href="http://www.basic.nwu.edu/statguidefiles/sg_glos.html#one-sample">here</a> + * </p><p> + * <strong>Preconditions</strong>: <ul> + * <li>The observed array length must be at least 2. + * </li></ul></p> + * + * @param mu constant value to compare sample mean against + * @param sample array of sample data values + * @param alpha significance level of the test + * @return true if the null hypothesis can be rejected with + * confidence 1 - alpha + * @throws IllegalArgumentException if the precondition is not met + * @throws MathException if an error occurs computing the p-value + */ + public boolean tTest(double mu, double[] sample, double alpha) + throws IllegalArgumentException, MathException { + checkSignificanceLevel(alpha); + return tTest(mu, sample) < alpha; + } + + /** + * Returns the <i>observed significance level</i>, or + * <i>p-value</i>, associated with a one-sample, two-tailed t-test + * comparing the mean of the dataset described by <code>sampleStats</code> + * with the constant <code>mu</code>. + * <p> + * The number returned is the smallest significance level + * at which one can reject the null hypothesis that the mean equals + * <code>mu</code> in favor of the two-sided alternative that the mean + * is different from <code>mu</code>. 
For a one-sided test, divide the + * returned value by 2.</p> + * <p> + * <strong>Usage Note:</strong><br> + * The validity of the test depends on the assumptions of the parametric + * t-test procedure, as discussed + * <a href="http://www.basic.nwu.edu/statguidefiles/ttest_unpaired_ass_viol.html"> + * here</a></p> + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>The sample must contain at least 2 observations. + * </li></ul></p> + * + * @param mu constant value to compare sample mean against + * @param sampleStats StatisticalSummary describing sample data + * @return p-value + * @throws IllegalArgumentException if the precondition is not met + * @throws MathException if an error occurs computing the p-value + */ + public double tTest(double mu, StatisticalSummary sampleStats) + throws IllegalArgumentException, MathException { + checkSampleData(sampleStats); + return tTest(sampleStats.getMean(), mu, sampleStats.getVariance(), + sampleStats.getN()); + } + + /** + * Performs a <a href="http://www.itl.nist.gov/div898/handbook/eda/section3/eda353.htm"> + * two-sided t-test</a> evaluating the null hypothesis that the mean of the + * population from which the dataset described by <code>stats</code> is + * drawn equals <code>mu</code>. + * <p> + * Returns <code>true</code> iff the null hypothesis can be rejected with + * confidence <code>1 - alpha</code>. 
To perform a 1-sided test, use + * <code>alpha * 2</code>.</p> + * <p> + * <strong>Examples:</strong><br><ol> + * <li>To test the (2-sided) hypothesis <code>sample mean = mu </code> at + * the 95% level, use <br><code>tTest(mu, sampleStats, 0.05) </code> + * </li> + * <li>To test the (one-sided) hypothesis <code> sample mean &lt; mu </code> + * at the 99% level, first verify that the measured sample mean is less + * than <code>mu</code> and then use + * <br><code>tTest(mu, sampleStats, 0.02) </code> + * </li></ol></p> + * <p> + * <strong>Usage Note:</strong><br> + * The validity of the test depends on the assumptions of the one-sample + * parametric t-test procedure, as discussed + * <a href="http://www.basic.nwu.edu/statguidefiles/sg_glos.html#one-sample">here</a> + * </p><p> + * <strong>Preconditions</strong>: <ul> + * <li>The sample must include at least 2 observations. + * </li></ul></p> + * + * @param mu constant value to compare sample mean against + * @param sampleStats StatisticalSummary describing sample data values + * @param alpha significance level of the test + * @return true if the null hypothesis can be rejected with + * confidence 1 - alpha + * @throws IllegalArgumentException if the precondition is not met + * @throws MathException if an error occurs computing the p-value + */ + public boolean tTest( double mu, StatisticalSummary sampleStats, + double alpha) + throws IllegalArgumentException, MathException { + checkSignificanceLevel(alpha); + return tTest(mu, sampleStats) < alpha; + } + + /** + * Returns the <i>observed significance level</i>, or + * <i>p-value</i>, associated with a two-sample, two-tailed t-test + * comparing the means of the input arrays. + * <p> + * The number returned is the smallest significance level + * at which one can reject the null hypothesis that the two means are + * equal in favor of the two-sided alternative that they are different. 
+ * For a one-sided test, divide the returned value by 2.</p> + * <p> + * The test does not assume that the underlying popuation variances are + * equal and it uses approximated degrees of freedom computed from the + * sample data to compute the p-value. The t-statistic used is as defined in + * {@link #t(double[], double[])} and the Welch-Satterthwaite approximation + * to the degrees of freedom is used, + * as described + * <a href="http://www.itl.nist.gov/div898/handbook/prc/section3/prc31.htm"> + * here.</a> To perform the test under the assumption of equal subpopulation + * variances, use {@link #homoscedasticTTest(double[], double[])}.</p> + * <p> + * <strong>Usage Note:</strong><br> + * The validity of the p-value depends on the assumptions of the parametric + * t-test procedure, as discussed + * <a href="http://www.basic.nwu.edu/statguidefiles/ttest_unpaired_ass_viol.html"> + * here</a></p> + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>The observed array lengths must both be at least 2. + * </li></ul></p> + * + * @param sample1 array of sample data values + * @param sample2 array of sample data values + * @return p-value for t-test + * @throws IllegalArgumentException if the precondition is not met + * @throws MathException if an error occurs computing the p-value + */ + public double tTest(double[] sample1, double[] sample2) + throws IllegalArgumentException, MathException { + checkSampleData(sample1); + checkSampleData(sample2); + return tTest(StatUtils.mean(sample1), StatUtils.mean(sample2), + StatUtils.variance(sample1), StatUtils.variance(sample2), + sample1.length, sample2.length); + } + + /** + * Returns the <i>observed significance level</i>, or + * <i>p-value</i>, associated with a two-sample, two-tailed t-test + * comparing the means of the input arrays, under the assumption that + * the two samples are drawn from subpopulations with equal variances. 
+ * To perform the test without the equal variances assumption, use + * {@link #tTest(double[], double[])}. + * <p> + * The number returned is the smallest significance level + * at which one can reject the null hypothesis that the two means are + * equal in favor of the two-sided alternative that they are different. + * For a one-sided test, divide the returned value by 2.</p> + * <p> + * A pooled variance estimate is used to compute the t-statistic. See + * {@link #homoscedasticT(double[], double[])}. The sum of the sample sizes + * minus 2 is used as the degrees of freedom.</p> + * <p> + * <strong>Usage Note:</strong><br> + * The validity of the p-value depends on the assumptions of the parametric + * t-test procedure, as discussed + * <a href="http://www.basic.nwu.edu/statguidefiles/ttest_unpaired_ass_viol.html"> + * here</a></p> + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>The observed array lengths must both be at least 2. + * </li></ul></p> + * + * @param sample1 array of sample data values + * @param sample2 array of sample data values + * @return p-value for t-test + * @throws IllegalArgumentException if the precondition is not met + * @throws MathException if an error occurs computing the p-value + */ + public double homoscedasticTTest(double[] sample1, double[] sample2) + throws IllegalArgumentException, MathException { + checkSampleData(sample1); + checkSampleData(sample2); + return homoscedasticTTest(StatUtils.mean(sample1), + StatUtils.mean(sample2), StatUtils.variance(sample1), + StatUtils.variance(sample2), sample1.length, + sample2.length); + } + + + /** + * Performs a + * <a href="http://www.itl.nist.gov/div898/handbook/eda/section3/eda353.htm"> + * two-sided t-test</a> evaluating the null hypothesis that <code>sample1</code> + * and <code>sample2</code> are drawn from populations with the same mean, + * with significance level <code>alpha</code>. This test does not assume + * that the subpopulation variances are equal. 
To perform the test assuming + * equal variances, use + * {@link #homoscedasticTTest(double[], double[], double)}. + * <p> + * Returns <code>true</code> iff the null hypothesis that the means are + * equal can be rejected with confidence <code>1 - alpha</code>. To + * perform a 1-sided test, use <code>alpha * 2</code></p> + * <p> + * See {@link #t(double[], double[])} for the formula used to compute the + * t-statistic. Degrees of freedom are approximated using the + * <a href="http://www.itl.nist.gov/div898/handbook/prc/section3/prc31.htm"> + * Welch-Satterthwaite approximation.</a></p> + + * <p> + * <strong>Examples:</strong><br><ol> + * <li>To test the (2-sided) hypothesis <code>mean 1 = mean 2 </code> at + * the 95% level, use + * <br><code>tTest(sample1, sample2, 0.05). </code> + * </li> + * <li>To test the (one-sided) hypothesis <code> mean 1 < mean 2 </code> at + * the 99% level, first verify that the measured mean of <code>sample 1</code> + * is less than the mean of <code>sample 2</code> and then use + * <br><code>tTest(sample1, sample2, 0.02) </code> + * </li></ol></p> + * <p> + * <strong>Usage Note:</strong><br> + * The validity of the test depends on the assumptions of the parametric + * t-test procedure, as discussed + * <a href="http://www.basic.nwu.edu/statguidefiles/ttest_unpaired_ass_viol.html"> + * here</a></p> + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>The observed array lengths must both be at least 2. 
+ * </li> + * <li> <code> 0 < alpha < 0.5 </code> + * </li></ul></p> + * + * @param sample1 array of sample data values + * @param sample2 array of sample data values + * @param alpha significance level of the test + * @return true if the null hypothesis can be rejected with + * confidence 1 - alpha + * @throws IllegalArgumentException if the preconditions are not met + * @throws MathException if an error occurs performing the test + */ + public boolean tTest(double[] sample1, double[] sample2, + double alpha) + throws IllegalArgumentException, MathException { + checkSignificanceLevel(alpha); + return tTest(sample1, sample2) < alpha; + } + + /** + * Performs a + * <a href="http://www.itl.nist.gov/div898/handbook/eda/section3/eda353.htm"> + * two-sided t-test</a> evaluating the null hypothesis that <code>sample1</code> + * and <code>sample2</code> are drawn from populations with the same mean, + * with significance level <code>alpha</code>, assuming that the + * subpopulation variances are equal. Use + * {@link #tTest(double[], double[], double)} to perform the test without + * the assumption of equal variances. + * <p> + * Returns <code>true</code> iff the null hypothesis that the means are + * equal can be rejected with confidence <code>1 - alpha</code>. To + * perform a 1-sided test, use <code>alpha * 2.</code> To perform the test + * without the assumption of equal subpopulation variances, use + * {@link #tTest(double[], double[], double)}.</p> + * <p> + * A pooled variance estimate is used to compute the t-statistic. See + * {@link #homoscedasticT(double[], double[])} for the formula. The sum of the sample + * sizes minus 2 is used as the degrees of freedom.</p> + * <p> + * <strong>Examples:</strong><br><ol> + * <li>To test the (2-sided) hypothesis <code>mean 1 = mean 2 </code> at + * the 95% level, use <br><code>homoscedasticTTest(sample1, sample2, 0.05). 
</code> + * </li> + * <li>To test the (one-sided) hypothesis <code> mean 1 < mean 2, </code> + * at the 99% level, first verify that the measured mean of + * <code>sample 1</code> is less than the mean of <code>sample 2</code> + * and then use + * <br><code>homoscedasticTTest(sample1, sample2, 0.02) </code> + * </li></ol></p> + * <p> + * <strong>Usage Note:</strong><br> + * The validity of the test depends on the assumptions of the parametric + * t-test procedure, as discussed + * <a href="http://www.basic.nwu.edu/statguidefiles/ttest_unpaired_ass_viol.html"> + * here</a></p> + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>The observed array lengths must both be at least 2. + * </li> + * <li> <code> 0 < alpha < 0.5 </code> + * </li></ul></p> + * + * @param sample1 array of sample data values + * @param sample2 array of sample data values + * @param alpha significance level of the test + * @return true if the null hypothesis can be rejected with + * confidence 1 - alpha + * @throws IllegalArgumentException if the preconditions are not met + * @throws MathException if an error occurs performing the test + */ + public boolean homoscedasticTTest(double[] sample1, double[] sample2, + double alpha) + throws IllegalArgumentException, MathException { + checkSignificanceLevel(alpha); + return homoscedasticTTest(sample1, sample2) < alpha; + } + + /** + * Returns the <i>observed significance level</i>, or + * <i>p-value</i>, associated with a two-sample, two-tailed t-test + * comparing the means of the datasets described by two StatisticalSummary + * instances. + * <p> + * The number returned is the smallest significance level + * at which one can reject the null hypothesis that the two means are + * equal in favor of the two-sided alternative that they are different. 
+ * For a one-sided test, divide the returned value by 2.</p> + * <p> + * The test does not assume that the underlying population variances are + * equal and it uses approximated degrees of freedom computed from the + * sample data to compute the p-value. To perform the test assuming + * equal variances, use + * {@link #homoscedasticTTest(StatisticalSummary, StatisticalSummary)}.</p> + * <p> + * <strong>Usage Note:</strong><br> + * The validity of the p-value depends on the assumptions of the parametric + * t-test procedure, as discussed + * <a href="http://www.basic.nwu.edu/statguidefiles/ttest_unpaired_ass_viol.html"> + * here</a></p> + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>The datasets described by the two StatisticalSummary instances must each contain + * at least 2 observations. + * </li></ul></p> + * + * @param sampleStats1 StatisticalSummary describing data from the first sample + * @param sampleStats2 StatisticalSummary describing data from the second sample + * @return p-value for t-test + * @throws IllegalArgumentException if the precondition is not met + * @throws MathException if an error occurs computing the p-value + */ + public double tTest(StatisticalSummary sampleStats1, StatisticalSummary sampleStats2) + throws IllegalArgumentException, MathException { + checkSampleData(sampleStats1); + checkSampleData(sampleStats2); + return tTest(sampleStats1.getMean(), sampleStats2.getMean(), sampleStats1.getVariance(), + sampleStats2.getVariance(), sampleStats1.getN(), + sampleStats2.getN()); + } + + /** + * Returns the <i>observed significance level</i>, or + * <i>p-value</i>, associated with a two-sample, two-tailed t-test + * comparing the means of the datasets described by two StatisticalSummary + * instances, under the hypothesis of equal subpopulation variances. To + * perform a test without the equal variances assumption, use + * {@link #tTest(StatisticalSummary, StatisticalSummary)}. 
+ * <p> + * The number returned is the smallest significance level + * at which one can reject the null hypothesis that the two means are + * equal in favor of the two-sided alternative that they are different. + * For a one-sided test, divide the returned value by 2.</p> + * <p> + * See {@link #homoscedasticT(double[], double[])} for the formula used to + * compute the t-statistic. The sum of the sample sizes minus 2 is used as + * the degrees of freedom.</p> + * <p> + * <strong>Usage Note:</strong><br> + * The validity of the p-value depends on the assumptions of the parametric + * t-test procedure, as discussed + * <a href="http://www.basic.nwu.edu/statguidefiles/ttest_unpaired_ass_viol.html">here</a> + * </p><p> + * <strong>Preconditions</strong>: <ul> + * <li>The datasets described by the two StatisticalSummary instances must each contain + * at least 2 observations. + * </li></ul></p> + * + * @param sampleStats1 StatisticalSummary describing data from the first sample + * @param sampleStats2 StatisticalSummary describing data from the second sample + * @return p-value for t-test + * @throws IllegalArgumentException if the precondition is not met + * @throws MathException if an error occurs computing the p-value + */ + public double homoscedasticTTest(StatisticalSummary sampleStats1, + StatisticalSummary sampleStats2) + throws IllegalArgumentException, MathException { + checkSampleData(sampleStats1); + checkSampleData(sampleStats2); + return homoscedasticTTest(sampleStats1.getMean(), + sampleStats2.getMean(), sampleStats1.getVariance(), + sampleStats2.getVariance(), sampleStats1.getN(), + sampleStats2.getN()); + } + + /** + * Performs a + * <a href="http://www.itl.nist.gov/div898/handbook/eda/section3/eda353.htm"> + * two-sided t-test</a> evaluating the null hypothesis that + * <code>sampleStats1</code> and <code>sampleStats2</code> describe + * datasets drawn from populations with the same mean, with significance + * level <code>alpha</code>. 
This test does not assume that the + * subpopulation variances are equal. To perform the test under the equal + * variances assumption, use + * {@link #homoscedasticTTest(StatisticalSummary, StatisticalSummary)}. + * <p> + * Returns <code>true</code> iff the null hypothesis that the means are + * equal can be rejected with confidence <code>1 - alpha</code>. To + * perform a 1-sided test, use <code>alpha * 2</code></p> + * <p> + * See {@link #t(double[], double[])} for the formula used to compute the + * t-statistic. Degrees of freedom are approximated using the + * <a href="http://www.itl.nist.gov/div898/handbook/prc/section3/prc31.htm"> + * Welch-Satterthwaite approximation.</a></p> + * <p> + * <strong>Examples:</strong><br><ol> + * <li>To test the (2-sided) hypothesis <code>mean 1 = mean 2 </code> at + * the 95% level, use + * <br><code>tTest(sampleStats1, sampleStats2, 0.05) </code> + * </li> + * <li>To test the (one-sided) hypothesis <code> mean 1 < mean 2 </code> + * at the 99% level, first verify that the measured mean of + * <code>sample 1</code> is less than the mean of <code>sample 2</code> + * and then use + * <br><code>tTest(sampleStats1, sampleStats2, 0.02) </code> + * </li></ol></p> + * <p> + * <strong>Usage Note:</strong><br> + * The validity of the test depends on the assumptions of the parametric + * t-test procedure, as discussed + * <a href="http://www.basic.nwu.edu/statguidefiles/ttest_unpaired_ass_viol.html"> + * here</a></p> + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>The datasets described by the two StatisticalSummary instances must each contain + * at least 2 observations. 
+ * </li> + * <li> <code> 0 < alpha < 0.5 </code> + * </li></ul></p> + * + * @param sampleStats1 StatisticalSummary describing sample data values + * @param sampleStats2 StatisticalSummary describing sample data values + * @param alpha significance level of the test + * @return true if the null hypothesis can be rejected with + * confidence 1 - alpha + * @throws IllegalArgumentException if the preconditions are not met + * @throws MathException if an error occurs performing the test + */ + public boolean tTest(StatisticalSummary sampleStats1, + StatisticalSummary sampleStats2, double alpha) + throws IllegalArgumentException, MathException { + checkSignificanceLevel(alpha); + return tTest(sampleStats1, sampleStats2) < alpha; + } + + //----------------------------------------------- Protected methods + + /** + * Computes approximate degrees of freedom for 2-sample t-test. + * + * @param v1 first sample variance + * @param v2 second sample variance + * @param n1 first sample n + * @param n2 second sample n + * @return approximate degrees of freedom + */ + protected double df(double v1, double v2, double n1, double n2) { + return (((v1 / n1) + (v2 / n2)) * ((v1 / n1) + (v2 / n2))) / + ((v1 * v1) / (n1 * n1 * (n1 - 1d)) + (v2 * v2) / + (n2 * n2 * (n2 - 1d))); + } + + /** + * Computes t test statistic for 1-sample t-test. + * + * @param m sample mean + * @param mu constant to test against + * @param v sample variance + * @param n sample n + * @return t test statistic + */ + protected double t(double m, double mu, double v, double n) { + return (m - mu) / FastMath.sqrt(v / n); + } + + /** + * Computes t test statistic for 2-sample t-test. 
+ * <p> + * Does not assume that subpopulation variances are equal.</p> + * + * @param m1 first sample mean + * @param m2 second sample mean + * @param v1 first sample variance + * @param v2 second sample variance + * @param n1 first sample n + * @param n2 second sample n + * @return t test statistic + */ + protected double t(double m1, double m2, double v1, double v2, double n1, + double n2) { + return (m1 - m2) / FastMath.sqrt((v1 / n1) + (v2 / n2)); + } + + /** + * Computes t test statistic for 2-sample t-test under the hypothesis + * of equal subpopulation variances. + * + * @param m1 first sample mean + * @param m2 second sample mean + * @param v1 first sample variance + * @param v2 second sample variance + * @param n1 first sample n + * @param n2 second sample n + * @return t test statistic + */ + protected double homoscedasticT(double m1, double m2, double v1, + double v2, double n1, double n2) { + double pooledVariance = ((n1 - 1) * v1 + (n2 -1) * v2 ) / (n1 + n2 - 2); + return (m1 - m2) / FastMath.sqrt(pooledVariance * (1d / n1 + 1d / n2)); + } + + /** + * Computes p-value for 2-sided, 1-sample t-test. + * + * @param m sample mean + * @param mu constant to test against + * @param v sample variance + * @param n sample n + * @return p-value + * @throws MathException if an error occurs computing the p-value + */ + protected double tTest(double m, double mu, double v, double n) + throws MathException { + double t = FastMath.abs(t(m, mu, v, n)); + distribution.setDegreesOfFreedom(n - 1); + return 2.0 * distribution.cumulativeProbability(-t); + } + + /** + * Computes p-value for 2-sided, 2-sample t-test. + * <p> + * Does not assume subpopulation variances are equal. 
Degrees of freedom + * are estimated from the data.</p> + * + * @param m1 first sample mean + * @param m2 second sample mean + * @param v1 first sample variance + * @param v2 second sample variance + * @param n1 first sample n + * @param n2 second sample n + * @return p-value + * @throws MathException if an error occurs computing the p-value + */ + protected double tTest(double m1, double m2, double v1, double v2, + double n1, double n2) + throws MathException { + double t = FastMath.abs(t(m1, m2, v1, v2, n1, n2)); + double degreesOfFreedom = 0; + degreesOfFreedom = df(v1, v2, n1, n2); + distribution.setDegreesOfFreedom(degreesOfFreedom); + return 2.0 * distribution.cumulativeProbability(-t); + } + + /** + * Computes p-value for 2-sided, 2-sample t-test, under the assumption + * of equal subpopulation variances. + * <p> + * The sum of the sample sizes minus 2 is used as degrees of freedom.</p> + * + * @param m1 first sample mean + * @param m2 second sample mean + * @param v1 first sample variance + * @param v2 second sample variance + * @param n1 first sample n + * @param n2 second sample n + * @return p-value + * @throws MathException if an error occurs computing the p-value + */ + protected double homoscedasticTTest(double m1, double m2, double v1, + double v2, double n1, double n2) + throws MathException { + double t = FastMath.abs(homoscedasticT(m1, m2, v1, v2, n1, n2)); + double degreesOfFreedom = n1 + n2 - 2; + distribution.setDegreesOfFreedom(degreesOfFreedom); + return 2.0 * distribution.cumulativeProbability(-t); + } + + /** + * Modify the distribution used to compute inference statistics. + * @param value the new distribution + * @since 1.2 + * @deprecated in 2.2 (to be removed in 3.0). + */ + @Deprecated + public void setDistribution(TDistribution value) { + distribution = value; + } + + /** Check significance level. 
+ * @param alpha significance level; accepted only when <code>0 < alpha <= 0.5</code> (the range test is written in negated form because every comparison with NaN is false, so a NaN level is rejected too) + * @exception IllegalArgumentException if significance level is NaN or out of bounds + */ + private void checkSignificanceLevel(final double alpha) + throws IllegalArgumentException { + if (!((alpha > 0) && (alpha <= 0.5))) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL, + alpha, 0.0, 0.5); + } + } + + /** Check sample data. + * @param data sample data + * @exception IllegalArgumentException if there is not enough sample data + */ + private void checkSampleData(final double[] data) + throws IllegalArgumentException { + if ((data == null) || (data.length < 2)) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.INSUFFICIENT_DATA_FOR_T_STATISTIC, + (data == null) ? 0 : data.length); + } + } + + /** Check sample data. + * @param stat statistical summary + * @exception IllegalArgumentException if there is not enough sample data + */ + private void checkSampleData(final StatisticalSummary stat) + throws IllegalArgumentException { + if ((stat == null) || (stat.getN() < 2)) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.INSUFFICIENT_DATA_FOR_T_STATISTIC, + (stat == null) ? 0 : stat.getN()); + } + } + +} diff --git a/src/main/java/org/apache/commons/math/stat/inference/TestUtils.java b/src/main/java/org/apache/commons/math/stat/inference/TestUtils.java new file mode 100644 index 0000000..5023d55 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/inference/TestUtils.java @@ -0,0 +1,436 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.inference; + +import java.util.Collection; +import org.apache.commons.math.MathException; +import org.apache.commons.math.stat.descriptive.StatisticalSummary; + +/** + * A collection of static methods to create inference test instances or to + * perform inference tests. + * + * <p> + * The set methods are not compatible with using the class in multiple threads, + * and have therefore been deprecated (along with the getters). + * The setters and getters will be removed in version 3.0. + * + * @since 1.1 + * @version $Revision: 1067582 $ $Date: 2011-02-06 04:55:32 +0100 (dim. 06 févr. 2011) $ + */ +public class TestUtils { + + /** Singleton TTest instance using default implementation. */ + private static TTest tTest = new TTestImpl(); + + /** Singleton ChiSquareTest instance using default implementation. */ + private static ChiSquareTest chiSquareTest = + new ChiSquareTestImpl(); + + /** Singleton UnknownDistributionChiSquareTest instance using default implementation. */ + private static UnknownDistributionChiSquareTest unknownDistributionChiSquareTest = + new ChiSquareTestImpl(); + + /** Singleton OneWayAnova instance using default implementation. */ + private static OneWayAnova oneWayAnova = + new OneWayAnovaImpl(); + + /** + * Prevent instantiation. + */ + protected TestUtils() { + super(); + } + + /** + * Set the (singleton) TTest instance. 
+ * + * @param chiSquareTest the new TTest instance to use (the method and parameter names are misnomers: this overload replaces the TTest singleton, not the ChiSquareTest one) + * @since 1.2 + * @deprecated 2.2 will be removed in 3.0 - not compatible with use from multiple threads + */ + @Deprecated + public static void setChiSquareTest(TTest chiSquareTest) { + TestUtils.tTest = chiSquareTest; + } + + /** + * Return a (singleton) TTest instance. Does not create a new instance. + * + * @return a TTest instance + * @deprecated 2.2 will be removed in 3.0 + */ + @Deprecated + public static TTest getTTest() { + return tTest; + } + + /** + * Set the (singleton) ChiSquareTest instance. + * + * @param chiSquareTest the new instance to use + * @since 1.2 + * @deprecated 2.2 will be removed in 3.0 - not compatible with use from multiple threads + */ + @Deprecated + public static void setChiSquareTest(ChiSquareTest chiSquareTest) { + TestUtils.chiSquareTest = chiSquareTest; + } + + /** + * Return a (singleton) ChiSquareTest instance. Does not create a new instance. + * + * @return a ChiSquareTest instance + * @deprecated 2.2 will be removed in 3.0 + */ + @Deprecated + public static ChiSquareTest getChiSquareTest() { + return chiSquareTest; + } + + /** + * Set the (singleton) UnknownDistributionChiSquareTest instance. + * + * @param unknownDistributionChiSquareTest the new instance to use + * @since 1.2 + * @deprecated 2.2 will be removed in 3.0 - not compatible with use from multiple threads + */ + @Deprecated + public static void setUnknownDistributionChiSquareTest(UnknownDistributionChiSquareTest unknownDistributionChiSquareTest) { + TestUtils.unknownDistributionChiSquareTest = unknownDistributionChiSquareTest; + } + + /** + * Return a (singleton) UnknownDistributionChiSquareTest instance. Does not create a new instance. 
+ * + * @return a UnknownDistributionChiSquareTest instance + * @deprecated 2.2 will be removed in 3.0 + */ + @Deprecated + public static UnknownDistributionChiSquareTest getUnknownDistributionChiSquareTest() { + return unknownDistributionChiSquareTest; + } + + /** + * Set the (singleton) OneWayAnova instance + * + * @param oneWayAnova the new instance to use + * @since 1.2 + * @deprecated 2.2 will be removed in 3.0 - not compatible with use from multiple threads + */ + @Deprecated + public static void setOneWayAnova(OneWayAnova oneWayAnova) { + TestUtils.oneWayAnova = oneWayAnova; + } + + /** + * Return a (singleton) OneWayAnova instance. Does not create a new instance. + * + * @return a OneWayAnova instance + * @since 1.2 + * @deprecated 2.2 will be removed in 3.0 + */ + @Deprecated + public static OneWayAnova getOneWayAnova() { + return oneWayAnova; + } + + + // CHECKSTYLE: stop JavadocMethodCheck + + /** + * @see org.apache.commons.math.stat.inference.TTest#homoscedasticT(double[], double[]) + */ + public static double homoscedasticT(double[] sample1, double[] sample2) + throws IllegalArgumentException { + return tTest.homoscedasticT(sample1, sample2); + } + + /** + * @see org.apache.commons.math.stat.inference.TTest#homoscedasticT(org.apache.commons.math.stat.descriptive.StatisticalSummary, org.apache.commons.math.stat.descriptive.StatisticalSummary) + */ + public static double homoscedasticT(StatisticalSummary sampleStats1, + StatisticalSummary sampleStats2) + throws IllegalArgumentException { + return tTest.homoscedasticT(sampleStats1, sampleStats2); + } + + /** + * @see org.apache.commons.math.stat.inference.TTest#homoscedasticTTest(double[], double[], double) + */ + public static boolean homoscedasticTTest(double[] sample1, double[] sample2, + double alpha) + throws IllegalArgumentException, MathException { + return tTest. 
homoscedasticTTest(sample1, sample2, alpha); + } + + /** + * @see org.apache.commons.math.stat.inference.TTest#homoscedasticTTest(double[], double[]) + */ + public static double homoscedasticTTest(double[] sample1, double[] sample2) + throws IllegalArgumentException, MathException { + return tTest.homoscedasticTTest(sample1, sample2); + } + + /** + * @see org.apache.commons.math.stat.inference.TTest#homoscedasticTTest(org.apache.commons.math.stat.descriptive.StatisticalSummary, org.apache.commons.math.stat.descriptive.StatisticalSummary) + */ + public static double homoscedasticTTest(StatisticalSummary sampleStats1, + StatisticalSummary sampleStats2) + throws IllegalArgumentException, MathException { + return tTest.homoscedasticTTest(sampleStats1, sampleStats2); + } + + /** + * @see org.apache.commons.math.stat.inference.TTest#pairedT(double[], double[]) + */ + public static double pairedT(double[] sample1, double[] sample2) + throws IllegalArgumentException, MathException { + return tTest.pairedT(sample1, sample2); + } + + /** + * @see org.apache.commons.math.stat.inference.TTest#pairedTTest(double[], double[], double) + */ + public static boolean pairedTTest(double[] sample1, double[] sample2, + double alpha) + throws IllegalArgumentException, MathException { + return tTest.pairedTTest(sample1, sample2, alpha); + } + + /** + * @see org.apache.commons.math.stat.inference.TTest#pairedTTest(double[], double[]) + */ + public static double pairedTTest(double[] sample1, double[] sample2) + throws IllegalArgumentException, MathException { + return tTest.pairedTTest(sample1, sample2); + } + + /** + * @see org.apache.commons.math.stat.inference.TTest#t(double, double[]) + */ + public static double t(double mu, double[] observed) + throws IllegalArgumentException { + return tTest.t(mu, observed); + } + + /** + * @see org.apache.commons.math.stat.inference.TTest#t(double, org.apache.commons.math.stat.descriptive.StatisticalSummary) + */ + public static double t(double mu, 
StatisticalSummary sampleStats) + throws IllegalArgumentException { + return tTest.t(mu, sampleStats); + } + + /** + * @see org.apache.commons.math.stat.inference.TTest#t(double[], double[]) + */ + public static double t(double[] sample1, double[] sample2) + throws IllegalArgumentException { + return tTest.t(sample1, sample2); + } + + /** + * @see org.apache.commons.math.stat.inference.TTest#t(org.apache.commons.math.stat.descriptive.StatisticalSummary, org.apache.commons.math.stat.descriptive.StatisticalSummary) + */ + public static double t(StatisticalSummary sampleStats1, + StatisticalSummary sampleStats2) + throws IllegalArgumentException { + return tTest.t(sampleStats1, sampleStats2); + } + + /** + * @see org.apache.commons.math.stat.inference.TTest#tTest(double, double[], double) + */ + public static boolean tTest(double mu, double[] sample, double alpha) + throws IllegalArgumentException, MathException { + return tTest.tTest(mu, sample, alpha); + } + + /** + * @see org.apache.commons.math.stat.inference.TTest#tTest(double, double[]) + */ + public static double tTest(double mu, double[] sample) + throws IllegalArgumentException, MathException { + return tTest.tTest(mu, sample); + } + + /** + * @see org.apache.commons.math.stat.inference.TTest#tTest(double, org.apache.commons.math.stat.descriptive.StatisticalSummary, double) + */ + public static boolean tTest(double mu, StatisticalSummary sampleStats, + double alpha) + throws IllegalArgumentException, MathException { + return tTest. 
tTest(mu, sampleStats, alpha); + } + + /** + * @see org.apache.commons.math.stat.inference.TTest#tTest(double, org.apache.commons.math.stat.descriptive.StatisticalSummary) + */ + public static double tTest(double mu, StatisticalSummary sampleStats) + throws IllegalArgumentException, MathException { + return tTest.tTest(mu, sampleStats); + } + + /** + * @see org.apache.commons.math.stat.inference.TTest#tTest(double[], double[], double) + */ + public static boolean tTest(double[] sample1, double[] sample2, double alpha) + throws IllegalArgumentException, MathException { + return tTest.tTest(sample1, sample2, alpha); + } + + /** + * @see org.apache.commons.math.stat.inference.TTest#tTest(double[], double[]) + */ + public static double tTest(double[] sample1, double[] sample2) + throws IllegalArgumentException, MathException { + return tTest.tTest(sample1, sample2); + } + + /** + * @see org.apache.commons.math.stat.inference.TTest#tTest(org.apache.commons.math.stat.descriptive.StatisticalSummary, org.apache.commons.math.stat.descriptive.StatisticalSummary, double) + */ + public static boolean tTest(StatisticalSummary sampleStats1, + StatisticalSummary sampleStats2, double alpha) + throws IllegalArgumentException, MathException { + return tTest. 
tTest(sampleStats1, sampleStats2, alpha); + } + + /** + * @see org.apache.commons.math.stat.inference.TTest#tTest(org.apache.commons.math.stat.descriptive.StatisticalSummary, org.apache.commons.math.stat.descriptive.StatisticalSummary) + */ + public static double tTest(StatisticalSummary sampleStats1, + StatisticalSummary sampleStats2) + throws IllegalArgumentException, MathException { + return tTest.tTest(sampleStats1, sampleStats2); + } + + /** + * @see org.apache.commons.math.stat.inference.ChiSquareTest#chiSquare(double[], long[]) + */ + public static double chiSquare(double[] expected, long[] observed) + throws IllegalArgumentException { + return chiSquareTest.chiSquare(expected, observed); + } + + /** + * @see org.apache.commons.math.stat.inference.ChiSquareTest#chiSquare(long[][]) + */ + public static double chiSquare(long[][] counts) + throws IllegalArgumentException { + return chiSquareTest.chiSquare(counts); + } + + /** + * @see org.apache.commons.math.stat.inference.ChiSquareTest#chiSquareTest(double[], long[], double) + */ + public static boolean chiSquareTest(double[] expected, long[] observed, + double alpha) + throws IllegalArgumentException, MathException { + return chiSquareTest.chiSquareTest(expected, observed, alpha); + } + + /** + * @see org.apache.commons.math.stat.inference.ChiSquareTest#chiSquareTest(double[], long[]) + */ + public static double chiSquareTest(double[] expected, long[] observed) + throws IllegalArgumentException, MathException { + return chiSquareTest.chiSquareTest(expected, observed); + } + + /** + * @see org.apache.commons.math.stat.inference.ChiSquareTest#chiSquareTest(long[][], double) + */ + public static boolean chiSquareTest(long[][] counts, double alpha) + throws IllegalArgumentException, MathException { + return chiSquareTest. 
chiSquareTest(counts, alpha); + } + + /** + * @see org.apache.commons.math.stat.inference.ChiSquareTest#chiSquareTest(long[][]) + */ + public static double chiSquareTest(long[][] counts) + throws IllegalArgumentException, MathException { + return chiSquareTest. chiSquareTest(counts); + } + + /** + * @see org.apache.commons.math.stat.inference.UnknownDistributionChiSquareTest#chiSquareDataSetsComparison(long[], long[]) + * + * @since 1.2 + */ + public static double chiSquareDataSetsComparison(long[] observed1, long[] observed2) + throws IllegalArgumentException { + return unknownDistributionChiSquareTest.chiSquareDataSetsComparison(observed1, observed2); + } + + /** + * @see org.apache.commons.math.stat.inference.UnknownDistributionChiSquareTest#chiSquareTestDataSetsComparison(long[], long[]) + * + * @since 1.2 + */ + public static double chiSquareTestDataSetsComparison(long[] observed1, long[] observed2) + throws IllegalArgumentException, MathException { + return unknownDistributionChiSquareTest.chiSquareTestDataSetsComparison(observed1, observed2); + } + + + /** + * @see org.apache.commons.math.stat.inference.UnknownDistributionChiSquareTest#chiSquareTestDataSetsComparison(long[], long[], double) + * + * @since 1.2 + */ + public static boolean chiSquareTestDataSetsComparison(long[] observed1, long[] observed2, + double alpha) + throws IllegalArgumentException, MathException { + return unknownDistributionChiSquareTest.chiSquareTestDataSetsComparison(observed1, observed2, alpha); + } + + /** + * @see org.apache.commons.math.stat.inference.OneWayAnova#anovaFValue(Collection) + * + * @since 1.2 + */ + public static double oneWayAnovaFValue(Collection<double[]> categoryData) + throws IllegalArgumentException, MathException { + return oneWayAnova.anovaFValue(categoryData); + } + + /** + * @see org.apache.commons.math.stat.inference.OneWayAnova#anovaPValue(Collection) + * + * @since 1.2 + */ + public static double oneWayAnovaPValue(Collection<double[]> categoryData) + 
throws IllegalArgumentException, MathException { + return oneWayAnova.anovaPValue(categoryData); + } + + /** + * @see org.apache.commons.math.stat.inference.OneWayAnova#anovaTest(Collection,double) + * + * @since 1.2 + */ + public static boolean oneWayAnovaTest(Collection<double[]> categoryData, double alpha) + throws IllegalArgumentException, MathException { + return oneWayAnova.anovaTest(categoryData, alpha); + } + + // CHECKSTYLE: resume JavadocMethodCheck + +} diff --git a/src/main/java/org/apache/commons/math/stat/inference/UnknownDistributionChiSquareTest.java b/src/main/java/org/apache/commons/math/stat/inference/UnknownDistributionChiSquareTest.java new file mode 100644 index 0000000..662e4d6 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/inference/UnknownDistributionChiSquareTest.java @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.inference; + +import org.apache.commons.math.MathException; + +/** + * An interface for Chi-Square tests for unknown distributions. + * <p>Two samples tests are used when the distribution is unknown <i>a priori</i> + * but provided by one sample. 
We compare the second sample against the first.</p> + * + * @version $Revision: 811685 $ $Date: 2009-09-05 19:36:48 +0200 (sam. 05 sept. 2009) $ + * @since 1.2 + */ +public interface UnknownDistributionChiSquareTest extends ChiSquareTest { + + /** + * <p>Computes a + * <a href="http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/chi2samp.htm"> + * Chi-Square two sample test statistic</a> comparing bin frequency counts + * in <code>observed1</code> and <code>observed2</code>. The + * sums of frequency counts in the two samples are not required to be the + * same. The formula used to compute the test statistic is</p> + * <code> + * ∑[(K * observed1[i] - observed2[i]/K)<sup>2</sup> / (observed1[i] + observed2[i])] + * </code> where + * <br/><code>K = √[∑(observed2) / ∑(observed1)]</code> + * </p> + * <p>This statistic can be used to perform a Chi-Square test evaluating the null hypothesis that + * both observed counts follow the same distribution.</p> + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>Observed counts must be non-negative. + * </li> + * <li>Observed counts for a specific bin must not both be zero. + * </li> + * <li>Observed counts for a specific sample must not all be 0. + * </li> + * <li>The arrays <code>observed1</code> and <code>observed2</code> must have the same length and + * their common length must be at least 2. 
+ * </li></ul></p><p> + * If any of the preconditions are not met, an + * <code>IllegalArgumentException</code> is thrown.</p> + * + * @param observed1 array of observed frequency counts of the first data set + * @param observed2 array of observed frequency counts of the second data set + * @return chiSquare statistic + * @throws IllegalArgumentException if preconditions are not met + */ + double chiSquareDataSetsComparison(long[] observed1, long[] observed2) + throws IllegalArgumentException; + + /** + * <p>Returns the <i>observed significance level</i>, or <a href= + * "http://www.cas.lancs.ac.uk/glossary_v1.1/hyptest.html#pvalue"> + * p-value</a>, associated with a Chi-Square two sample test comparing + * bin frequency counts in <code>observed1</code> and + * <code>observed2</code>. + * </p> + * <p>The number returned is the smallest significance level at which one + * can reject the null hypothesis that the observed counts conform to the + * same distribution. + * </p> + * <p>See {@link #chiSquareDataSetsComparison(long[], long[])} for details + * on the formula used to compute the test statistic. The degrees of + * freedom used to perform the test is one less than the common length + * of the input observed count arrays. + * </p> + * <strong>Preconditions</strong>: <ul> + * <li>Observed counts must be non-negative. + * </li> + * <li>Observed counts for a specific bin must not both be zero. + * </li> + * <li>Observed counts for a specific sample must not all be 0. + * </li> + * <li>The arrays <code>observed1</code> and <code>observed2</code> must + * have the same length and + * their common length must be at least 2. 
+ * </li></ul><p> + * If any of the preconditions are not met, an + * <code>IllegalArgumentException</code> is thrown.</p> + * + * @param observed1 array of observed frequency counts of the first data set + * @param observed2 array of observed frequency counts of the second data set + * @return p-value + * @throws IllegalArgumentException if preconditions are not met + * @throws MathException if an error occurs computing the p-value + */ + double chiSquareTestDataSetsComparison(long[] observed1, long[] observed2) + throws IllegalArgumentException, MathException; + + /** + * <p>Performs a Chi-Square two sample test comparing two binned data + * sets. The test evaluates the null hypothesis that the two lists of + * observed counts conform to the same frequency distribution, with + * significance level <code>alpha</code>. Returns true iff the null + * hypothesis can be rejected with 100 * (1 - alpha) percent confidence. + * </p> + * <p>See {@link #chiSquareDataSetsComparison(long[], long[])} for + * details on the formula used to compute the Chi-Square statistic used + * in the test. The degrees of freedom used to perform the test is + * one less than the common length of the input observed count arrays. + * </p> + * <strong>Preconditions</strong>: <ul> + * <li>Observed counts must be non-negative. + * </li> + * <li>Observed counts for a specific bin must not both be zero. + * </li> + * <li>Observed counts for a specific sample must not all be 0. + * </li> + * <li>The arrays <code>observed1</code> and <code>observed2</code> must + * have the same length and their common length must be at least 2. 
+ * </li> + * <li> <code> 0 < alpha < 0.5 </code> + * </li></ul><p> + * If any of the preconditions are not met, an + * <code>IllegalArgumentException</code> is thrown.</p> + * + * @param observed1 array of observed frequency counts of the first data set + * @param observed2 array of observed frequency counts of the second data set + * @param alpha significance level of the test + * @return true iff null hypothesis can be rejected with confidence + * 1 - alpha + * @throws IllegalArgumentException if preconditions are not met + * @throws MathException if an error occurs performing the test + */ + boolean chiSquareTestDataSetsComparison(long[] observed1, long[] observed2, double alpha) + throws IllegalArgumentException, MathException; + +} diff --git a/src/main/java/org/apache/commons/math/stat/inference/package.html b/src/main/java/org/apache/commons/math/stat/inference/package.html new file mode 100644 index 0000000..288eebf --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/inference/package.html @@ -0,0 +1,23 @@ +<html> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + --> + <!-- $Revision: 480440 $ $Date: 2006-11-29 08:14:12 +0100 (mer. 29 nov. 
2006) $ --> + <body> + Classes providing hypothesis testing and confidence interval + construction. + </body> +</html> diff --git a/src/main/java/org/apache/commons/math/stat/package.html b/src/main/java/org/apache/commons/math/stat/package.html new file mode 100644 index 0000000..d62d67a --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/package.html @@ -0,0 +1,20 @@ +<html> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + --> + <!-- $Revision: 480440 $ $Date: 2006-11-29 08:14:12 +0100 (mer. 29 nov. 2006) $ --> + <body>Data storage, manipulation and summary routines.</body> +</html> diff --git a/src/main/java/org/apache/commons/math/stat/ranking/NaNStrategy.java b/src/main/java/org/apache/commons/math/stat/ranking/NaNStrategy.java new file mode 100644 index 0000000..cffa7d1 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/ranking/NaNStrategy.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.math.stat.ranking; + +/** + * Strategies for handling NaN values in rank transformations. + * <ul> + * <li>MINIMAL - NaNs are treated as minimal in the ordering, equivalent to + * (that is, tied with) <code>Double.NEGATIVE_INFINITY</code>.</li> + * <li>MAXIMAL - NaNs are treated as maximal in the ordering, equivalent to + * <code>Double.POSITIVE_INFINITY</code></li> + * <li>REMOVED - NaNs are removed before the rank transform is applied</li> + * <li>FIXED - NaNs are left "in place," that is the rank transformation is + * applied to the other elements in the input array, but the NaN elements + * are returned unchanged.</li> + * </ul> + * + * @since 2.0 + * @version $Revision: 811685 $ $Date: 2009-09-05 19:36:48 +0200 (sam. 05 sept. 
2009) $ + */ +public enum NaNStrategy { + + /** NaNs are considered minimal in the ordering */ + MINIMAL, + + /** NaNs are considered maximal in the ordering */ + MAXIMAL, + + /** NaNs are removed before computing ranks */ + REMOVED, + + /** NaNs are left in place */ + FIXED +} diff --git a/src/main/java/org/apache/commons/math/stat/ranking/NaturalRanking.java b/src/main/java/org/apache/commons/math/stat/ranking/NaturalRanking.java new file mode 100644 index 0000000..f51189c --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/ranking/NaturalRanking.java @@ -0,0 +1,464 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.commons.math.stat.ranking; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; + +import org.apache.commons.math.exception.MathInternalError; +import org.apache.commons.math.random.RandomData; +import org.apache.commons.math.random.RandomDataImpl; +import org.apache.commons.math.random.RandomGenerator; +import org.apache.commons.math.util.FastMath; + + +/** + * <p> Ranking based on the natural ordering on doubles.</p> + * <p>NaNs are treated according to the configured {@link NaNStrategy} and ties + * are handled using the selected {@link TiesStrategy}. + * Configuration settings are supplied in optional constructor arguments. + * Defaults are {@link NaNStrategy#MAXIMAL} and {@link TiesStrategy#AVERAGE}, + * respectively. When using {@link TiesStrategy#RANDOM}, a + * {@link RandomGenerator} may be supplied as a constructor argument.</p> + * <p>Examples: + * <table border="1" cellpadding="3"> + * <tr><th colspan="3"> + * Input data: (20, 17, 30, 42.3, 17, 50, Double.NaN, Double.NEGATIVE_INFINITY, 17) + * </th></tr> + * <tr><th>NaNStrategy</th><th>TiesStrategy</th> + * <th><code>rank(data)</code></th> + * <tr> + * <td>default (NaNs maximal)</td> + * <td>default (ties averaged)</td> + * <td>(5, 3, 6, 7, 3, 8, 9, 1, 3)</td></tr> + * <tr> + * <td>default (NaNs maximal)</td> + * <td>MINIMUM</td> + * <td>(5, 2, 6, 7, 2, 8, 9, 1, 2)</td></tr> + * <tr> + * <td>MINIMAL</td> + * <td>default (ties averaged)</td> + * <td>(6, 4, 7, 8, 4, 9, 1.5, 1.5, 4)</td></tr> + * <tr> + * <td>REMOVED</td> + * <td>SEQUENTIAL</td> + * <td>(5, 2, 6, 7, 3, 8, 1, 4)</td></tr> + * <tr> + * <td>MINIMAL</td> + * <td>MAXIMUM</td> + * <td>(6, 5, 7, 8, 5, 9, 2, 2, 5)</td></tr></table></p> + * + * @since 2.0 + * @version $Revision: 1061496 $ $Date: 2011-01-20 21:32:16 +0100 (jeu. 20 janv. 
2011) $ + */ +public class NaturalRanking implements RankingAlgorithm { + + /** default NaN strategy */ + public static final NaNStrategy DEFAULT_NAN_STRATEGY = NaNStrategy.MAXIMAL; + + /** default ties strategy */ + public static final TiesStrategy DEFAULT_TIES_STRATEGY = TiesStrategy.AVERAGE; + + /** NaN strategy - defaults to NaNs maximal */ + private final NaNStrategy nanStrategy; + + /** Ties strategy - defaults to ties averaged */ + private final TiesStrategy tiesStrategy; + + /** Source of random data - used only when ties strategy is RANDOM */ + private final RandomData randomData; + + /** + * Create a NaturalRanking with default strategies for handling ties and NaNs. + */ + public NaturalRanking() { + super(); + tiesStrategy = DEFAULT_TIES_STRATEGY; + nanStrategy = DEFAULT_NAN_STRATEGY; + randomData = null; + } + + /** + * Create a NaturalRanking with the given TiesStrategy. + * + * @param tiesStrategy the TiesStrategy to use + */ + public NaturalRanking(TiesStrategy tiesStrategy) { + super(); + this.tiesStrategy = tiesStrategy; + nanStrategy = DEFAULT_NAN_STRATEGY; + randomData = new RandomDataImpl(); + } + + /** + * Create a NaturalRanking with the given NaNStrategy. + * + * @param nanStrategy the NaNStrategy to use + */ + public NaturalRanking(NaNStrategy nanStrategy) { + super(); + this.nanStrategy = nanStrategy; + tiesStrategy = DEFAULT_TIES_STRATEGY; + randomData = null; + } + + /** + * Create a NaturalRanking with the given NaNStrategy and TiesStrategy. + * + * @param nanStrategy NaNStrategy to use + * @param tiesStrategy TiesStrategy to use + */ + public NaturalRanking(NaNStrategy nanStrategy, TiesStrategy tiesStrategy) { + super(); + this.nanStrategy = nanStrategy; + this.tiesStrategy = tiesStrategy; + randomData = new RandomDataImpl(); + } + + /** + * Create a NaturalRanking with TiesStrategy.RANDOM and the given + * RandomGenerator as the source of random data. 
+ * + * @param randomGenerator source of random data + */ + public NaturalRanking(RandomGenerator randomGenerator) { + super(); + this.tiesStrategy = TiesStrategy.RANDOM; + nanStrategy = DEFAULT_NAN_STRATEGY; + randomData = new RandomDataImpl(randomGenerator); + } + + + /** + * Create a NaturalRanking with the given NaNStrategy, TiesStrategy.RANDOM + * and the given source of random data. + * + * @param nanStrategy NaNStrategy to use + * @param randomGenerator source of random data + */ + public NaturalRanking(NaNStrategy nanStrategy, + RandomGenerator randomGenerator) { + super(); + this.nanStrategy = nanStrategy; + this.tiesStrategy = TiesStrategy.RANDOM; + randomData = new RandomDataImpl(randomGenerator); + } + + /** + * Return the NaNStrategy + * + * @return returns the NaNStrategy + */ + public NaNStrategy getNanStrategy() { + return nanStrategy; + } + + /** + * Return the TiesStrategy + * + * @return the TiesStrategy + */ + public TiesStrategy getTiesStrategy() { + return tiesStrategy; + } + + /** + * Rank <code>data</code> using the natural ordering on Doubles, with + * NaN values handled according to <code>nanStrategy</code> and ties + * resolved using <code>tiesStrategy.</code> + * + * @param data array to be ranked + * @return array of ranks + */ + public double[] rank(double[] data) { + + // Array recording initial positions of data to be ranked + IntDoublePair[] ranks = new IntDoublePair[data.length]; + for (int i = 0; i < data.length; i++) { + ranks[i] = new IntDoublePair(data[i], i); + } + + // Recode, remove or record positions of NaNs + List<Integer> nanPositions = null; + switch (nanStrategy) { + case MAXIMAL: // Replace NaNs with +INFs + recodeNaNs(ranks, Double.POSITIVE_INFINITY); + break; + case MINIMAL: // Replace NaNs with -INFs + recodeNaNs(ranks, Double.NEGATIVE_INFINITY); + break; + case REMOVED: // Drop NaNs from data + ranks = removeNaNs(ranks); + break; + case FIXED: // Record positions of NaNs + nanPositions = getNanPositions(ranks); + 
break; + default: // this should not happen unless NaNStrategy enum is changed + throw new MathInternalError(); + } + + // Sort the IntDoublePairs + Arrays.sort(ranks); + + // Walk the sorted array, filling output array using sorted positions, + // resolving ties as we go + double[] out = new double[ranks.length]; + int pos = 1; // position in sorted array + out[ranks[0].getPosition()] = pos; + List<Integer> tiesTrace = new ArrayList<Integer>(); + tiesTrace.add(ranks[0].getPosition()); + for (int i = 1; i < ranks.length; i++) { + if (Double.compare(ranks[i].getValue(), ranks[i - 1].getValue()) > 0) { + // tie sequence has ended (or had length 1) + pos = i + 1; + if (tiesTrace.size() > 1) { // if seq is nontrivial, resolve + resolveTie(out, tiesTrace); + } + tiesTrace = new ArrayList<Integer>(); + tiesTrace.add(ranks[i].getPosition()); + } else { + // tie sequence continues + tiesTrace.add(ranks[i].getPosition()); + } + out[ranks[i].getPosition()] = pos; + } + if (tiesTrace.size() > 1) { // handle tie sequence at end + resolveTie(out, tiesTrace); + } + if (nanStrategy == NaNStrategy.FIXED) { + restoreNaNs(out, nanPositions); + } + return out; + } + + /** + * Returns an array that is a copy of the input array with IntDoublePairs + * having NaN values removed. 
+ * + * @param ranks input array + * @return array with NaN-valued entries removed + */ + private IntDoublePair[] removeNaNs(IntDoublePair[] ranks) { + if (!containsNaNs(ranks)) { + return ranks; + } + IntDoublePair[] outRanks = new IntDoublePair[ranks.length]; + int j = 0; + for (int i = 0; i < ranks.length; i++) { + if (Double.isNaN(ranks[i].getValue())) { + // drop, but adjust original ranks of later elements + for (int k = i + 1; k < ranks.length; k++) { + ranks[k] = new IntDoublePair( + ranks[k].getValue(), ranks[k].getPosition() - 1); + } + } else { + outRanks[j] = new IntDoublePair( + ranks[i].getValue(), ranks[i].getPosition()); + j++; + } + } + IntDoublePair[] returnRanks = new IntDoublePair[j]; + System.arraycopy(outRanks, 0, returnRanks, 0, j); + return returnRanks; + } + + /** + * Recodes NaN values to the given value. + * + * @param ranks array to recode + * @param value the value to replace NaNs with + */ + private void recodeNaNs(IntDoublePair[] ranks, double value) { + for (int i = 0; i < ranks.length; i++) { + if (Double.isNaN(ranks[i].getValue())) { + ranks[i] = new IntDoublePair( + value, ranks[i].getPosition()); + } + } + } + + /** + * Checks for presence of NaNs in <code>ranks.</code> + * + * @param ranks array to be searched for NaNs + * @return true iff ranks contains one or more NaNs + */ + private boolean containsNaNs(IntDoublePair[] ranks) { + for (int i = 0; i < ranks.length; i++) { + if (Double.isNaN(ranks[i].getValue())) { + return true; + } + } + return false; + } + + /** + * Resolve a sequence of ties, using the configured {@link TiesStrategy}. + * The input <code>ranks</code> array is expected to take the same value + * for all indices in <code>tiesTrace</code>. The common value is recoded + * according to the tiesStrategy. For example, if ranks = <5,8,2,6,2,7,1,2>, + * tiesTrace = <2,4,7> and tiesStrategy is MINIMUM, ranks will be unchanged. + * The same array and trace with tiesStrategy AVERAGE will come out + * <5,8,3,6,3,7,1,3>. 
+ * + * @param ranks array of ranks + * @param tiesTrace list of indices where <code>ranks</code> is constant + * -- that is, for any i and j in TiesTrace, <code> ranks[i] == ranks[j] + * </code> + */ + private void resolveTie(double[] ranks, List<Integer> tiesTrace) { + + // constant value of ranks over tiesTrace + final double c = ranks[tiesTrace.get(0)]; + + // length of sequence of tied ranks + final int length = tiesTrace.size(); + + switch (tiesStrategy) { + case AVERAGE: // Replace ranks with average + fill(ranks, tiesTrace, (2 * c + length - 1) / 2d); + break; + case MAXIMUM: // Replace ranks with maximum values + fill(ranks, tiesTrace, c + length - 1); + break; + case MINIMUM: // Replace ties with minimum + fill(ranks, tiesTrace, c); + break; + case RANDOM: // Fill with random integral values in [c, c + length - 1] + Iterator<Integer> iterator = tiesTrace.iterator(); + long f = FastMath.round(c); + while (iterator.hasNext()) { + ranks[iterator.next()] = + randomData.nextLong(f, f + length - 1); + } + break; + case SEQUENTIAL: // Fill sequentially from c to c + length - 1 + // walk and fill + iterator = tiesTrace.iterator(); + f = FastMath.round(c); + int i = 0; + while (iterator.hasNext()) { + ranks[iterator.next()] = f + i++; + } + break; + default: // this should not happen unless TiesStrategy enum is changed + throw new MathInternalError(); + } + } + + /** + * Sets<code>data[i] = value</code> for each i in <code>tiesTrace.</code> + * + * @param data array to modify + * @param tiesTrace list of index values to set + * @param value value to set + */ + private void fill(double[] data, List<Integer> tiesTrace, double value) { + Iterator<Integer> iterator = tiesTrace.iterator(); + while (iterator.hasNext()) { + data[iterator.next()] = value; + } + } + + /** + * Set <code>ranks[i] = Double.NaN</code> for each i in <code>nanPositions.</code> + * + * @param ranks array to modify + * @param nanPositions list of index values to set to <code>Double.NaN</code> + */ 
+ private void restoreNaNs(double[] ranks, List<Integer> nanPositions) { + if (nanPositions.size() == 0) { + return; + } + Iterator<Integer> iterator = nanPositions.iterator(); + while (iterator.hasNext()) { + ranks[iterator.next().intValue()] = Double.NaN; + } + + } + + /** + * Returns a list of indexes where <code>ranks</code> is <code>NaN.</code> + * + * @param ranks array to search for <code>NaNs</code> + * @return list of indexes i such that <code>ranks[i] = NaN</code> + */ + private List<Integer> getNanPositions(IntDoublePair[] ranks) { + ArrayList<Integer> out = new ArrayList<Integer>(); + for (int i = 0; i < ranks.length; i++) { + if (Double.isNaN(ranks[i].getValue())) { + out.add(Integer.valueOf(i)); + } + } + return out; + } + + /** + * Represents the position of a double value in an ordering. + * Comparable interface is implemented so Arrays.sort can be used + * to sort an array of IntDoublePairs by value. Note that the + * implicitly defined natural ordering is NOT consistent with equals. + */ + private static class IntDoublePair implements Comparable<IntDoublePair> { + + /** Value of the pair */ + private final double value; + + /** Original position of the pair */ + private final int position; + + /** + * Construct an IntDoublePair with the given value and position. + * @param value the value of the pair + * @param position the original position + */ + public IntDoublePair(double value, int position) { + this.value = value; + this.position = position; + } + + /** + * Compare this IntDoublePair to another pair. + * Only the <strong>values</strong> are compared. + * + * @param other the other pair to compare this to + * @return result of <code>Double.compare(value, other.value)</code> + */ + public int compareTo(IntDoublePair other) { + return Double.compare(value, other.value); + } + + /** + * Returns the value of the pair. + * @return value + */ + public double getValue() { + return value; + } + + /** + * Returns the original position of the pair. 
+ * @return position + */ + public int getPosition() { + return position; + } + } +} diff --git a/src/main/java/org/apache/commons/math/stat/ranking/RankingAlgorithm.java b/src/main/java/org/apache/commons/math/stat/ranking/RankingAlgorithm.java new file mode 100644 index 0000000..b01f324 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/ranking/RankingAlgorithm.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.math.stat.ranking; + +/** + * Interface representing a rank transformation. + * + * @since 2.0 + * @version $Revision: 811685 $ $Date: 2009-09-05 19:36:48 +0200 (sam. 05 sept. 2009) $ + */ +public interface RankingAlgorithm { + /** + * <p>Performs a rank transformation on the input data, returning an array + * of ranks.</p> + * + * <p>Ranks should be 1-based - that is, the smallest value + * returned in an array of ranks should be greater than or equal to one, + * rather than 0. 
Ranks should in general take integer values, though + * implementations may return averages or other floating point values + * to resolve ties in the input data.</p> + * + * @param data array of data to be ranked + * @return an array of ranks corresponding to the elements of the input array + */ + double[] rank (double[] data); +} diff --git a/src/main/java/org/apache/commons/math/stat/ranking/TiesStrategy.java b/src/main/java/org/apache/commons/math/stat/ranking/TiesStrategy.java new file mode 100644 index 0000000..794c229 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/ranking/TiesStrategy.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.math.stat.ranking; + +/** + * Strategies for handling tied values in rank transformations. + * <ul> + * <li>SEQUENTIAL - Ties are assigned ranks in order of occurrence in the original array, + * for example (1,3,4,3) is ranked as (1,2,4,3)</li> + * <li>MINIMUM - Tied values are assigned the minimum applicable rank, or the rank + * of the first occurrence. For example, (1,3,4,3) is ranked as (1,2,4,2)</li> + * <li>MAXIMUM - Tied values are assigned the maximum applicable rank, or the rank + * of the last occurrence. 
For example, (1,3,4,3) is ranked as (1,3,4,3)</li> + * <li>AVERAGE - Tied values are assigned the average of the applicable ranks. + * For example, (1,3,4,3) is ranked as (1,2.5,4,2.5)</li> + * <li>RANDOM - Tied values are assigned a random integer rank from among the + * applicable values. The assigned rank will always be an integer, (inclusively) + * between the values returned by the MINIMUM and MAXIMUM strategies.</li> + * </ul> + * + * @since 2.0 + * @version $Revision: 981332 $ $Date: 2010-08-02 00:24:31 +0200 (lun. 02 août 2010) $ + */ +public enum TiesStrategy { + + /** Ties assigned sequential ranks in order of occurrence */ + SEQUENTIAL, + + /** Ties get the minimum applicable rank */ + MINIMUM, + + /** Ties get the maximum applicable rank */ + MAXIMUM, + + /** Ties get the average of applicable ranks */ + AVERAGE, + + /** Ties get a random integral value from among applicable ranks */ + RANDOM +} diff --git a/src/main/java/org/apache/commons/math/stat/ranking/package.html b/src/main/java/org/apache/commons/math/stat/ranking/package.html new file mode 100644 index 0000000..63e0c4a --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/ranking/package.html @@ -0,0 +1,22 @@ +<html> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ --> + <!-- $Revision:$ $Date:$ --> + <body> + Classes providing rank transformations. + </body> +</html> diff --git a/src/main/java/org/apache/commons/math/stat/regression/AbstractMultipleLinearRegression.java b/src/main/java/org/apache/commons/math/stat/regression/AbstractMultipleLinearRegression.java new file mode 100644 index 0000000..9757682 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/regression/AbstractMultipleLinearRegression.java @@ -0,0 +1,366 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.regression; + +import org.apache.commons.math.MathRuntimeException; +import org.apache.commons.math.exception.util.LocalizedFormats; +import org.apache.commons.math.linear.RealMatrix; +import org.apache.commons.math.linear.Array2DRowRealMatrix; +import org.apache.commons.math.linear.RealVector; +import org.apache.commons.math.linear.ArrayRealVector; +import org.apache.commons.math.stat.descriptive.moment.Variance; +import org.apache.commons.math.util.FastMath; + +/** + * Abstract base class for implementations of MultipleLinearRegression. + * @version $Revision: 1073459 $ $Date: 2011-02-22 20:18:12 +0100 (mar. 22 févr. 
2011) $ + * @since 2.0 + */ +public abstract class AbstractMultipleLinearRegression implements + MultipleLinearRegression { + + /** X sample data. */ + protected RealMatrix X; + + /** Y sample data. */ + protected RealVector Y; + + /** Whether or not the regression model includes an intercept. True means no intercept. */ + private boolean noIntercept = false; + + /** + * @return true if the model has no intercept term; false otherwise + * @since 2.2 + */ + public boolean isNoIntercept() { + return noIntercept; + } + + /** + * @param noIntercept true means the model is to be estimated without an intercept term + * @since 2.2 + */ + public void setNoIntercept(boolean noIntercept) { + this.noIntercept = noIntercept; + } + + /** + * <p>Loads model x and y sample data from a flat input array, overriding any previous sample. + * </p> + * <p>Assumes that rows are concatenated with y values first in each row. For example, an input + * <code>data</code> array containing the sequence of values (1, 2, 3, 4, 5, 6, 7, 8, 9) with + * <code>nobs = 3</code> and <code>nvars = 2</code> creates a regression dataset with two + * independent variables, as below: + * <pre> + * y x[0] x[1] + * -------------- + * 1 2 3 + * 4 5 6 + * 7 8 9 + * </pre> + * </p> + * <p>Note that there is no need to add an initial unitary column (column of 1's) when + * specifying a model including an intercept term. If {@link #isNoIntercept()} is <code>true</code>, + * the X matrix will be created without an initial column of "1"s; otherwise this column will + * be added. 
+ * </p> + * <p>Throws IllegalArgumentException if any of the following preconditions fail: + * <ul><li><code>data</code> cannot be null</li> + * <li><code>data.length = nobs * (nvars + 1)</li> + * <li><code>nobs > nvars</code></li></ul> + * </p> + * + * @param data input data array + * @param nobs number of observations (rows) + * @param nvars number of independent variables (columns, not counting y) + * @throws IllegalArgumentException if the preconditions are not met + */ + public void newSampleData(double[] data, int nobs, int nvars) { + if (data == null) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.NULL_NOT_ALLOWED); + } + if (data.length != nobs * (nvars + 1)) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.INVALID_REGRESSION_ARRAY, data.length, nobs, nvars); + } + if (nobs <= nvars) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.NOT_ENOUGH_DATA_FOR_NUMBER_OF_PREDICTORS); + } + double[] y = new double[nobs]; + final int cols = noIntercept ? nvars: nvars + 1; + double[][] x = new double[nobs][cols]; + int pointer = 0; + for (int i = 0; i < nobs; i++) { + y[i] = data[pointer++]; + if (!noIntercept) { + x[i][0] = 1.0d; + } + for (int j = noIntercept ? 0 : 1; j < cols; j++) { + x[i][j] = data[pointer++]; + } + } + this.X = new Array2DRowRealMatrix(x); + this.Y = new ArrayRealVector(y); + } + + /** + * Loads new y sample data, overriding any previous data. + * + * @param y the array representing the y sample + * @throws IllegalArgumentException if y is null or empty + */ + protected void newYSampleData(double[] y) { + if (y == null) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.NULL_NOT_ALLOWED); + } + if (y.length == 0) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.NO_DATA); + } + this.Y = new ArrayRealVector(y); + } + + /** + * <p>Loads new x sample data, overriding any previous data. 
+ * </p> + * The input <code>x</code> array should have one row for each sample + * observation, with columns corresponding to independent variables. + * For example, if <pre> + * <code> x = new double[][] {{1, 2}, {3, 4}, {5, 6}} </code></pre> + * then <code>setXSampleData(x) </code> results in a model with two independent + * variables and 3 observations: + * <pre> + * x[0] x[1] + * ---------- + * 1 2 + * 3 4 + * 5 6 + * </pre> + * </p> + * <p>Note that there is no need to add an initial unitary column (column of 1's) when + * specifying a model including an intercept term. + * </p> + * @param x the rectangular array representing the x sample + * @throws IllegalArgumentException if x is null, empty or not rectangular + */ + protected void newXSampleData(double[][] x) { + if (x == null) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.NULL_NOT_ALLOWED); + } + if (x.length == 0) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.NO_DATA); + } + if (noIntercept) { + this.X = new Array2DRowRealMatrix(x, true); + } else { // Augment design matrix with initial unitary column + final int nVars = x[0].length; + final double[][] xAug = new double[x.length][nVars + 1]; + for (int i = 0; i < x.length; i++) { + if (x[i].length != nVars) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.DIFFERENT_ROWS_LENGTHS, + x[i].length, nVars); + } + xAug[i][0] = 1.0d; + System.arraycopy(x[i], 0, xAug[i], 1, nVars); + } + this.X = new Array2DRowRealMatrix(xAug, false); + } + } + + /** + * Validates sample data. Checks that + * <ul><li>Neither x nor y is null or empty;</li> + * <li>The length (i.e. number of rows) of x equals the length of y</li> + * <li>x has at least one more row than it has columns (i.e. 
there is + * sufficient data to estimate regression coefficients for each of the + * columns in x plus an intercept.</li> + * </ul> + * + * @param x the [n,k] array representing the x data + * @param y the [n,1] array representing the y data + * @throws IllegalArgumentException if any of the checks fail + * + */ + protected void validateSampleData(double[][] x, double[] y) { + if ((x == null) || (y == null) || (x.length != y.length)) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.DIMENSIONS_MISMATCH_SIMPLE, + (x == null) ? 0 : x.length, + (y == null) ? 0 : y.length); + } + if (x.length == 0) { // Must be no y data either + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.NO_DATA); + } + if (x[0].length + 1 > x.length) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.NOT_ENOUGH_DATA_FOR_NUMBER_OF_PREDICTORS, + x.length, x[0].length); + } + } + + /** + * Validates that the x data and covariance matrix have the same + * number of rows and that the covariance matrix is square. + * + * @param x the [n,k] array representing the x sample + * @param covariance the [n,n] array representing the covariance matrix + * @throws IllegalArgumentException if the number of rows in x is not equal + * to the number of rows in covariance or covariance is not square. 
+ */ + protected void validateCovarianceData(double[][] x, double[][] covariance) { + if (x.length != covariance.length) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.DIMENSIONS_MISMATCH_SIMPLE, x.length, covariance.length); + } + if (covariance.length > 0 && covariance.length != covariance[0].length) { + throw MathRuntimeException.createIllegalArgumentException( + LocalizedFormats.NON_SQUARE_MATRIX, + covariance.length, covariance[0].length); + } + } + + /** + * {@inheritDoc} + */ + public double[] estimateRegressionParameters() { + RealVector b = calculateBeta(); + return b.getData(); + } + + /** + * {@inheritDoc} + */ + public double[] estimateResiduals() { + RealVector b = calculateBeta(); + RealVector e = Y.subtract(X.operate(b)); + return e.getData(); + } + + /** + * {@inheritDoc} + */ + public double[][] estimateRegressionParametersVariance() { + return calculateBetaVariance().getData(); + } + + /** + * {@inheritDoc} + */ + public double[] estimateRegressionParametersStandardErrors() { + double[][] betaVariance = estimateRegressionParametersVariance(); + double sigma = calculateErrorVariance(); + int length = betaVariance[0].length; + double[] result = new double[length]; + for (int i = 0; i < length; i++) { + result[i] = FastMath.sqrt(sigma * betaVariance[i][i]); + } + return result; + } + + /** + * {@inheritDoc} + */ + public double estimateRegressandVariance() { + return calculateYVariance(); + } + + /** + * Estimates the variance of the error. + * + * @return estimate of the error variance + * @since 2.2 + */ + public double estimateErrorVariance() { + return calculateErrorVariance(); + + } + + /** + * Estimates the standard error of the regression. + * + * @return regression standard error + * @since 2.2 + */ + public double estimateRegressionStandardError() { + return Math.sqrt(estimateErrorVariance()); + } + + /** + * Calculates the beta of multiple linear regression in matrix notation. 
+ * + * @return beta + */ + protected abstract RealVector calculateBeta(); + + /** + * Calculates the beta variance of multiple linear regression in matrix + * notation. + * + * @return beta variance + */ + protected abstract RealMatrix calculateBetaVariance(); + + + /** + * Calculates the variance of the y values. + * + * @return Y variance + */ + protected double calculateYVariance() { + return new Variance().evaluate(Y.getData()); + } + + /** + * <p>Calculates the variance of the error term.</p> + * Uses the formula <pre> + * var(u) = u · u / (n - k) + * </pre> + * where n and k are the row and column dimensions of the design + * matrix X. + * + * @return error variance estimate + * @since 2.2 + */ + protected double calculateErrorVariance() { + RealVector residuals = calculateResiduals(); + return residuals.dotProduct(residuals) / + (X.getRowDimension() - X.getColumnDimension()); + } + + /** + * Calculates the residuals of multiple linear regression in matrix + * notation. + * + * <pre> + * u = y - X * b + * </pre> + * + * @return The residuals [n,1] matrix + */ + protected RealVector calculateResiduals() { + RealVector b = calculateBeta(); + return Y.subtract(X.operate(b)); + } + +} diff --git a/src/main/java/org/apache/commons/math/stat/regression/GLSMultipleLinearRegression.java b/src/main/java/org/apache/commons/math/stat/regression/GLSMultipleLinearRegression.java new file mode 100644 index 0000000..dc6ef0d --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/regression/GLSMultipleLinearRegression.java @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.regression; + +import org.apache.commons.math.linear.LUDecompositionImpl; +import org.apache.commons.math.linear.RealMatrix; +import org.apache.commons.math.linear.Array2DRowRealMatrix; +import org.apache.commons.math.linear.RealVector; + +/** + * The GLS implementation of the multiple linear regression. + * + * GLS assumes a general covariance matrix Omega of the error + * <pre> + * u ~ N(0, Omega) + * </pre> + * + * Estimated by GLS, + * <pre> + * b=(X' Omega^-1 X)^-1X'Omega^-1 y + * </pre> + * whose variance is + * <pre> + * Var(b)=(X' Omega^-1 X)^-1 + * </pre> + * @version $Revision: 1073460 $ $Date: 2011-02-22 20:22:39 +0100 (mar. 22 févr. 2011) $ + * @since 2.0 + */ +public class GLSMultipleLinearRegression extends AbstractMultipleLinearRegression { + + /** Covariance matrix. */ + private RealMatrix Omega; + + /** Inverse of covariance matrix. */ + private RealMatrix OmegaInverse; + + /** Replace sample data, overriding any previous sample. + * @param y y values of the sample + * @param x x values of the sample + * @param covariance array representing the covariance matrix + */ + public void newSampleData(double[] y, double[][] x, double[][] covariance) { + validateSampleData(x, y); + newYSampleData(y); + newXSampleData(x); + validateCovarianceData(x, covariance); + newCovarianceData(covariance); + } + + /** + * Add the covariance data. 
+ * + * @param omega the [n,n] array representing the covariance + */ + protected void newCovarianceData(double[][] omega){ + this.Omega = new Array2DRowRealMatrix(omega); + this.OmegaInverse = null; + } + + /** + * Get the inverse of the covariance. + * <p>The inverse of the covariance matrix is lazily evaluated and cached.</p> + * @return inverse of the covariance + */ + protected RealMatrix getOmegaInverse() { + if (OmegaInverse == null) { + OmegaInverse = new LUDecompositionImpl(Omega).getSolver().getInverse(); + } + return OmegaInverse; + } + + /** + * Calculates beta by GLS. + * <pre> + * b=(X' Omega^-1 X)^-1X'Omega^-1 y + * </pre> + * @return beta + */ + @Override + protected RealVector calculateBeta() { + RealMatrix OI = getOmegaInverse(); + RealMatrix XT = X.transpose(); + RealMatrix XTOIX = XT.multiply(OI).multiply(X); + RealMatrix inverse = new LUDecompositionImpl(XTOIX).getSolver().getInverse(); + return inverse.multiply(XT).multiply(OI).operate(Y); + } + + /** + * Calculates the variance on the beta. + * <pre> + * Var(b)=(X' Omega^-1 X)^-1 + * </pre> + * @return The beta variance matrix + */ + @Override + protected RealMatrix calculateBetaVariance() { + RealMatrix OI = getOmegaInverse(); + RealMatrix XTOIX = X.transpose().multiply(OI).multiply(X); + return new LUDecompositionImpl(XTOIX).getSolver().getInverse(); + } + + + /** + * Calculates the estimated variance of the error term using the formula + * <pre> + * Var(u) = Tr(u' Omega^-1 u)/(n-k) + * </pre> + * where n and k are the row and column dimensions of the design + * matrix X. 
+ * + * @return error variance + * @since 2.2 + */ + @Override + protected double calculateErrorVariance() { + RealVector residuals = calculateResiduals(); + double t = residuals.dotProduct(getOmegaInverse().operate(residuals)); + return t / (X.getRowDimension() - X.getColumnDimension()); + + } + +} diff --git a/src/main/java/org/apache/commons/math/stat/regression/MultipleLinearRegression.java b/src/main/java/org/apache/commons/math/stat/regression/MultipleLinearRegression.java new file mode 100644 index 0000000..b7aabd4 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/regression/MultipleLinearRegression.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.regression; + +/** + * The multiple linear regression can be represented in matrix-notation. + * <pre> + * y=X*b+u + * </pre> + * where y is an <code>n-vector</code> <b>regressand</b>, X is a <code>[n,k]</code> matrix whose <code>k</code> columns are called + * <b>regressors</b>, b is <code>k-vector</code> of <b>regression parameters</b> and <code>u</code> is an <code>n-vector</code> + * of <b>error terms</b> or <b>residuals</b>. 
+ * + * The notation is quite standard in literature, + * cf eg <a href="http://www.econ.queensu.ca/ETM">Davidson and MacKinnon, Econometrics Theory and Methods, 2004</a>. + * @version $Revision: 811685 $ $Date: 2009-09-05 19:36:48 +0200 (sam. 05 sept. 2009) $ + * @since 2.0 + */ +public interface MultipleLinearRegression { + + /** + * Estimates the regression parameters b. + * + * @return The [k,1] array representing b + */ + double[] estimateRegressionParameters(); + + /** + * Estimates the variance of the regression parameters, ie Var(b). + * + * @return The [k,k] array representing the variance of b + */ + double[][] estimateRegressionParametersVariance(); + + /** + * Estimates the residuals, ie u = y - X*b. + * + * @return The [n,1] array representing the residuals + */ + double[] estimateResiduals(); + + /** + * Returns the variance of the regressand, ie Var(y). + * + * @return The double representing the variance of y + */ + double estimateRegressandVariance(); + + /** + * Returns the standard errors of the regression parameters. + * + * @return standard errors of estimated regression parameters + */ + double[] estimateRegressionParametersStandardErrors(); + +} diff --git a/src/main/java/org/apache/commons/math/stat/regression/OLSMultipleLinearRegression.java b/src/main/java/org/apache/commons/math/stat/regression/OLSMultipleLinearRegression.java new file mode 100644 index 0000000..22a59e8 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/regression/OLSMultipleLinearRegression.java @@ -0,0 +1,233 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.regression; + +import org.apache.commons.math.linear.Array2DRowRealMatrix; +import org.apache.commons.math.linear.LUDecompositionImpl; +import org.apache.commons.math.linear.QRDecomposition; +import org.apache.commons.math.linear.QRDecompositionImpl; +import org.apache.commons.math.linear.RealMatrix; +import org.apache.commons.math.linear.RealVector; +import org.apache.commons.math.stat.StatUtils; +import org.apache.commons.math.stat.descriptive.moment.SecondMoment; + +/** + * <p>Implements ordinary least squares (OLS) to estimate the parameters of a + * multiple linear regression model.</p> + * + * <p>The regression coefficients, <code>b</code>, satisfy the normal equations: + * <pre><code> X<sup>T</sup> X b = X<sup>T</sup> y </code></pre></p> + * + * <p>To solve the normal equations, this implementation uses QR decomposition + * of the <code>X</code> matrix. (See {@link QRDecompositionImpl} for details on the + * decomposition algorithm.) The <code>X</code> matrix, also known as the <i>design matrix,</i> + * has rows corresponding to sample observations and columns corresponding to independent + * variables. When the model is estimated using an intercept term (i.e. when + * {@link #isNoIntercept() isNoIntercept} is false as it is by default), the <code>X</code> + * matrix includes an initial column identically equal to 1. 
We solve the normal equations + * as follows: + * <pre><code> X<sup>T</sup>X b = X<sup>T</sup> y + * (QR)<sup>T</sup> (QR) b = (QR)<sup>T</sup>y + * R<sup>T</sup> (Q<sup>T</sup>Q) R b = R<sup>T</sup> Q<sup>T</sup> y + * R<sup>T</sup> R b = R<sup>T</sup> Q<sup>T</sup> y + * (R<sup>T</sup>)<sup>-1</sup> R<sup>T</sup> R b = (R<sup>T</sup>)<sup>-1</sup> R<sup>T</sup> Q<sup>T</sup> y + * R b = Q<sup>T</sup> y </code></pre></p> + * + * <p>Given <code>Q</code> and <code>R</code>, the last equation is solved by back-substitution.</p> + * + * @version $Revision: 1073464 $ $Date: 2011-02-22 20:35:02 +0100 (mar. 22 févr. 2011) $ + * @since 2.0 + */ +public class OLSMultipleLinearRegression extends AbstractMultipleLinearRegression { + + /** Cached QR decomposition of X matrix */ + private QRDecomposition qr = null; + + /** + * Loads model x and y sample data, overriding any previous sample. + * + * Computes and caches QR decomposition of the X matrix. + * @param y the [n,1] array representing the y sample + * @param x the [n,k] array representing the x sample + * @throws IllegalArgumentException if the x and y array data are not + * compatible for the regression + */ + public void newSampleData(double[] y, double[][] x) { + validateSampleData(x, y); + newYSampleData(y); + newXSampleData(x); + } + + /** + * {@inheritDoc} + * <p>This implementation computes and caches the QR decomposition of the X matrix.</p> + */ + @Override + public void newSampleData(double[] data, int nobs, int nvars) { + super.newSampleData(data, nobs, nvars); + qr = new QRDecompositionImpl(X); + } + + /** + * <p>Compute the "hat" matrix. + * </p> + * <p>The hat matrix is defined in terms of the design matrix X + * by X(X<sup>T</sup>X)<sup>-1</sup>X<sup>T</sup> + * </p> + * <p>The implementation here uses the QR decomposition to compute the + * hat matrix as Q I<sub>p</sub>Q<sup>T</sup> where I<sub>p</sub> is the + * p-dimensional identity matrix augmented by 0's. 
This computational + * formula is from "The Hat Matrix in Regression and ANOVA", + * David C. Hoaglin and Roy E. Welsch, + * <i>The American Statistician</i>, Vol. 32, No. 1 (Feb., 1978), pp. 17-22. + * + * @return the hat matrix + */ + public RealMatrix calculateHat() { + // Create augmented identity matrix + RealMatrix Q = qr.getQ(); + final int p = qr.getR().getColumnDimension(); + final int n = Q.getColumnDimension(); + Array2DRowRealMatrix augI = new Array2DRowRealMatrix(n, n); + double[][] augIData = augI.getDataRef(); + for (int i = 0; i < n; i++) { + for (int j =0; j < n; j++) { + if (i == j && i < p) { + augIData[i][j] = 1d; + } else { + augIData[i][j] = 0d; + } + } + } + + // Compute and return Hat matrix + return Q.multiply(augI).multiply(Q.transpose()); + } + + /** + * <p>Returns the sum of squared deviations of Y from its mean.</p> + * + * <p>If the model has no intercept term, <code>0</code> is used for the + * mean of Y - i.e., what is returned is the sum of the squared Y values.</p> + * + * <p>The value returned by this method is the SSTO value used in + * the {@link #calculateRSquared() R-squared} computation.</p> + * + * @return SSTO - the total sum of squares + * @see #isNoIntercept() + * @since 2.2 + */ + public double calculateTotalSumOfSquares() { + if (isNoIntercept()) { + return StatUtils.sumSq(Y.getData()); + } else { + return new SecondMoment().evaluate(Y.getData()); + } + } + + /** + * Returns the sum of squared residuals. 
+ * + * @return residual sum of squares + * @since 2.2 + */ + public double calculateResidualSumOfSquares() { + final RealVector residuals = calculateResiduals(); + return residuals.dotProduct(residuals); + } + + /** + * Returns the R-Squared statistic, defined by the formula <pre> + * R<sup>2</sup> = 1 - SSR / SSTO + * </pre> + * where SSR is the {@link #calculateResidualSumOfSquares() sum of squared residuals} + * and SSTO is the {@link #calculateTotalSumOfSquares() total sum of squares} + * + * @return R-square statistic + * @since 2.2 + */ + public double calculateRSquared() { + return 1 - calculateResidualSumOfSquares() / calculateTotalSumOfSquares(); + } + + /** + * <p>Returns the adjusted R-squared statistic, defined by the formula <pre> + * R<sup>2</sup><sub>adj</sub> = 1 - [SSR (n - 1)] / [SSTO (n - p)] + * </pre> + * where SSR is the {@link #calculateResidualSumOfSquares() sum of squared residuals}, + * SSTO is the {@link #calculateTotalSumOfSquares() total sum of squares}, n is the number + * of observations and p is the number of parameters estimated (including the intercept).</p> + * + * <p>If the regression is estimated without an intercept term, what is returned is <pre> + * <code> 1 - (1 - {@link #calculateRSquared()}) * (n / (n - p)) </code> + * </pre></p> + * + * @return adjusted R-Squared statistic + * @see #isNoIntercept() + * @since 2.2 + */ + public double calculateAdjustedRSquared() { + final double n = X.getRowDimension(); + if (isNoIntercept()) { + return 1 - (1 - calculateRSquared()) * (n / (n - X.getColumnDimension())); + } else { + return 1 - (calculateResidualSumOfSquares() * (n - 1)) / + (calculateTotalSumOfSquares() * (n - X.getColumnDimension())); + } + } + + /** + * {@inheritDoc} + * <p>This implementation computes and caches the QR decomposition of the X matrix + * once it is successfully loaded.</p> + */ + @Override + protected void newXSampleData(double[][] x) { + super.newXSampleData(x); + qr = new QRDecompositionImpl(X); + } + 
+ /** + * Calculates the regression coefficients using OLS. + * + * @return beta + */ + @Override + protected RealVector calculateBeta() { + return qr.getSolver().solve(Y); + } + + /** + * <p>Calculates the variance-covariance matrix of the regression parameters. + * </p> + * <p>Var(b) = (X<sup>T</sup>X)<sup>-1</sup> + * </p> + * <p>Uses QR decomposition to reduce (X<sup>T</sup>X)<sup>-1</sup> + * to (R<sup>T</sup>R)<sup>-1</sup>, with only the top p rows of + * R included, where p = the length of the beta vector.</p> + * + * @return The beta variance-covariance matrix + */ + @Override + protected RealMatrix calculateBetaVariance() { + int p = X.getColumnDimension(); + RealMatrix Raug = qr.getR().getSubMatrix(0, p - 1 , 0, p - 1); + RealMatrix Rinv = new LUDecompositionImpl(Raug).getSolver().getInverse(); + return Rinv.multiply(Rinv.transpose()); + } + +} diff --git a/src/main/java/org/apache/commons/math/stat/regression/SimpleRegression.java b/src/main/java/org/apache/commons/math/stat/regression/SimpleRegression.java new file mode 100644 index 0000000..d950541 --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/regression/SimpleRegression.java @@ -0,0 +1,639 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.commons.math.stat.regression; +import java.io.Serializable; + +import org.apache.commons.math.MathException; +import org.apache.commons.math.MathRuntimeException; +import org.apache.commons.math.distribution.TDistribution; +import org.apache.commons.math.distribution.TDistributionImpl; +import org.apache.commons.math.exception.util.LocalizedFormats; +import org.apache.commons.math.util.FastMath; + +/** + * Estimates an ordinary least squares regression model + * with one independent variable. + * <p> + * <code> y = intercept + slope * x </code></p> + * <p> + * Standard errors for <code>intercept</code> and <code>slope</code> are + * available as well as ANOVA, r-square and Pearson's r statistics.</p> + * <p> + * Observations (x,y pairs) can be added to the model one at a time or they + * can be provided in a 2-dimensional array. The observations are not stored + * in memory, so there is no limit to the number of observations that can be + * added to the model.</p> + * <p> + * <strong>Usage Notes</strong>: <ul> + * <li> When there are fewer than two observations in the model, or when + * there is no variation in the x values (i.e. all x values are the same) + * all statistics return <code>NaN</code>. At least two observations with + * different x coordinates are requred to estimate a bivariate regression + * model. + * </li> + * <li> getters for the statistics always compute values based on the current + * set of observations -- i.e., you can get statistics, then add more data + * and get updated statistics without using a new instance. There is no + * "compute" method that updates all statistics. Each of the getters performs + * the necessary computations to return the requested statistic.</li> + * </ul></p> + * + * @version $Revision: 1042336 $ $Date: 2010-12-05 13:40:48 +0100 (dim. 05 déc. 
2010) $ + */ +public class SimpleRegression implements Serializable { + + /** Serializable version identifier */ + private static final long serialVersionUID = -3004689053607543335L; + + /** the distribution used to compute inference statistics. */ + private TDistribution distribution; + + /** sum of x values */ + private double sumX = 0d; + + /** total variation in x (sum of squared deviations from xbar) */ + private double sumXX = 0d; + + /** sum of y values */ + private double sumY = 0d; + + /** total variation in y (sum of squared deviations from ybar) */ + private double sumYY = 0d; + + /** sum of products */ + private double sumXY = 0d; + + /** number of observations */ + private long n = 0; + + /** mean of accumulated x values, used in updating formulas */ + private double xbar = 0; + + /** mean of accumulated y values, used in updating formulas */ + private double ybar = 0; + + // ---------------------Public methods-------------------------------------- + + /** + * Create an empty SimpleRegression instance + */ + public SimpleRegression() { + this(new TDistributionImpl(1.0)); + } + + /** + * Create an empty SimpleRegression using the given distribution object to + * compute inference statistics. + * @param t the distribution used to compute inference statistics. + * @since 1.2 + * @deprecated in 2.2 (to be removed in 3.0). Please use the {@link + * #SimpleRegression(int) other constructor} instead. + */ + @Deprecated + public SimpleRegression(TDistribution t) { + super(); + setDistribution(t); + } + + /** + * Create an empty SimpleRegression. + * + * @param degrees Number of degrees of freedom of the distribution + * used to compute inference statistics. + * @since 2.2 + */ + public SimpleRegression(int degrees) { + setDistribution(new TDistributionImpl(degrees)); + } + + /** + * Adds the observation (x,y) to the regression data set. 
+ * <p> + * Uses updating formulas for means and sums of squares defined in + * "Algorithms for Computing the Sample Variance: Analysis and + * Recommendations", Chan, T.F., Golub, G.H., and LeVeque, R.J. + * 1983, American Statistician, vol. 37, pp. 242-247, referenced in + * Weisberg, S. "Applied Linear Regression". 2nd Ed. 1985.</p> + * + * + * @param x independent variable value + * @param y dependent variable value + */ + public void addData(double x, double y) { + if (n == 0) { + xbar = x; + ybar = y; + } else { + double dx = x - xbar; + double dy = y - ybar; + sumXX += dx * dx * (double) n / (n + 1d); + sumYY += dy * dy * (double) n / (n + 1d); + sumXY += dx * dy * (double) n / (n + 1d); + xbar += dx / (n + 1.0); + ybar += dy / (n + 1.0); + } + sumX += x; + sumY += y; + n++; + + if (n > 2) { + distribution.setDegreesOfFreedom(n - 2); + } + } + + + /** + * Removes the observation (x,y) from the regression data set. + * <p> + * Mirrors the addData method. This method permits the use of + * SimpleRegression instances in streaming mode where the regression + * is applied to a sliding "window" of observations, however the caller is + * responsible for maintaining the set of observations in the window.</p> + * + * The method has no effect if there are no points of data (i.e. n=0) + * + * @param x independent variable value + * @param y dependent variable value + */ + public void removeData(double x, double y) { + if (n > 0) { + double dx = x - xbar; + double dy = y - ybar; + sumXX -= dx * dx * (double) n / (n - 1d); + sumYY -= dy * dy * (double) n / (n - 1d); + sumXY -= dx * dy * (double) n / (n - 1d); + xbar -= dx / (n - 1.0); + ybar -= dy / (n - 1.0); + sumX -= x; + sumY -= y; + n--; + + if (n > 2) { + distribution.setDegreesOfFreedom(n - 2); + } + } + } + + /** + * Adds the observations represented by the elements in + * <code>data</code>. 
+ * <p> + * <code>(data[0][0],data[0][1])</code> will be the first observation, then + * <code>(data[1][0],data[1][1])</code>, etc.</p> + * <p> + * This method does not replace data that has already been added. The + * observations represented by <code>data</code> are added to the existing + * dataset.</p> + * <p> + * To replace all data, use <code>clear()</code> before adding the new + * data.</p> + * + * @param data array of observations to be added + */ + public void addData(double[][] data) { + for (int i = 0; i < data.length; i++) { + addData(data[i][0], data[i][1]); + } + } + + + /** + * Removes observations represented by the elements in <code>data</code>. + * <p> + * If the array is larger than the current n, only the first n elements are + * processed. This method permits the use of SimpleRegression instances in + * streaming mode where the regression is applied to a sliding "window" of + * observations, however the caller is responsible for maintaining the set + * of observations in the window.</p> + * <p> + * To remove all data, use <code>clear()</code>.</p> + * + * @param data array of observations to be removed + */ + public void removeData(double[][] data) { + for (int i = 0; i < data.length && n > 0; i++) { + removeData(data[i][0], data[i][1]); + } + } + + /** + * Clears all data from the model. + */ + public void clear() { + sumX = 0d; + sumXX = 0d; + sumY = 0d; + sumYY = 0d; + sumXY = 0d; + n = 0; + } + + /** + * Returns the number of observations that have been added to the model. + * + * @return n number of observations that have been added. + */ + public long getN() { + return n; + } + + /** + * Returns the "predicted" <code>y</code> value associated with the + * supplied <code>x</code> value, based on the data that has been + * added to the model when this method is activated. 
+ * <p> + * <code> predict(x) = intercept + slope * x </code></p> + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>At least two observations (with at least two different x values) + * must have been added before invoking this method. If this method is + * invoked before a model can be estimated, <code>Double,NaN</code> is + * returned. + * </li></ul></p> + * + * @param x input <code>x</code> value + * @return predicted <code>y</code> value + */ + public double predict(double x) { + double b1 = getSlope(); + return getIntercept(b1) + b1 * x; + } + + /** + * Returns the intercept of the estimated regression line. + * <p> + * The least squares estimate of the intercept is computed using the + * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>. + * The intercept is sometimes denoted b0.</p> + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>At least two observations (with at least two different x values) + * must have been added before invoking this method. If this method is + * invoked before a model can be estimated, <code>Double,NaN</code> is + * returned. + * </li></ul></p> + * + * @return the intercept of the regression line + */ + public double getIntercept() { + return getIntercept(getSlope()); + } + + /** + * Returns the slope of the estimated regression line. + * <p> + * The least squares estimate of the slope is computed using the + * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>. + * The slope is sometimes denoted b1.</p> + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>At least two observations (with at least two different x values) + * must have been added before invoking this method. If this method is + * invoked before a model can be estimated, <code>Double.NaN</code> is + * returned. 
+ * </li></ul></p> + * + * @return the slope of the regression line + */ + public double getSlope() { + if (n < 2) { + return Double.NaN; //not enough data + } + if (FastMath.abs(sumXX) < 10 * Double.MIN_VALUE) { + return Double.NaN; //not enough variation in x + } + return sumXY / sumXX; + } + + /** + * Returns the <a href="http://www.xycoon.com/SumOfSquares.htm"> + * sum of squared errors</a> (SSE) associated with the regression + * model. + * <p> + * The sum is computed using the computational formula</p> + * <p> + * <code>SSE = SYY - (SXY * SXY / SXX)</code></p> + * <p> + * where <code>SYY</code> is the sum of the squared deviations of the y + * values about their mean, <code>SXX</code> is similarly defined and + * <code>SXY</code> is the sum of the products of x and y mean deviations. + * </p><p> + * The sums are accumulated using the updating algorithm referenced in + * {@link #addData}.</p> + * <p> + * The return value is constrained to be non-negative - i.e., if due to + * rounding errors the computational formula returns a negative result, + * 0 is returned.</p> + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>At least two observations (with at least two different x values) + * must have been added before invoking this method. If this method is + * invoked before a model can be estimated, <code>Double,NaN</code> is + * returned. + * </li></ul></p> + * + * @return sum of squared errors associated with the regression model + */ + public double getSumSquaredErrors() { + return FastMath.max(0d, sumYY - sumXY * sumXY / sumXX); + } + + /** + * Returns the sum of squared deviations of the y values about their mean. 
+ * <p> + * This is defined as SSTO + * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.</p> + * <p> + * If <code>n < 2</code>, this returns <code>Double.NaN</code>.</p> + * + * @return sum of squared deviations of y values + */ + public double getTotalSumSquares() { + if (n < 2) { + return Double.NaN; + } + return sumYY; + } + + /** + * Returns the sum of squared deviations of the x values about their mean. + * + * If <code>n < 2</code>, this returns <code>Double.NaN</code>.</p> + * + * @return sum of squared deviations of x values + */ + public double getXSumSquares() { + if (n < 2) { + return Double.NaN; + } + return sumXX; + } + + /** + * Returns the sum of crossproducts, x<sub>i</sub>*y<sub>i</sub>. + * + * @return sum of cross products + */ + public double getSumOfCrossProducts() { + return sumXY; + } + + /** + * Returns the sum of squared deviations of the predicted y values about + * their mean (which equals the mean of y). + * <p> + * This is usually abbreviated SSR or SSM. It is defined as SSM + * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a></p> + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>At least two observations (with at least two different x values) + * must have been added before invoking this method. If this method is + * invoked before a model can be estimated, <code>Double.NaN</code> is + * returned. + * </li></ul></p> + * + * @return sum of squared deviations of predicted y values + */ + public double getRegressionSumSquares() { + return getRegressionSumSquares(getSlope()); + } + + /** + * Returns the sum of squared errors divided by the degrees of freedom, + * usually abbreviated MSE. 
+ * <p> + * If there are fewer than <strong>three</strong> data pairs in the model, + * or if there is no variation in <code>x</code>, this returns + * <code>Double.NaN</code>.</p> + * + * @return sum of squared deviations of y values + */ + public double getMeanSquareError() { + if (n < 3) { + return Double.NaN; + } + return getSumSquaredErrors() / (n - 2); + } + + /** + * Returns <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html"> + * Pearson's product moment correlation coefficient</a>, + * usually denoted r. + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>At least two observations (with at least two different x values) + * must have been added before invoking this method. If this method is + * invoked before a model can be estimated, <code>Double,NaN</code> is + * returned. + * </li></ul></p> + * + * @return Pearson's r + */ + public double getR() { + double b1 = getSlope(); + double result = FastMath.sqrt(getRSquare()); + if (b1 < 0) { + result = -result; + } + return result; + } + + /** + * Returns the <a href="http://www.xycoon.com/coefficient1.htm"> + * coefficient of determination</a>, + * usually denoted r-square. + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>At least two observations (with at least two different x values) + * must have been added before invoking this method. If this method is + * invoked before a model can be estimated, <code>Double,NaN</code> is + * returned. + * </li></ul></p> + * + * @return r-square + */ + public double getRSquare() { + double ssto = getTotalSumSquares(); + return (ssto - getSumSquaredErrors()) / ssto; + } + + /** + * Returns the <a href="http://www.xycoon.com/standarderrorb0.htm"> + * standard error of the intercept estimate</a>, + * usually denoted s(b0). 
+ * <p> + * If there are fewer that <strong>three</strong> observations in the + * model, or if there is no variation in x, this returns + * <code>Double.NaN</code>.</p> + * + * @return standard error associated with intercept estimate + */ + public double getInterceptStdErr() { + return FastMath.sqrt( + getMeanSquareError() * ((1d / (double) n) + (xbar * xbar) / sumXX)); + } + + /** + * Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard + * error of the slope estimate</a>, + * usually denoted s(b1). + * <p> + * If there are fewer that <strong>three</strong> data pairs in the model, + * or if there is no variation in x, this returns <code>Double.NaN</code>. + * </p> + * + * @return standard error associated with slope estimate + */ + public double getSlopeStdErr() { + return FastMath.sqrt(getMeanSquareError() / sumXX); + } + + /** + * Returns the half-width of a 95% confidence interval for the slope + * estimate. + * <p> + * The 95% confidence interval is</p> + * <p> + * <code>(getSlope() - getSlopeConfidenceInterval(), + * getSlope() + getSlopeConfidenceInterval())</code></p> + * <p> + * If there are fewer that <strong>three</strong> observations in the + * model, or if there is no variation in x, this returns + * <code>Double.NaN</code>.</p> + * <p> + * <strong>Usage Note</strong>:<br> + * The validity of this statistic depends on the assumption that the + * observations included in the model are drawn from a + * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html"> + * Bivariate Normal Distribution</a>.</p> + * + * @return half-width of 95% confidence interval for the slope estimate + * @throws MathException if the confidence interval can not be computed. + */ + public double getSlopeConfidenceInterval() throws MathException { + return getSlopeConfidenceInterval(0.05d); + } + + /** + * Returns the half-width of a (100-100*alpha)% confidence interval for + * the slope estimate. 
+ * <p> + * The (100-100*alpha)% confidence interval is </p> + * <p> + * <code>(getSlope() - getSlopeConfidenceInterval(), + * getSlope() + getSlopeConfidenceInterval())</code></p> + * <p> + * To request, for example, a 99% confidence interval, use + * <code>alpha = .01</code></p> + * <p> + * <strong>Usage Note</strong>:<br> + * The validity of this statistic depends on the assumption that the + * observations included in the model are drawn from a + * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html"> + * Bivariate Normal Distribution</a>.</p> + * <p> + * <strong> Preconditions:</strong><ul> + * <li>If there are fewer than <strong>three</strong> observations in the + * model, or if there is no variation in x, this returns + * <code>Double.NaN</code>. + * </li> + * <li><code>(0 < alpha < 1)</code>; otherwise (including a <code>NaN</code> alpha) an + * <code>IllegalArgumentException</code> is thrown. + * </li></ul></p> + * + * @param alpha the desired significance level + * @return half-width of the (100-100*alpha)% confidence interval for the slope estimate + * @throws MathException if the confidence interval can not be computed. + */
    public double getSlopeConfidenceInterval(double alpha)
        throws MathException {
        // Written as a negated "inside the open interval" test: every
        // comparison with NaN is false, so a NaN alpha is rejected here too
        // instead of being passed on to the distribution.
        if (!(alpha > 0 && alpha < 1)) {
            throw MathRuntimeException.createIllegalArgumentException(
                  LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL,
                  alpha, 0.0, 1.0);
        }
        // Half-width = s(b1) times the upper (1 - alpha/2) quantile.
        return getSlopeStdErr() *
            distribution.inverseCumulativeProbability(1d - alpha / 2d);
    }

    /**
     * Returns the significance level of the slope (equiv) correlation.
     * <p>
     * Specifically, the returned value is the smallest <code>alpha</code>
     * such that the slope confidence interval with significance level
     * equal to <code>alpha</code> does not include <code>0</code>.
+ * On regression output, this is often denoted <code>Prob(|t| > 0)</code> + * </p><p> + * <strong>Usage Note</strong>:<br> + * The validity of this statistic depends on the assumption that the + * observations included in the model are drawn from a + * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html"> + * Bivariate Normal Distribution</a>.</p> + * <p> + * If there are fewer that <strong>three</strong> observations in the + * model, or if there is no variation in x, this returns + * <code>Double.NaN</code>.</p> + * + * @return significance level for slope/correlation + * @throws MathException if the significance level can not be computed. + */ + public double getSignificance() throws MathException { + return 2d * (1.0 - distribution.cumulativeProbability( + FastMath.abs(getSlope()) / getSlopeStdErr())); + } + + // ---------------------Private methods----------------------------------- + + /** + * Returns the intercept of the estimated regression line, given the slope. + * <p> + * Will return <code>NaN</code> if slope is <code>NaN</code>.</p> + * + * @param slope current slope + * @return the intercept of the regression line + */ + private double getIntercept(double slope) { + return (sumY - slope * sumX) / n; + } + + /** + * Computes SSR from b1. + * + * @param slope regression slope estimate + * @return sum of squared deviations of predicted y values + */ + private double getRegressionSumSquares(double slope) { + return slope * slope * sumXX; + } + + /** + * Modify the distribution used to compute inference statistics. + * @param value the new distribution + * @since 1.2 + * @deprecated in 2.2 (to be removed in 3.0). 
+ */ + @Deprecated + public void setDistribution(TDistribution value) { + distribution = value; + + // bring the new distribution in line with the current sample: n - 2 degrees of freedom, applied only when n > 2 + if (n > 2) { + distribution.setDegreesOfFreedom(n - 2); + } + } +} diff --git a/src/main/java/org/apache/commons/math/stat/regression/package.html b/src/main/java/org/apache/commons/math/stat/regression/package.html new file mode 100644 index 0000000..2538c6e --- /dev/null +++ b/src/main/java/org/apache/commons/math/stat/regression/package.html @@ -0,0 +1,22 @@ +<html> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + --> + <!-- $Revision: 480440 $ $Date: 2006-11-29 08:14:12 +0100 (mer. 29 nov. 2006) $ --> + <body> + Statistical routines involving multivariate data. + </body> +</html> |