diff options
Diffstat (limited to 'src/main/java/org/apache/commons/math3/stat/ranking')
5 files changed, 645 insertions, 0 deletions
diff --git a/src/main/java/org/apache/commons/math3/stat/ranking/NaNStrategy.java b/src/main/java/org/apache/commons/math3/stat/ranking/NaNStrategy.java new file mode 100644 index 0000000..1a916ef --- /dev/null +++ b/src/main/java/org/apache/commons/math3/stat/ranking/NaNStrategy.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.math3.stat.ranking; + +/** + * Strategies for handling NaN values in rank transformations. + * <ul> + * <li>MINIMAL - NaNs are treated as minimal in the ordering, equivalent to + * (that is, tied with) <code>Double.NEGATIVE_INFINITY</code>.</li> + * <li>MAXIMAL - NaNs are treated as maximal in the ordering, equivalent to + * <code>Double.POSITIVE_INFINITY</code></li> + * <li>REMOVED - NaNs are removed before the rank transform is applied</li> + * <li>FIXED - NaNs are left "in place," that is the rank transformation is + * applied to the other elements in the input array, but the NaN elements + * are returned unchanged.</li> + * <li>FAILED - If any NaN is encountered in the input array, an appropriate + * exception is thrown</li> + * </ul> + * + * @since 2.0 + */ +public enum NaNStrategy { + + /** NaNs are considered minimal in the ordering */ + MINIMAL, + + /** NaNs are considered maximal in the ordering */ + MAXIMAL, + + /** NaNs are removed before computing ranks */ + REMOVED, + + /** NaNs are left in place */ + FIXED, + + /** NaNs result in an exception + * @since 3.1 + */ + FAILED +} diff --git a/src/main/java/org/apache/commons/math3/stat/ranking/NaturalRanking.java b/src/main/java/org/apache/commons/math3/stat/ranking/NaturalRanking.java new file mode 100644 index 0000000..6107c46 --- /dev/null +++ b/src/main/java/org/apache/commons/math3/stat/ranking/NaturalRanking.java @@ -0,0 +1,474 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.math3.stat.ranking; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; + +import org.apache.commons.math3.exception.MathInternalError; +import org.apache.commons.math3.exception.NotANumberException; +import org.apache.commons.math3.random.RandomDataGenerator; +import org.apache.commons.math3.random.RandomGenerator; +import org.apache.commons.math3.util.FastMath; + + +/** + * <p> Ranking based on the natural ordering on doubles.</p> + * <p>NaNs are treated according to the configured {@link NaNStrategy} and ties + * are handled using the selected {@link TiesStrategy}. + * Configuration settings are supplied in optional constructor arguments. + * Defaults are {@link NaNStrategy#FAILED} and {@link TiesStrategy#AVERAGE}, + * respectively. When using {@link TiesStrategy#RANDOM}, a + * {@link RandomGenerator} may be supplied as a constructor argument.</p> + * <p>Examples: + * <table border="1" cellpadding="3"> + * <tr><th colspan="3"> + * Input data: (20, 17, 30, 42.3, 17, 50, Double.NaN, Double.NEGATIVE_INFINITY, 17) + * </th></tr> + * <tr><th>NaNStrategy</th><th>TiesStrategy</th> + * <th><code>rank(data)</code></th> + * <tr> + * <td>default (NaNs maximal)</td> + * <td>default (ties averaged)</td> + * <td>(5, 3, 6, 7, 3, 8, 9, 1, 3)</td></tr> + * <tr> + * <td>default (NaNs maximal)</td> + * <td>MINIMUM</td> + * <td>(5, 2, 6, 7, 2, 8, 9, 1, 2)</td></tr> + * <tr> + * <td>MINIMAL</td> + * <td>default (ties averaged)</td> + * <td>(6, 4, 7, 8, 4, 9, 1.5, 1.5, 4)</td></tr> + * <tr> + * <td>REMOVED</td> + * <td>SEQUENTIAL</td> + * <td>(5, 2, 6, 7, 3, 8, 1, 4)</td></tr> + * <tr> + * <td>MINIMAL</td> + * <td>MAXIMUM</td> + * <td>(6, 5, 7, 8, 5, 9, 2, 2, 5)</td></tr></table></p> + * + * @since 2.0 + */ +public class NaturalRanking implements RankingAlgorithm { + + /** default NaN strategy */ + public static final NaNStrategy DEFAULT_NAN_STRATEGY = NaNStrategy.FAILED; + + /** default ties strategy */ + public static final TiesStrategy DEFAULT_TIES_STRATEGY = TiesStrategy.AVERAGE; + + /** NaN strategy - defaults to NaNs maximal */ + private final NaNStrategy nanStrategy; + + /** Ties strategy - defaults to ties averaged */ + private final TiesStrategy tiesStrategy; + + /** Source of random data - used only when ties strategy is RANDOM */ + private final RandomDataGenerator randomData; + + /** + * Create a NaturalRanking with default strategies for handling ties and NaNs. + */ + public NaturalRanking() { + super(); + tiesStrategy = DEFAULT_TIES_STRATEGY; + nanStrategy = DEFAULT_NAN_STRATEGY; + randomData = null; + } + + /** + * Create a NaturalRanking with the given TiesStrategy. + * + * @param tiesStrategy the TiesStrategy to use + */ + public NaturalRanking(TiesStrategy tiesStrategy) { + super(); + this.tiesStrategy = tiesStrategy; + nanStrategy = DEFAULT_NAN_STRATEGY; + randomData = new RandomDataGenerator(); + } + + /** + * Create a NaturalRanking with the given NaNStrategy. + * + * @param nanStrategy the NaNStrategy to use + */ + public NaturalRanking(NaNStrategy nanStrategy) { + super(); + this.nanStrategy = nanStrategy; + tiesStrategy = DEFAULT_TIES_STRATEGY; + randomData = null; + } + + /** + * Create a NaturalRanking with the given NaNStrategy and TiesStrategy. + * + * @param nanStrategy NaNStrategy to use + * @param tiesStrategy TiesStrategy to use + */ + public NaturalRanking(NaNStrategy nanStrategy, TiesStrategy tiesStrategy) { + super(); + this.nanStrategy = nanStrategy; + this.tiesStrategy = tiesStrategy; + randomData = new RandomDataGenerator(); + } + + /** + * Create a NaturalRanking with TiesStrategy.RANDOM and the given + * RandomGenerator as the source of random data. + * + * @param randomGenerator source of random data + */ + public NaturalRanking(RandomGenerator randomGenerator) { + super(); + this.tiesStrategy = TiesStrategy.RANDOM; + nanStrategy = DEFAULT_NAN_STRATEGY; + randomData = new RandomDataGenerator(randomGenerator); + } + + + /** + * Create a NaturalRanking with the given NaNStrategy, TiesStrategy.RANDOM + * and the given source of random data. + * + * @param nanStrategy NaNStrategy to use + * @param randomGenerator source of random data + */ + public NaturalRanking(NaNStrategy nanStrategy, + RandomGenerator randomGenerator) { + super(); + this.nanStrategy = nanStrategy; + this.tiesStrategy = TiesStrategy.RANDOM; + randomData = new RandomDataGenerator(randomGenerator); + } + + /** + * Return the NaNStrategy + * + * @return returns the NaNStrategy + */ + public NaNStrategy getNanStrategy() { + return nanStrategy; + } + + /** + * Return the TiesStrategy + * + * @return the TiesStrategy + */ + public TiesStrategy getTiesStrategy() { + return tiesStrategy; + } + + /** + * Rank <code>data</code> using the natural ordering on Doubles, with + * NaN values handled according to <code>nanStrategy</code> and ties + * resolved using <code>tiesStrategy.</code> + * + * @param data array to be ranked + * @return array of ranks + * @throws NotANumberException if the selected {@link NaNStrategy} is {@code FAILED} + * and a {@link Double#NaN} is encountered in the input data + */ + public double[] rank(double[] data) { + + // Array recording initial positions of data to be ranked + IntDoublePair[] ranks = new IntDoublePair[data.length]; + for (int i = 0; i < data.length; i++) { + ranks[i] = new IntDoublePair(data[i], i); + } + + // Recode, remove or record positions of NaNs + List<Integer> nanPositions = null; + switch (nanStrategy) { + case MAXIMAL: // Replace NaNs with +INFs + recodeNaNs(ranks, Double.POSITIVE_INFINITY); + break; + case MINIMAL: // Replace NaNs with -INFs + recodeNaNs(ranks, Double.NEGATIVE_INFINITY); + break; + case REMOVED: // Drop NaNs from data + ranks = removeNaNs(ranks); + break; + case FIXED: // Record positions of NaNs + nanPositions = getNanPositions(ranks); + break; + case FAILED: + nanPositions = getNanPositions(ranks); + if (nanPositions.size() > 0) { + throw new NotANumberException(); + } + break; + default: // this should not happen unless NaNStrategy enum is changed + throw new MathInternalError(); + } + + // Sort the IntDoublePairs + Arrays.sort(ranks); + + // Walk the sorted array, filling output array using sorted positions, + // resolving ties as we go + double[] out = new double[ranks.length]; + int pos = 1; // position in sorted array + out[ranks[0].getPosition()] = pos; + List<Integer> tiesTrace = new ArrayList<Integer>(); + tiesTrace.add(ranks[0].getPosition()); + for (int i = 1; i < ranks.length; i++) { + if (Double.compare(ranks[i].getValue(), ranks[i - 1].getValue()) > 0) { + // tie sequence has ended (or had length 1) + pos = i + 1; + if (tiesTrace.size() > 1) { // if seq is nontrivial, resolve + resolveTie(out, tiesTrace); + } + tiesTrace = new ArrayList<Integer>(); + tiesTrace.add(ranks[i].getPosition()); + } else { + // tie sequence continues + tiesTrace.add(ranks[i].getPosition()); + } + out[ranks[i].getPosition()] = pos; + } + if (tiesTrace.size() > 1) { // handle tie sequence at end + resolveTie(out, tiesTrace); + } + if (nanStrategy == NaNStrategy.FIXED) { + restoreNaNs(out, nanPositions); + } + return out; + } + + /** + * Returns an array that is a copy of the input array with IntDoublePairs + * having NaN values removed. + * + * @param ranks input array + * @return array with NaN-valued entries removed + */ + private IntDoublePair[] removeNaNs(IntDoublePair[] ranks) { + if (!containsNaNs(ranks)) { + return ranks; + } + IntDoublePair[] outRanks = new IntDoublePair[ranks.length]; + int j = 0; + for (int i = 0; i < ranks.length; i++) { + if (Double.isNaN(ranks[i].getValue())) { + // drop, but adjust original ranks of later elements + for (int k = i + 1; k < ranks.length; k++) { + ranks[k] = new IntDoublePair( + ranks[k].getValue(), ranks[k].getPosition() - 1); + } + } else { + outRanks[j] = new IntDoublePair( + ranks[i].getValue(), ranks[i].getPosition()); + j++; + } + } + IntDoublePair[] returnRanks = new IntDoublePair[j]; + System.arraycopy(outRanks, 0, returnRanks, 0, j); + return returnRanks; + } + + /** + * Recodes NaN values to the given value. + * + * @param ranks array to recode + * @param value the value to replace NaNs with + */ + private void recodeNaNs(IntDoublePair[] ranks, double value) { + for (int i = 0; i < ranks.length; i++) { + if (Double.isNaN(ranks[i].getValue())) { + ranks[i] = new IntDoublePair( + value, ranks[i].getPosition()); + } + } + } + + /** + * Checks for presence of NaNs in <code>ranks.</code> + * + * @param ranks array to be searched for NaNs + * @return true iff ranks contains one or more NaNs + */ + private boolean containsNaNs(IntDoublePair[] ranks) { + for (int i = 0; i < ranks.length; i++) { + if (Double.isNaN(ranks[i].getValue())) { + return true; + } + } + return false; + } + + /** + * Resolve a sequence of ties, using the configured {@link TiesStrategy}. + * The input <code>ranks</code> array is expected to take the same value + * for all indices in <code>tiesTrace</code>. The common value is recoded + * according to the tiesStrategy. For example, if ranks = <5,8,2,6,2,7,1,2>, + * tiesTrace = <2,4,7> and tiesStrategy is MINIMUM, ranks will be unchanged. + * The same array and trace with tiesStrategy AVERAGE will come out + * <5,8,3,6,3,7,1,3>. + * + * @param ranks array of ranks + * @param tiesTrace list of indices where <code>ranks</code> is constant + * -- that is, for any i and j in TiesTrace, <code> ranks[i] == ranks[j] + * </code> + */ + private void resolveTie(double[] ranks, List<Integer> tiesTrace) { + + // constant value of ranks over tiesTrace + final double c = ranks[tiesTrace.get(0)]; + + // length of sequence of tied ranks + final int length = tiesTrace.size(); + + switch (tiesStrategy) { + case AVERAGE: // Replace ranks with average + fill(ranks, tiesTrace, (2 * c + length - 1) / 2d); + break; + case MAXIMUM: // Replace ranks with maximum values + fill(ranks, tiesTrace, c + length - 1); + break; + case MINIMUM: // Replace ties with minimum + fill(ranks, tiesTrace, c); + break; + case RANDOM: // Fill with random integral values in [c, c + length - 1] + Iterator<Integer> iterator = tiesTrace.iterator(); + long f = FastMath.round(c); + while (iterator.hasNext()) { + // No advertised exception because args are guaranteed valid + ranks[iterator.next()] = + randomData.nextLong(f, f + length - 1); + } + break; + case SEQUENTIAL: // Fill sequentially from c to c + length - 1 + // walk and fill + iterator = tiesTrace.iterator(); + f = FastMath.round(c); + int i = 0; + while (iterator.hasNext()) { + ranks[iterator.next()] = f + i++; + } + break; + default: // this should not happen unless TiesStrategy enum is changed + throw new MathInternalError(); + } + } + + /** + * Sets<code>data[i] = value</code> for each i in <code>tiesTrace.</code> + * + * @param data array to modify + * @param tiesTrace list of index values to set + * @param value value to set + */ + private void fill(double[] data, List<Integer> tiesTrace, double value) { + Iterator<Integer> iterator = tiesTrace.iterator(); + while (iterator.hasNext()) { + data[iterator.next()] = value; + } + } + + /** + * Set <code>ranks[i] = Double.NaN</code> for each i in <code>nanPositions.</code> + * + * @param ranks array to modify + * @param nanPositions list of index values to set to <code>Double.NaN</code> + */ + private void restoreNaNs(double[] ranks, List<Integer> nanPositions) { + if (nanPositions.size() == 0) { + return; + } + Iterator<Integer> iterator = nanPositions.iterator(); + while (iterator.hasNext()) { + ranks[iterator.next().intValue()] = Double.NaN; + } + + } + + /** + * Returns a list of indexes where <code>ranks</code> is <code>NaN.</code> + * + * @param ranks array to search for <code>NaNs</code> + * @return list of indexes i such that <code>ranks[i] = NaN</code> + */ + private List<Integer> getNanPositions(IntDoublePair[] ranks) { + ArrayList<Integer> out = new ArrayList<Integer>(); + for (int i = 0; i < ranks.length; i++) { + if (Double.isNaN(ranks[i].getValue())) { + out.add(Integer.valueOf(i)); + } + } + return out; + } + + /** + * Represents the position of a double value in an ordering. + * Comparable interface is implemented so Arrays.sort can be used + * to sort an array of IntDoublePairs by value. Note that the + * implicitly defined natural ordering is NOT consistent with equals. + */ + private static class IntDoublePair implements Comparable<IntDoublePair> { + + /** Value of the pair */ + private final double value; + + /** Original position of the pair */ + private final int position; + + /** + * Construct an IntDoublePair with the given value and position. + * @param value the value of the pair + * @param position the original position + */ + IntDoublePair(double value, int position) { + this.value = value; + this.position = position; + } + + /** + * Compare this IntDoublePair to another pair. + * Only the <strong>values</strong> are compared. + * + * @param other the other pair to compare this to + * @return result of <code>Double.compare(value, other.value)</code> + */ + public int compareTo(IntDoublePair other) { + return Double.compare(value, other.value); + } + + // N.B. equals() and hashCode() are not implemented; see MATH-610 for discussion. + + /** + * Returns the value of the pair. + * @return value + */ + public double getValue() { + return value; + } + + /** + * Returns the original position of the pair. + * @return position + */ + public int getPosition() { + return position; + } + } +} diff --git a/src/main/java/org/apache/commons/math3/stat/ranking/RankingAlgorithm.java b/src/main/java/org/apache/commons/math3/stat/ranking/RankingAlgorithm.java new file mode 100644 index 0000000..188bc99 --- /dev/null +++ b/src/main/java/org/apache/commons/math3/stat/ranking/RankingAlgorithm.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.math3.stat.ranking; + +/** + * Interface representing a rank transformation. + * + * @since 2.0 + */ +public interface RankingAlgorithm { + /** + * <p>Performs a rank transformation on the input data, returning an array + * of ranks.</p> + * + * <p>Ranks should be 1-based - that is, the smallest value + * returned in an array of ranks should be greater than or equal to one, + * rather than 0. Ranks should in general take integer values, though + * implementations may return averages or other floating point values + * to resolve ties in the input data.</p> + * + * @param data array of data to be ranked + * @return an array of ranks corresponding to the elements of the input array + */ + double[] rank (double[] data); +} diff --git a/src/main/java/org/apache/commons/math3/stat/ranking/TiesStrategy.java b/src/main/java/org/apache/commons/math3/stat/ranking/TiesStrategy.java new file mode 100644 index 0000000..08ab99a --- /dev/null +++ b/src/main/java/org/apache/commons/math3/stat/ranking/TiesStrategy.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.math3.stat.ranking; + +/** + * Strategies for handling tied values in rank transformations. + * <ul> + * <li>SEQUENTIAL - Ties are assigned ranks in order of occurrence in the original array, + * for example (1,3,4,3) is ranked as (1,2,4,3)</li> + * <li>MINIMUM - Tied values are assigned the minimum applicable rank, or the rank + * of the first occurrence. For example, (1,3,4,3) is ranked as (1,2,4,2)</li> + * <li>MAXIMUM - Tied values are assigned the maximum applicable rank, or the rank + * of the last occurrence. For example, (1,3,4,3) is ranked as (1,3,4,3)</li> + * <li>AVERAGE - Tied values are assigned the average of the applicable ranks. + * For example, (1,3,4,3) is ranked as (1,2.5,4,2.5)</li> + * <li>RANDOM - Tied values are assigned a random integer rank from among the + * applicable values. The assigned rank will always be an integer, (inclusively) + * between the values returned by the MINIMUM and MAXIMUM strategies.</li> + * </ul> + * + * @since 2.0 + */ +public enum TiesStrategy { + + /** Ties assigned sequential ranks in order of occurrence */ + SEQUENTIAL, + + /** Ties get the minimum applicable rank */ + MINIMUM, + + /** Ties get the maximum applicable rank */ + MAXIMUM, + + /** Ties get the average of applicable ranks */ + AVERAGE, + + /** Ties get a random integral value from among applicable ranks */ + RANDOM +} diff --git a/src/main/java/org/apache/commons/math3/stat/ranking/package-info.java b/src/main/java/org/apache/commons/math3/stat/ranking/package-info.java new file mode 100644 index 0000000..b86575b --- /dev/null +++ b/src/main/java/org/apache/commons/math3/stat/ranking/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * + * Classes providing rank transformations. + * + */ +package org.apache.commons.math3.stat.ranking; |