diff options
Diffstat (limited to 'src/main/java/org/apache/commons/math3/distribution/ZipfDistribution.java')
-rw-r--r-- | src/main/java/org/apache/commons/math3/distribution/ZipfDistribution.java | 502 |
1 files changed, 502 insertions, 0 deletions
diff --git a/src/main/java/org/apache/commons/math3/distribution/ZipfDistribution.java b/src/main/java/org/apache/commons/math3/distribution/ZipfDistribution.java new file mode 100644 index 0000000..d452122 --- /dev/null +++ b/src/main/java/org/apache/commons/math3/distribution/ZipfDistribution.java @@ -0,0 +1,502 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.math3.distribution; + +import org.apache.commons.math3.exception.NotStrictlyPositiveException; +import org.apache.commons.math3.exception.util.LocalizedFormats; +import org.apache.commons.math3.random.RandomGenerator; +import org.apache.commons.math3.random.Well19937c; +import org.apache.commons.math3.util.FastMath; + +/** + * Implementation of the Zipf distribution. + * + * <p><strong>Parameters:</strong> For a random variable {@code X} whose values are distributed + * according to this distribution, the probability mass function is given by + * + * <pre> + * P(X = k) = H(N,s) * 1 / k^s for {@code k = 1,2,...,N}. + * </pre> + * + * {@code H(N,s)} is the normalizing constant which corresponds to the generalized harmonic number + * of order N of s. + * + * <p> + * + * <ul> + * <li>{@code N} is the number of elements + * <li>{@code s} is the exponent + * </ul> + * + * @see <a href="https://en.wikipedia.org/wiki/Zipf's_law">Zipf's law (Wikipedia)</a> + * @see <a + * href="https://en.wikipedia.org/wiki/Harmonic_number#Generalized_harmonic_numbers">Generalized + * harmonic numbers</a> + */ +public class ZipfDistribution extends AbstractIntegerDistribution { + /** Serializable version identifier. */ + private static final long serialVersionUID = -140627372283420404L; + + /** Number of elements. */ + private final int numberOfElements; + + /** Exponent parameter of the distribution. */ + private final double exponent; + + /** Cached numerical mean */ + private double numericalMean = Double.NaN; + + /** Whether or not the numerical mean has been calculated */ + private boolean numericalMeanIsCalculated = false; + + /** Cached numerical variance */ + private double numericalVariance = Double.NaN; + + /** Whether or not the numerical variance has been calculated */ + private boolean numericalVarianceIsCalculated = false; + + /** The sampler to be used for the sample() method */ + private transient ZipfRejectionInversionSampler sampler; + + /** + * Create a new Zipf distribution with the given number of elements and exponent. + * + * <p><b>Note:</b> this constructor will implicitly create an instance of {@link Well19937c} as + * random generator to be used for sampling only (see {@link #sample()} and {@link + * #sample(int)}). In case no sampling is needed for the created distribution, it is advised to + * pass {@code null} as random generator via the appropriate constructors to avoid the + * additional initialisation overhead. + * + * @param numberOfElements Number of elements. + * @param exponent Exponent. + * @exception NotStrictlyPositiveException if {@code numberOfElements <= 0} or {@code exponent + * <= 0}. + */ + public ZipfDistribution(final int numberOfElements, final double exponent) { + this(new Well19937c(), numberOfElements, exponent); + } + + /** + * Creates a Zipf distribution. + * + * @param rng Random number generator. + * @param numberOfElements Number of elements. + * @param exponent Exponent. + * @exception NotStrictlyPositiveException if {@code numberOfElements <= 0} or {@code exponent + * <= 0}. + * @since 3.1 + */ + public ZipfDistribution(RandomGenerator rng, int numberOfElements, double exponent) + throws NotStrictlyPositiveException { + super(rng); + + if (numberOfElements <= 0) { + throw new NotStrictlyPositiveException(LocalizedFormats.DIMENSION, numberOfElements); + } + if (exponent <= 0) { + throw new NotStrictlyPositiveException(LocalizedFormats.EXPONENT, exponent); + } + + this.numberOfElements = numberOfElements; + this.exponent = exponent; + } + + /** + * Get the number of elements (e.g. corpus size) for the distribution. + * + * @return the number of elements + */ + public int getNumberOfElements() { + return numberOfElements; + } + + /** + * Get the exponent characterizing the distribution. + * + * @return the exponent + */ + public double getExponent() { + return exponent; + } + + /** {@inheritDoc} */ + public double probability(final int x) { + if (x <= 0 || x > numberOfElements) { + return 0.0; + } + + return (1.0 / FastMath.pow(x, exponent)) / generalizedHarmonic(numberOfElements, exponent); + } + + /** {@inheritDoc} */ + @Override + public double logProbability(int x) { + if (x <= 0 || x > numberOfElements) { + return Double.NEGATIVE_INFINITY; + } + + return -FastMath.log(x) * exponent + - FastMath.log(generalizedHarmonic(numberOfElements, exponent)); + } + + /** {@inheritDoc} */ + public double cumulativeProbability(final int x) { + if (x <= 0) { + return 0.0; + } else if (x >= numberOfElements) { + return 1.0; + } + + return generalizedHarmonic(x, exponent) / generalizedHarmonic(numberOfElements, exponent); + } + + /** + * {@inheritDoc} + * + * <p>For number of elements {@code N} and exponent {@code s}, the mean is {@code Hs1 / Hs}, + * where + * + * <ul> + * <li>{@code Hs1 = generalizedHarmonic(N, s - 1)}, + * <li>{@code Hs = generalizedHarmonic(N, s)}. + * </ul> + */ + public double getNumericalMean() { + if (!numericalMeanIsCalculated) { + numericalMean = calculateNumericalMean(); + numericalMeanIsCalculated = true; + } + return numericalMean; + } + + /** + * Used by {@link #getNumericalMean()}. + * + * @return the mean of this distribution + */ + protected double calculateNumericalMean() { + final int N = getNumberOfElements(); + final double s = getExponent(); + + final double Hs1 = generalizedHarmonic(N, s - 1); + final double Hs = generalizedHarmonic(N, s); + + return Hs1 / Hs; + } + + /** + * {@inheritDoc} + * + * <p>For number of elements {@code N} and exponent {@code s}, the mean is {@code (Hs2 / Hs) - + * (Hs1^2 / Hs^2)}, where + * + * <ul> + * <li>{@code Hs2 = generalizedHarmonic(N, s - 2)}, + * <li>{@code Hs1 = generalizedHarmonic(N, s - 1)}, + * <li>{@code Hs = generalizedHarmonic(N, s)}. + * </ul> + */ + public double getNumericalVariance() { + if (!numericalVarianceIsCalculated) { + numericalVariance = calculateNumericalVariance(); + numericalVarianceIsCalculated = true; + } + return numericalVariance; + } + + /** + * Used by {@link #getNumericalVariance()}. + * + * @return the variance of this distribution + */ + protected double calculateNumericalVariance() { + final int N = getNumberOfElements(); + final double s = getExponent(); + + final double Hs2 = generalizedHarmonic(N, s - 2); + final double Hs1 = generalizedHarmonic(N, s - 1); + final double Hs = generalizedHarmonic(N, s); + + return (Hs2 / Hs) - ((Hs1 * Hs1) / (Hs * Hs)); + } + + /** + * Calculates the Nth generalized harmonic number. See <a + * href="http://mathworld.wolfram.com/HarmonicSeries.html">Harmonic Series</a>. + * + * @param n Term in the series to calculate (must be larger than 1) + * @param m Exponent (special case {@code m = 1} is the harmonic series). + * @return the n<sup>th</sup> generalized harmonic number. + */ + private double generalizedHarmonic(final int n, final double m) { + double value = 0; + for (int k = n; k > 0; --k) { + value += 1.0 / FastMath.pow(k, m); + } + return value; + } + + /** + * {@inheritDoc} + * + * <p>The lower bound of the support is always 1 no matter the parameters. + * + * @return lower bound of the support (always 1) + */ + public int getSupportLowerBound() { + return 1; + } + + /** + * {@inheritDoc} + * + * <p>The upper bound of the support is the number of elements. + * + * @return upper bound of the support + */ + public int getSupportUpperBound() { + return getNumberOfElements(); + } + + /** + * {@inheritDoc} + * + * <p>The support of this distribution is connected. + * + * @return {@code true} + */ + public boolean isSupportConnected() { + return true; + } + + /** {@inheritDoc} */ + @Override + public int sample() { + if (sampler == null) { + sampler = new ZipfRejectionInversionSampler(numberOfElements, exponent); + } + return sampler.sample(random); + } + + /** + * Utility class implementing a rejection inversion sampling method for a discrete, bounded Zipf + * distribution that is based on the method described in + * + * <p>Wolfgang Hörmann and Gerhard Derflinger "Rejection-inversion to generate variates from + * monotone discrete distributions." ACM Transactions on Modeling and Computer Simulation + * (TOMACS) 6.3 (1996): 169-184. + * + * <p>The paper describes an algorithm for exponents larger than 1 (Algorithm ZRI). The original + * method uses {@code H(x) := (v + x)^(1 - q) / (1 - q)} as the integral of the hat function. + * This function is undefined for q = 1, which is the reason for the limitation of the exponent. + * If instead the integral function {@code H(x) := ((v + x)^(1 - q) - 1) / (1 - q)} is used, for + * which a meaningful limit exists for q = 1, the method works for all positive exponents. + * + * <p>The following implementation uses v := 0 and generates integral numbers in the range [1, + * numberOfElements]. This is different to the original method where v is defined to be positive + * and numbers are taken from [0, i_max]. This explains why the implementation looks slightly + * different. + * + * @since 3.6 + */ + static final class ZipfRejectionInversionSampler { + + /** Exponent parameter of the distribution. */ + private final double exponent; + + /** Number of elements. */ + private final int numberOfElements; + + /** Constant equal to {@code hIntegral(1.5) - 1}. */ + private final double hIntegralX1; + + /** Constant equal to {@code hIntegral(numberOfElements + 0.5)}. */ + private final double hIntegralNumberOfElements; + + /** Constant equal to {@code 2 - hIntegralInverse(hIntegral(2.5) - h(2)}. */ + private final double s; + + /** + * Simple constructor. + * + * @param numberOfElements number of elements + * @param exponent exponent parameter of the distribution + */ + ZipfRejectionInversionSampler(final int numberOfElements, final double exponent) { + this.exponent = exponent; + this.numberOfElements = numberOfElements; + this.hIntegralX1 = hIntegral(1.5) - 1d; + this.hIntegralNumberOfElements = hIntegral(numberOfElements + 0.5); + this.s = 2d - hIntegralInverse(hIntegral(2.5) - h(2)); + } + + /** + * Generate one integral number in the range [1, numberOfElements]. + * + * @param random random generator to use + * @return generated integral number in the range [1, numberOfElements] + */ + int sample(final RandomGenerator random) { + while (true) { + + final double u = + hIntegralNumberOfElements + + random.nextDouble() * (hIntegralX1 - hIntegralNumberOfElements); + // u is uniformly distributed in (hIntegralX1, hIntegralNumberOfElements] + + double x = hIntegralInverse(u); + + int k = (int) (x + 0.5); + + // Limit k to the range [1, numberOfElements] + // (k could be outside due to numerical inaccuracies) + if (k < 1) { + k = 1; + } else if (k > numberOfElements) { + k = numberOfElements; + } + + // Here, the distribution of k is given by: + // + // P(k = 1) = C * (hIntegral(1.5) - hIntegralX1) = C + // P(k = m) = C * (hIntegral(m + 1/2) - hIntegral(m - 1/2)) for m >= 2 + // + // where C := 1 / (hIntegralNumberOfElements - hIntegralX1) + + if (k - x <= s || u >= hIntegral(k + 0.5) - h(k)) { + + // Case k = 1: + // + // The right inequality is always true, because replacing k by 1 gives + // u >= hIntegral(1.5) - h(1) = hIntegralX1 and u is taken from + // (hIntegralX1, hIntegralNumberOfElements]. + // + // Therefore, the acceptance rate for k = 1 is P(accepted | k = 1) = 1 + // and the probability that 1 is returned as random value is + // P(k = 1 and accepted) = P(accepted | k = 1) * P(k = 1) = C = C / 1^exponent + // + // Case k >= 2: + // + // The left inequality (k - x <= s) is just a short cut + // to avoid the more expensive evaluation of the right inequality + // (u >= hIntegral(k + 0.5) - h(k)) in many cases. + // + // If the left inequality is true, the right inequality is also true: + // Theorem 2 in the paper is valid for all positive exponents, because + // the requirements h'(x) = -exponent/x^(exponent + 1) < 0 and + // (-1/hInverse'(x))'' = (1+1/exponent) * x^(1/exponent-1) >= 0 + // are both fulfilled. + // Therefore, f(x) := x - hIntegralInverse(hIntegral(x + 0.5) - h(x)) + // is a non-decreasing function. If k - x <= s holds, + // k - x <= s + f(k) - f(2) is obviously also true which is equivalent to + // -x <= -hIntegralInverse(hIntegral(k + 0.5) - h(k)), + // -hIntegralInverse(u) <= -hIntegralInverse(hIntegral(k + 0.5) - h(k)), + // and finally u >= hIntegral(k + 0.5) - h(k). + // + // Hence, the right inequality determines the acceptance rate: + // P(accepted | k = m) = h(m) / (hIntegrated(m+1/2) - hIntegrated(m-1/2)) + // The probability that m is returned is given by + // P(k = m and accepted) = P(accepted | k = m) * P(k = m) = C * h(m) = C / + // m^exponent. + // + // In both cases the probabilities are proportional to the probability mass + // function + // of the Zipf distribution. + + return k; + } + } + } + + /** + * {@code H(x) :=} + * + * <ul> + * <li>{@code (x^(1-exponent) - 1)/(1 - exponent)}, if {@code exponent != 1} + * <li>{@code log(x)}, if {@code exponent == 1} + * </ul> + * + * H(x) is an integral function of h(x), the derivative of H(x) is h(x). + * + * @param x free parameter + * @return {@code H(x)} + */ + private double hIntegral(final double x) { + final double logX = FastMath.log(x); + return helper2((1d - exponent) * logX) * logX; + } + + /** + * {@code h(x) := 1/x^exponent} + * + * @param x free parameter + * @return h(x) + */ + private double h(final double x) { + return FastMath.exp(-exponent * FastMath.log(x)); + } + + /** + * The inverse function of H(x). + * + * @param x free parameter + * @return y for which {@code H(y) = x} + */ + private double hIntegralInverse(final double x) { + double t = x * (1d - exponent); + if (t < -1d) { + // Limit value to the range [-1, +inf). + // t could be smaller than -1 in some rare cases due to numerical errors. + t = -1; + } + return FastMath.exp(helper1(t) * x); + } + + /** + * Helper function that calculates {@code log(1+x)/x}. + * + * <p>A Taylor series expansion is used, if x is close to 0. + * + * @param x a value larger than or equal to -1 + * @return {@code log(1+x)/x} + */ + static double helper1(final double x) { + if (FastMath.abs(x) > 1e-8) { + return FastMath.log1p(x) / x; + } else { + return 1. - x * ((1. / 2.) - x * ((1. / 3.) - x * (1. / 4.))); + } + } + + /** + * Helper function to calculate {@code (exp(x)-1)/x}. + * + * <p>A Taylor series expansion is used, if x is close to 0. + * + * @param x free parameter + * @return {@code (exp(x)-1)/x} if x is non-zero, or 1 if x=0 + */ + static double helper2(final double x) { + if (FastMath.abs(x) > 1e-8) { + return FastMath.expm1(x) / x; + } else { + return 1. + x * (1. / 2.) * (1. + x * (1. / 3.) * (1. + x * (1. / 4.))); + } + } + } +} |