1 files changed, 502 insertions, 0 deletions
diff --git a/src/main/java/org/apache/commons/math3/distribution/ZipfDistribution.java b/src/main/java/org/apache/commons/math3/distribution/ZipfDistribution.java
new file mode 100644
index 0000000..d452122
--- /dev/null
+++ b/src/main/java/org/apache/commons/math3/distribution/ZipfDistribution.java
@@ -0,0 +1,502 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.commons.math3.distribution;
+
+import org.apache.commons.math3.exception.NotStrictlyPositiveException;
+import org.apache.commons.math3.exception.util.LocalizedFormats;
+import org.apache.commons.math3.random.RandomGenerator;
+import org.apache.commons.math3.random.Well19937c;
+import org.apache.commons.math3.util.FastMath;
+
+/**
+ * Implementation of the Zipf distribution.
+ *
+ * <p><strong>Parameters:</strong> For a random variable {@code X} whose values are distributed
+ * according to this distribution, the probability mass function is given by
+ *
+ * <pre>
+ *   P(X = k) = H(N,s) * 1 / k^s    for {@code k = 1,2,...,N}.
+ * </pre>
+ *
+ * {@code H(N,s)} is the normalizing constant which corresponds to the generalized harmonic number
+ * of order N of s.
+ *
+ * <p>
+ *
+ * <ul>
+ *   <li>{@code N} is the number of elements
+ *   <li>{@code s} is the exponent
+ * </ul>
+ *
+ * @see <a href="https://en.wikipedia.org/wiki/Zipf's_law">Zipf's law (Wikipedia)</a>
+ * @see <a
+ *     href="https://en.wikipedia.org/wiki/Harmonic_number#Generalized_harmonic_numbers">Generalized
+ *     harmonic numbers</a>
+ */
+public class ZipfDistribution extends AbstractIntegerDistribution {
+    /** Serializable version identifier. */
+    private static final long serialVersionUID = -140627372283420404L;
+
+    /** Number of elements. */
+    private final int numberOfElements;
+
+    /** Exponent parameter of the distribution. */
+    private final double exponent;
+
+    /** Cached numerical mean */
+    private double numericalMean = Double.NaN;
+
+    /** Whether or not the numerical mean has been calculated */
+    private boolean numericalMeanIsCalculated = false;
+
+    /** Cached numerical variance */
+    private double numericalVariance = Double.NaN;
+
+    /** Whether or not the numerical variance has been calculated */
+    private boolean numericalVarianceIsCalculated = false;
+
+    /** The sampler to be used for the sample() method */
+    private transient ZipfRejectionInversionSampler sampler;
+
+    /**
+     * Create a new Zipf distribution with the given number of elements and exponent.
+     *
+     * <p><b>Note:</b> this constructor will implicitly create an instance of {@link Well19937c} as
+     * random generator to be used for sampling only (see {@link #sample()} and {@link
+     * #sample(int)}). In case no sampling is needed for the created distribution, it is advised to
+     * pass {@code null} as random generator via the appropriate constructors to avoid the
+     * additional initialisation overhead.
+     *
+     * @param numberOfElements Number of elements.
+     * @param exponent Exponent.
+     * @exception NotStrictlyPositiveException if {@code numberOfElements <= 0} or {@code exponent
+     *     <= 0}.
+     */
+    public ZipfDistribution(final int numberOfElements, final double exponent) {
+        this(new Well19937c(), numberOfElements, exponent);
+    }
+
+    /**
+     * Creates a Zipf distribution.
+     *
+     * @param rng Random number generator.
+     * @param numberOfElements Number of elements.
+     * @param exponent Exponent.
+     * @exception NotStrictlyPositiveException if {@code numberOfElements <= 0} or {@code exponent
+     *     <= 0}.
+     * @since 3.1
+     */
+    public ZipfDistribution(RandomGenerator rng, int numberOfElements, double exponent)
+            throws NotStrictlyPositiveException {
+        super(rng);
+
+        if (numberOfElements <= 0) {
+            throw new NotStrictlyPositiveException(LocalizedFormats.DIMENSION, numberOfElements);
+        }
+        if (exponent <= 0) {
+            throw new NotStrictlyPositiveException(LocalizedFormats.EXPONENT, exponent);
+        }
+
+        this.numberOfElements = numberOfElements;
+        this.exponent = exponent;
+    }
+
+    /**
+     * Get the number of elements (e.g. corpus size) for the distribution.
+     *
+     * @return the number of elements
+     */
+    public int getNumberOfElements() {
+        return numberOfElements;
+    }
+
+    /**
+     * Get the exponent characterizing the distribution.
+     *
+     * @return the exponent
+     */
+    public double getExponent() {
+        return exponent;
+    }
+
+    /** {@inheritDoc} */
+    public double probability(final int x) {
+        if (x <= 0 || x > numberOfElements) {
+            return 0.0;
+        }
+
+        return (1.0 / FastMath.pow(x, exponent)) / generalizedHarmonic(numberOfElements, exponent);
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public double logProbability(int x) {
+        if (x <= 0 || x > numberOfElements) {
+            return Double.NEGATIVE_INFINITY;
+        }
+
+        return -FastMath.log(x) * exponent
+                - FastMath.log(generalizedHarmonic(numberOfElements, exponent));
+    }
+
+    /** {@inheritDoc} */
+    public double cumulativeProbability(final int x) {
+        if (x <= 0) {
+            return 0.0;
+        } else if (x >= numberOfElements) {
+            return 1.0;
+        }
+
+        return generalizedHarmonic(x, exponent) / generalizedHarmonic(numberOfElements, exponent);
+    }
+
+    /**
+     * {@inheritDoc}
+     *
+     * <p>For number of elements {@code N} and exponent {@code s}, the mean is {@code Hs1 / Hs},
+     * where
+     *
+     * <ul>
+     *   <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},
+     *   <li>{@code Hs = generalizedHarmonic(N, s)}.
+     * </ul>
+     */
+    public double getNumericalMean() {
+        if (!numericalMeanIsCalculated) {
+            numericalMean = calculateNumericalMean();
+            numericalMeanIsCalculated = true;
+        }
+        return numericalMean;
+    }
+
+    /**
+     * Used by {@link #getNumericalMean()}.
+     *
+     * @return the mean of this distribution
+     */
+    protected double calculateNumericalMean() {
+        final int N = getNumberOfElements();
+        final double s = getExponent();
+
+        final double Hs1 = generalizedHarmonic(N, s - 1);
+        final double Hs = generalizedHarmonic(N, s);
+
+        return Hs1 / Hs;
+    }
+
+    /**
+     * {@inheritDoc}
+     *
+     * <p>For number of elements {@code N} and exponent {@code s}, the mean is {@code (Hs2 / Hs) -
+     * (Hs1^2 / Hs^2)}, where
+     *
+     * <ul>
+     *   <li>{@code Hs2 = generalizedHarmonic(N, s - 2)},
+     *   <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},
+     *   <li>{@code Hs = generalizedHarmonic(N, s)}.
+     * </ul>
+     */
+    public double getNumericalVariance() {
+        if (!numericalVarianceIsCalculated) {
+            numericalVariance = calculateNumericalVariance();
+            numericalVarianceIsCalculated = true;
+        }
+        return numericalVariance;
+    }
+
+    /**
+     * Used by {@link #getNumericalVariance()}.
+     *
+     * @return the variance of this distribution
+     */
+    protected double calculateNumericalVariance() {
+        final int N = getNumberOfElements();
+        final double s = getExponent();
+
+        final double Hs2 = generalizedHarmonic(N, s - 2);
+        final double Hs1 = generalizedHarmonic(N, s - 1);
+        final double Hs = generalizedHarmonic(N, s);
+
+        return (Hs2 / Hs) - ((Hs1 * Hs1) / (Hs * Hs));
+    }
+
+    /**
+     * Calculates the Nth generalized harmonic number. See <a
+     * href="http://mathworld.wolfram.com/HarmonicSeries.html">Harmonic Series</a>.
+     *
+     * @param n Term in the series to calculate (must be larger than 1)
+     * @param m Exponent (special case {@code m = 1} is the harmonic series).
+     * @return the n<sup>th</sup> generalized harmonic number.
+     */
+    private double generalizedHarmonic(final int n, final double m) {
+        double value = 0;
+        for (int k = n; k > 0; --k) {
+            value += 1.0 / FastMath.pow(k, m);
+        }
+        return value;
+    }
+
+    /**
+     * {@inheritDoc}
+     *
+     * <p>The lower bound of the support is always 1 no matter the parameters.
+     *
+     * @return lower bound of the support (always 1)
+     */
+    public int getSupportLowerBound() {
+        return 1;
+    }
+
+    /**
+     * {@inheritDoc}
+     *
+     * <p>The upper bound of the support is the number of elements.
+     *
+     * @return upper bound of the support
+     */
+    public int getSupportUpperBound() {
+        return getNumberOfElements();
+    }
+
+    /**
+     * {@inheritDoc}
+     *
+     * <p>The support of this distribution is connected.
+     *
+     * @return {@code true}
+     */
+    public boolean isSupportConnected() {
+        return true;
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public int sample() {
+        if (sampler == null) {
+            sampler = new ZipfRejectionInversionSampler(numberOfElements, exponent);
+        }
+        return sampler.sample(random);
+    }
+
+    /**
+     * Utility class implementing a rejection inversion sampling method for a discrete, bounded Zipf
+     * distribution that is based on the method described in
+     *
+     * <p>Wolfgang Hörmann and Gerhard Derflinger "Rejection-inversion to generate variates from
+     * monotone discrete distributions." ACM Transactions on Modeling and Computer Simulation
+     * (TOMACS) 6.3 (1996): 169-184.
+     *
+     * <p>The paper describes an algorithm for exponents larger than 1 (Algorithm ZRI). The original
+     * method uses {@code H(x) := (v + x)^(1 - q) / (1 - q)} as the integral of the hat function.
+     * This function is undefined for q = 1, which is the reason for the limitation of the exponent.
+     * If instead the integral function {@code H(x) := ((v + x)^(1 - q) - 1) / (1 - q)} is used, for
+     * which a meaningful limit exists for q = 1, the method works for all positive exponents.
+     *
+     * <p>The following implementation uses v := 0 and generates integral numbers in the range [1,
+     * numberOfElements]. This is different to the original method where v is defined to be positive
+     * and numbers are taken from [0, i_max]. This explains why the implementation looks slightly
+     * different.
+     *
+     * @since 3.6
+     */
+    static final class ZipfRejectionInversionSampler {
+
+        /** Exponent parameter of the distribution. */
+        private final double exponent;
+
+        /** Number of elements. */
+        private final int numberOfElements;
+
+        /** Constant equal to {@code hIntegral(1.5) - 1}. */
+        private final double hIntegralX1;
+
+        /** Constant equal to {@code hIntegral(numberOfElements + 0.5)}. */
+        private final double hIntegralNumberOfElements;
+
+        /** Constant equal to {@code 2 - hIntegralInverse(hIntegral(2.5) - h(2)}. */
+        private final double s;
+
+        /**
+         * Simple constructor.
+         *
+         * @param numberOfElements number of elements
+         * @param exponent exponent parameter of the distribution
+         */
+        ZipfRejectionInversionSampler(final int numberOfElements, final double exponent) {
+            this.exponent = exponent;
+            this.numberOfElements = numberOfElements;
+            this.hIntegralX1 = hIntegral(1.5) - 1d;
+            this.hIntegralNumberOfElements = hIntegral(numberOfElements + 0.5);
+            this.s = 2d - hIntegralInverse(hIntegral(2.5) - h(2));
+        }
+
+        /**
+         * Generate one integral number in the range [1, numberOfElements].
+         *
+         * @param random random generator to use
+         * @return generated integral number in the range [1, numberOfElements]
+         */
+        int sample(final RandomGenerator random) {
+            while (true) {
+
+                final double u =
+                        hIntegralNumberOfElements
+                                + random.nextDouble() * (hIntegralX1 - hIntegralNumberOfElements);
+                // u is uniformly distributed in (hIntegralX1, hIntegralNumberOfElements]
+
+                double x = hIntegralInverse(u);
+
+                int k = (int) (x + 0.5);
+
+                // Limit k to the range [1, numberOfElements]
+                // (k could be outside due to numerical inaccuracies)
+                if (k < 1) {
+                    k = 1;
+                } else if (k > numberOfElements) {
+                    k = numberOfElements;
+                }
+
+                // Here, the distribution of k is given by:
+                //
+                //   P(k = 1) = C * (hIntegral(1.5) - hIntegralX1) = C
+                //   P(k = m) = C * (hIntegral(m + 1/2) - hIntegral(m - 1/2)) for m >= 2
+                //
+                //   where C := 1 / (hIntegralNumberOfElements - hIntegralX1)
+
+                if (k - x <= s || u >= hIntegral(k + 0.5) - h(k)) {
+
+                    // Case k = 1:
+                    //
+                    //   The right inequality is always true, because replacing k by 1 gives
+                    //   u >= hIntegral(1.5) - h(1) = hIntegralX1 and u is taken from
+                    //   (hIntegralX1, hIntegralNumberOfElements].
+                    //
+                    //   Therefore, the acceptance rate for k = 1 is P(accepted | k = 1) = 1
+                    //   and the probability that 1 is returned as random value is
+                    //   P(k = 1 and accepted) = P(accepted | k = 1) * P(k = 1) = C = C / 1^exponent
+                    //
+                    // Case k >= 2:
+                    //
+                    //   The left inequality (k - x <= s) is just a short cut
+                    //   to avoid the more expensive evaluation of the right inequality
+                    //   (u >= hIntegral(k + 0.5) - h(k)) in many cases.
+                    //
+                    //   If the left inequality is true, the right inequality is also true:
+                    //     Theorem 2 in the paper is valid for all positive exponents, because
+                    //     the requirements h'(x) = -exponent/x^(exponent + 1) < 0 and
+                    //     (-1/hInverse'(x))'' = (1+1/exponent) * x^(1/exponent-1) >= 0
+                    //     are both fulfilled.
+                    //     Therefore, f(x) := x - hIntegralInverse(hIntegral(x + 0.5) - h(x))
+                    //     is a non-decreasing function. If k - x <= s holds,
+                    //     k - x <= s + f(k) - f(2) is obviously also true which is equivalent to
+                    //     -x <= -hIntegralInverse(hIntegral(k + 0.5) - h(k)),
+                    //     -hIntegralInverse(u) <= -hIntegralInverse(hIntegral(k + 0.5) - h(k)),
+                    //     and finally u >= hIntegral(k + 0.5) - h(k).
+                    //
+                    //   Hence, the right inequality determines the acceptance rate:
+                    //   P(accepted | k = m) = h(m) / (hIntegrated(m+1/2) - hIntegrated(m-1/2))
+                    //   The probability that m is returned is given by
+                    //   P(k = m and accepted) = P(accepted | k = m) * P(k = m) = C * h(m) = C /
+                    // m^exponent.
+                    //
+                    // In both cases the probabilities are proportional to the probability mass
+                    // function
+                    // of the Zipf distribution.
+
+                    return k;
+                }
+            }
+        }
+
+        /**
+         * {@code H(x) :=}
+         *
+         * <ul>
+         *   <li>{@code (x^(1-exponent) - 1)/(1 - exponent)}, if {@code exponent != 1}
+         *   <li>{@code log(x)}, if {@code exponent == 1}
+         * </ul>
+         *
+         * H(x) is an integral function of h(x), the derivative of H(x) is h(x).
+         *
+         * @param x free parameter
+         * @return {@code H(x)}
+         */
+        private double hIntegral(final double x) {
+            final double logX = FastMath.log(x);
+            return helper2((1d - exponent) * logX) * logX;
+        }
+
+        /**
+         * {@code h(x) := 1/x^exponent}
+         *
+         * @param x free parameter
+         * @return h(x)
+         */
+        private double h(final double x) {
+            return FastMath.exp(-exponent * FastMath.log(x));
+        }
+
+        /**
+         * The inverse function of H(x).
+         *
+         * @param x free parameter
+         * @return y for which {@code H(y) = x}
+         */
+        private double hIntegralInverse(final double x) {
+            double t = x * (1d - exponent);
+            if (t < -1d) {
+                // Limit value to the range [-1, +inf).
+                // t could be smaller than -1 in some rare cases due to numerical errors.
+                t = -1;
+            }
+            return FastMath.exp(helper1(t) * x);
+        }
+
+        /**
+         * Helper function that calculates {@code log(1+x)/x}.
+         *
+         * <p>A Taylor series expansion is used, if x is close to 0.
+         *
+         * @param x a value larger than or equal to -1
+         * @return {@code log(1+x)/x}
+         */
+        static double helper1(final double x) {
+            if (FastMath.abs(x) > 1e-8) {
+                return FastMath.log1p(x) / x;
+            } else {
+                return 1. - x * ((1. / 2.) - x * ((1. / 3.) - x * (1. / 4.)));
+            }
+        }
+
+        /**
+         * Helper function to calculate {@code (exp(x)-1)/x}.
+         *
+         * <p>A Taylor series expansion is used, if x is close to 0.
+         *
+         * @param x free parameter
+         * @return {@code (exp(x)-1)/x} if x is non-zero, or 1 if x=0
+         */
+        static double helper2(final double x) {
+            if (FastMath.abs(x) > 1e-8) {
+                return FastMath.expm1(x) / x;
+            } else {
+                return 1. + x * (1. / 2.) * (1. + x * (1. / 3.) * (1. + x * (1. / 4.)));
+            }
+        }
+    }
+}