aboutsummaryrefslogtreecommitdiff
path: root/src/stats/univariate/kde/mod.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/stats/univariate/kde/mod.rs')
-rwxr-xr-xsrc/stats/univariate/kde/mod.rs140
1 files changed, 140 insertions, 0 deletions
diff --git a/src/stats/univariate/kde/mod.rs b/src/stats/univariate/kde/mod.rs
new file mode 100755
index 0000000..efc27cd
--- /dev/null
+++ b/src/stats/univariate/kde/mod.rs
@@ -0,0 +1,140 @@
+//! Kernel density estimation
+
+pub mod kernel;
+
+use self::kernel::Kernel;
+use crate::stats::float::Float;
+use crate::stats::univariate::Sample;
+use rayon::prelude::*;
+
+/// Univariate kernel density estimator
+pub struct Kde<'a, A, K>
+where
+ A: Float,
+ K: Kernel<A>,
+{
+ bandwidth: A,
+ kernel: K,
+ sample: &'a Sample<A>,
+}
+
+impl<'a, A, K> Kde<'a, A, K>
+where
+ A: 'a + Float,
+ K: Kernel<A>,
+{
+ /// Creates a new kernel density estimator from the `sample`, using a kernel and estimating
+ /// the bandwidth using the method `bw`
+ pub fn new(sample: &'a Sample<A>, kernel: K, bw: Bandwidth) -> Kde<'a, A, K> {
+ Kde {
+ bandwidth: bw.estimate(sample),
+ kernel,
+ sample,
+ }
+ }
+
+ /// Returns the bandwidth used by the estimator
+ pub fn bandwidth(&self) -> A {
+ self.bandwidth
+ }
+
+ /// Maps the KDE over `xs`
+ ///
+ /// - Multihreaded
+ pub fn map(&self, xs: &[A]) -> Box<[A]> {
+ xs.par_iter()
+ .map(|&x| self.estimate(x))
+ .collect::<Vec<_>>()
+ .into_boxed_slice()
+ }
+
+ /// Estimates the probability density of `x`
+ pub fn estimate(&self, x: A) -> A {
+ let _0 = A::cast(0);
+ let slice = self.sample;
+ let h = self.bandwidth;
+ let n = A::cast(slice.len());
+
+ let sum = slice
+ .iter()
+ .fold(_0, |acc, &x_i| acc + self.kernel.evaluate((x - x_i) / h));
+
+ sum / (h * n)
+ }
+}
+
+/// Method to estimate the bandwidth
+pub enum Bandwidth {
+ /// Use Silverman's rule of thumb to estimate the bandwidth from the sample
+ Silverman,
+}
+
+impl Bandwidth {
+ fn estimate<A: Float>(self, sample: &Sample<A>) -> A {
+ match self {
+ Bandwidth::Silverman => {
+ let factor = A::cast(4. / 3.);
+ let exponent = A::cast(1. / 5.);
+ let n = A::cast(sample.len());
+ let sigma = sample.std_dev(None);
+
+ sigma * (factor / n).powf(exponent)
+ }
+ }
+ }
+}
+
+#[cfg(test)]
+macro_rules! test {
+ ($ty:ident) => {
+ mod $ty {
+ use approx::relative_eq;
+ use quickcheck::quickcheck;
+ use quickcheck::TestResult;
+
+ use crate::stats::univariate::kde::kernel::Gaussian;
+ use crate::stats::univariate::kde::{Bandwidth, Kde};
+ use crate::stats::univariate::Sample;
+
+ // The [-inf inf] integral of the estimated PDF should be one
+ quickcheck! {
+ fn integral(size: usize, start: usize) -> TestResult {
+ const DX: $ty = 1e-3;
+
+ if let Some(v) = crate::stats::test::vec::<$ty>(size, start) {
+ let slice = &v[start..];
+ let data = Sample::new(slice);
+ let kde = Kde::new(data, Gaussian, Bandwidth::Silverman);
+ let h = kde.bandwidth();
+ // NB Obviously a [-inf inf] integral is not feasible, but this range works
+ // quite well
+ let (a, b) = (data.min() - 5. * h, data.max() + 5. * h);
+
+ let mut acc = 0.;
+ let mut x = a;
+ let mut y = kde.estimate(a);
+
+ while x < b {
+ acc += DX * y / 2.;
+
+ x += DX;
+ y = kde.estimate(x);
+
+ acc += DX * y / 2.;
+ }
+
+ TestResult::from_bool(relative_eq!(acc, 1., epsilon = 2e-5))
+ } else {
+ TestResult::discard()
+ }
+ }
+ }
+ }
+ };
+}
+
+#[cfg(test)]
+mod test {
+ test!(f32);
+ test!(f64);
+}