diff options
Diffstat (limited to 'src/stats/univariate/kde/mod.rs')
-rwxr-xr-x | src/stats/univariate/kde/mod.rs | 140 |
1 files changed, 140 insertions, 0 deletions
diff --git a/src/stats/univariate/kde/mod.rs b/src/stats/univariate/kde/mod.rs new file mode 100755 index 0000000..efc27cd --- /dev/null +++ b/src/stats/univariate/kde/mod.rs @@ -0,0 +1,140 @@ +//! Kernel density estimation + +pub mod kernel; + +use self::kernel::Kernel; +use crate::stats::float::Float; +use crate::stats::univariate::Sample; +use rayon::prelude::*; + +/// Univariate kernel density estimator +pub struct Kde<'a, A, K> +where + A: Float, + K: Kernel<A>, +{ + bandwidth: A, + kernel: K, + sample: &'a Sample<A>, +} + +impl<'a, A, K> Kde<'a, A, K> +where + A: 'a + Float, + K: Kernel<A>, +{ + /// Creates a new kernel density estimator from the `sample`, using a kernel and estimating + /// the bandwidth using the method `bw` + pub fn new(sample: &'a Sample<A>, kernel: K, bw: Bandwidth) -> Kde<'a, A, K> { + Kde { + bandwidth: bw.estimate(sample), + kernel, + sample, + } + } + + /// Returns the bandwidth used by the estimator + pub fn bandwidth(&self) -> A { + self.bandwidth + } + + /// Maps the KDE over `xs` + /// + /// - Multihreaded + pub fn map(&self, xs: &[A]) -> Box<[A]> { + xs.par_iter() + .map(|&x| self.estimate(x)) + .collect::<Vec<_>>() + .into_boxed_slice() + } + + /// Estimates the probability density of `x` + pub fn estimate(&self, x: A) -> A { + let _0 = A::cast(0); + let slice = self.sample; + let h = self.bandwidth; + let n = A::cast(slice.len()); + + let sum = slice + .iter() + .fold(_0, |acc, &x_i| acc + self.kernel.evaluate((x - x_i) / h)); + + sum / (h * n) + } +} + +/// Method to estimate the bandwidth +pub enum Bandwidth { + /// Use Silverman's rule of thumb to estimate the bandwidth from the sample + Silverman, +} + +impl Bandwidth { + fn estimate<A: Float>(self, sample: &Sample<A>) -> A { + match self { + Bandwidth::Silverman => { + let factor = A::cast(4. / 3.); + let exponent = A::cast(1. / 5.); + let n = A::cast(sample.len()); + let sigma = sample.std_dev(None); + + sigma * (factor / n).powf(exponent) + } + } + } +} + +#[cfg(test)] +macro_rules! test { + ($ty:ident) => { + mod $ty { + use approx::relative_eq; + use quickcheck::quickcheck; + use quickcheck::TestResult; + + use crate::stats::univariate::kde::kernel::Gaussian; + use crate::stats::univariate::kde::{Bandwidth, Kde}; + use crate::stats::univariate::Sample; + + // The [-inf inf] integral of the estimated PDF should be one + quickcheck! { + fn integral(size: usize, start: usize) -> TestResult { + const DX: $ty = 1e-3; + + if let Some(v) = crate::stats::test::vec::<$ty>(size, start) { + let slice = &v[start..]; + let data = Sample::new(slice); + let kde = Kde::new(data, Gaussian, Bandwidth::Silverman); + let h = kde.bandwidth(); + // NB Obviously a [-inf inf] integral is not feasible, but this range works + // quite well + let (a, b) = (data.min() - 5. * h, data.max() + 5. * h); + + let mut acc = 0.; + let mut x = a; + let mut y = kde.estimate(a); + + while x < b { + acc += DX * y / 2.; + + x += DX; + y = kde.estimate(x); + + acc += DX * y / 2.; + } + + TestResult::from_bool(relative_eq!(acc, 1., epsilon = 2e-5)) + } else { + TestResult::discard() + } + } + } + } + }; +} + +#[cfg(test)] +mod test { + test!(f32); + test!(f64); +} |