aboutsummaryrefslogtreecommitdiff
path: root/src/stats
diff options
context:
space:
mode:
Diffstat (limited to 'src/stats')
-rwxr-xr-xsrc/stats/bivariate/bootstrap.rs83
-rwxr-xr-xsrc/stats/bivariate/mod.rs131
-rwxr-xr-xsrc/stats/bivariate/regression.rs53
-rwxr-xr-xsrc/stats/bivariate/resamples.rs61
-rwxr-xr-xsrc/stats/float.rs15
-rwxr-xr-xsrc/stats/mod.rs112
-rwxr-xr-xsrc/stats/rand_util.rs21
-rwxr-xr-xsrc/stats/test.rs16
-rwxr-xr-xsrc/stats/tuple.rs253
-rwxr-xr-xsrc/stats/univariate/bootstrap.rs161
-rwxr-xr-xsrc/stats/univariate/kde/kernel.rs82
-rwxr-xr-xsrc/stats/univariate/kde/mod.rs140
-rwxr-xr-xsrc/stats/univariate/mixed.rs57
-rwxr-xr-xsrc/stats/univariate/mod.rs72
-rwxr-xr-xsrc/stats/univariate/outliers/mod.rs7
-rwxr-xr-xsrc/stats/univariate/outliers/tukey.rs291
-rwxr-xr-xsrc/stats/univariate/percentiles.rs80
-rwxr-xr-xsrc/stats/univariate/resamples.rs117
-rwxr-xr-xsrc/stats/univariate/sample.rs255
19 files changed, 2007 insertions, 0 deletions
diff --git a/src/stats/bivariate/bootstrap.rs b/src/stats/bivariate/bootstrap.rs
new file mode 100755
index 0000000..8fe8ede
--- /dev/null
+++ b/src/stats/bivariate/bootstrap.rs
@@ -0,0 +1,83 @@
+#[cfg(test)]
+macro_rules! test {
+ ($ty:ident) => {
+ mod $ty {
+ use quickcheck::TestResult;
+ use quickcheck::quickcheck;
+ use approx::relative_eq;
+
+ use crate::stats::bivariate::regression::Slope;
+ use crate::stats::bivariate::Data;
+
+ quickcheck! {
+ fn means(size: usize, start: usize,
+ offset: usize, nresamples: usize) -> TestResult {
+ if let Some(x) = crate::stats::test::vec::<$ty>(size, start) {
+ let y = crate::stats::test::vec::<$ty>(size + offset, start + offset).unwrap();
+ let data = Data::new(&x[start..], &y[start+offset..]);
+
+ let (x_means, y_means) = if nresamples > 0 {
+ data.bootstrap(nresamples, |d| (d.x().mean(), d.y().mean()))
+ } else {
+ return TestResult::discard();
+ };
+
+ let x_min = data.x().min();
+ let x_max = data.x().max();
+ let y_min = data.y().min();
+ let y_max = data.y().max();
+
+ TestResult::from_bool(
+ // Computed the correct number of resamples
+ x_means.len() == nresamples &&
+ y_means.len() == nresamples &&
+ // No uninitialized values
+ x_means.iter().all(|&x| {
+ (x > x_min || relative_eq!(x, x_min)) &&
+ (x < x_max || relative_eq!(x, x_max))
+ }) &&
+ y_means.iter().all(|&y| {
+ (y > y_min || relative_eq!(y, y_min)) &&
+ (y < y_max || relative_eq!(y, y_max))
+ })
+ )
+ } else {
+ TestResult::discard()
+ }
+ }
+ }
+
+ quickcheck! {
+ fn slope(size: usize, start: usize,
+ offset: usize, nresamples: usize) -> TestResult {
+ if let Some(x) = crate::stats::test::vec::<$ty>(size, start) {
+ let y = crate::stats::test::vec::<$ty>(size + offset, start + offset).unwrap();
+ let data = Data::new(&x[start..], &y[start+offset..]);
+
+ let slopes = if nresamples > 0 {
+ data.bootstrap(nresamples, |d| (Slope::fit(&d),)).0
+ } else {
+ return TestResult::discard();
+ };
+
+ TestResult::from_bool(
+ // Computed the correct number of resamples
+ slopes.len() == nresamples &&
+ // No uninitialized values
+ slopes.iter().all(|s| s.0 > 0.)
+ )
+ } else {
+ TestResult::discard()
+ }
+ }
+ }
+
+ }
+ };
+}
+
+#[cfg(test)]
+mod test {
+ test!(f32);
+ test!(f64);
+}
diff --git a/src/stats/bivariate/mod.rs b/src/stats/bivariate/mod.rs
new file mode 100755
index 0000000..b233b60
--- /dev/null
+++ b/src/stats/bivariate/mod.rs
@@ -0,0 +1,131 @@
+//! Bivariate analysis
+
+mod bootstrap;
+pub mod regression;
+mod resamples;
+
+use crate::stats::bivariate::resamples::Resamples;
+use crate::stats::float::Float;
+use crate::stats::tuple::{Tuple, TupledDistributionsBuilder};
+use crate::stats::univariate::Sample;
+use rayon::iter::{IntoParallelIterator, ParallelIterator};
+
+/// Bivariate `(X, Y)` data
+///
+/// Invariants:
+///
+/// - No `NaN`s in the data
+/// - At least two data points in the set
+pub struct Data<'a, X, Y>(&'a [X], &'a [Y]);
+
+impl<'a, X, Y> Copy for Data<'a, X, Y> {}
+
+#[cfg_attr(feature = "cargo-clippy", allow(clippy::expl_impl_clone_on_copy))]
+impl<'a, X, Y> Clone for Data<'a, X, Y> {
+ fn clone(&self) -> Data<'a, X, Y> {
+ *self
+ }
+}
+
+impl<'a, X, Y> Data<'a, X, Y> {
+ /// Returns the length of the data set
+ pub fn len(&self) -> usize {
+ self.0.len()
+ }
+
+ /// Iterate over the data set
+ pub fn iter(&self) -> Pairs<'a, X, Y> {
+ Pairs {
+ data: *self,
+ state: 0,
+ }
+ }
+}
+
+impl<'a, X, Y> Data<'a, X, Y>
+where
+ X: Float,
+ Y: Float,
+{
+ /// Creates a new data set from two existing slices
+ pub fn new(xs: &'a [X], ys: &'a [Y]) -> Data<'a, X, Y> {
+ assert!(
+ xs.len() == ys.len()
+ && xs.len() > 1
+ && xs.iter().all(|x| !x.is_nan())
+ && ys.iter().all(|y| !y.is_nan())
+ );
+
+ Data(xs, ys)
+ }
+
+ // TODO Remove the `T` parameter in favor of `S::Output`
+ /// Returns the bootstrap distributions of the parameters estimated by the `statistic`
+ ///
+ /// - Multi-threaded
+ /// - Time: `O(nresamples)`
+ /// - Memory: `O(nresamples)`
+ pub fn bootstrap<T, S>(&self, nresamples: usize, statistic: S) -> T::Distributions
+ where
+ S: Fn(Data<X, Y>) -> T + Sync,
+ T: Tuple + Send,
+ T::Distributions: Send,
+ T::Builder: Send,
+ {
+ (0..nresamples)
+ .into_par_iter()
+ .map_init(
+ || Resamples::new(*self),
+ |resamples, _| statistic(resamples.next()),
+ )
+ .fold(
+ || T::Builder::new(0),
+ |mut sub_distributions, sample| {
+ sub_distributions.push(sample);
+ sub_distributions
+ },
+ )
+ .reduce(
+ || T::Builder::new(0),
+ |mut a, mut b| {
+ a.extend(&mut b);
+ a
+ },
+ )
+ .complete()
+ }
+
+ /// Returns a view into the `X` data
+ pub fn x(&self) -> &'a Sample<X> {
+ Sample::new(&self.0)
+ }
+
+ /// Returns a view into the `Y` data
+ pub fn y(&self) -> &'a Sample<Y> {
+ Sample::new(&self.1)
+ }
+}
+
+/// Iterator over `Data`
+pub struct Pairs<'a, X: 'a, Y: 'a> {
+ data: Data<'a, X, Y>,
+ state: usize,
+}
+
+impl<'a, X, Y> Iterator for Pairs<'a, X, Y> {
+ type Item = (&'a X, &'a Y);
+
+ fn next(&mut self) -> Option<(&'a X, &'a Y)> {
+ if self.state < self.data.len() {
+ let i = self.state;
+ self.state += 1;
+
+ // This is safe because i will always be < self.data.{0,1}.len()
+ debug_assert!(i < self.data.0.len());
+ debug_assert!(i < self.data.1.len());
+ unsafe { Some((self.data.0.get_unchecked(i), self.data.1.get_unchecked(i))) }
+ } else {
+ None
+ }
+ }
+}
diff --git a/src/stats/bivariate/regression.rs b/src/stats/bivariate/regression.rs
new file mode 100755
index 0000000..f09443f
--- /dev/null
+++ b/src/stats/bivariate/regression.rs
@@ -0,0 +1,53 @@
+//! Regression analysis
+
+use crate::stats::bivariate::Data;
+use crate::stats::float::Float;
+
+/// A straight line that passes through the origin `y = m * x`
+#[derive(Clone, Copy)]
+pub struct Slope<A>(pub A)
+where
+ A: Float;
+
+impl<A> Slope<A>
+where
+ A: Float,
+{
+ /// Fits the data to a straight line that passes through the origin using ordinary least
+ /// squares
+ ///
+ /// - Time: `O(length)`
+ pub fn fit(data: &Data<'_, A, A>) -> Slope<A> {
+ let xs = data.0;
+ let ys = data.1;
+
+ let xy = crate::stats::dot(xs, ys);
+ let x2 = crate::stats::dot(xs, xs);
+
+ Slope(xy / x2)
+ }
+
+ /// Computes the goodness of fit (coefficient of determination) for this data set
+ ///
+ /// - Time: `O(length)`
+ pub fn r_squared(&self, data: &Data<'_, A, A>) -> A {
+ let _0 = A::cast(0);
+ let _1 = A::cast(1);
+ let m = self.0;
+ let xs = data.0;
+ let ys = data.1;
+
+ let n = A::cast(xs.len());
+ let y_bar = crate::stats::sum(ys) / n;
+
+ let mut ss_res = _0;
+ let mut ss_tot = _0;
+
+ for (&x, &y) in data.iter() {
+ ss_res = ss_res + (y - m * x).powi(2);
+ ss_tot = ss_res + (y - y_bar).powi(2);
+ }
+
+ _1 - ss_res / ss_tot
+ }
+}
diff --git a/src/stats/bivariate/resamples.rs b/src/stats/bivariate/resamples.rs
new file mode 100755
index 0000000..e254dc7
--- /dev/null
+++ b/src/stats/bivariate/resamples.rs
@@ -0,0 +1,61 @@
+use crate::stats::bivariate::Data;
+use crate::stats::float::Float;
+use crate::stats::rand_util::{new_rng, Rng};
+
+pub struct Resamples<'a, X, Y>
+where
+ X: 'a + Float,
+ Y: 'a + Float,
+{
+ rng: Rng,
+ data: (&'a [X], &'a [Y]),
+ stage: Option<(Vec<X>, Vec<Y>)>,
+}
+
+#[cfg_attr(feature = "cargo-clippy", allow(clippy::should_implement_trait))]
+impl<'a, X, Y> Resamples<'a, X, Y>
+where
+ X: 'a + Float,
+ Y: 'a + Float,
+{
+ pub fn new(data: Data<'a, X, Y>) -> Resamples<'a, X, Y> {
+ Resamples {
+ rng: new_rng(),
+ data: (data.x(), data.y()),
+ stage: None,
+ }
+ }
+
+ pub fn next(&mut self) -> Data<'_, X, Y> {
+ let n = self.data.0.len();
+
+ match self.stage {
+ None => {
+ let mut stage = (Vec::with_capacity(n), Vec::with_capacity(n));
+
+ for _ in 0..n {
+ let i = self.rng.rand_range(0u64..(self.data.0.len() as u64)) as usize;
+
+ stage.0.push(self.data.0[i]);
+ stage.1.push(self.data.1[i]);
+ }
+
+ self.stage = Some(stage);
+ }
+ Some(ref mut stage) => {
+ for i in 0..n {
+ let j = self.rng.rand_range(0u64..(self.data.0.len() as u64)) as usize;
+
+ stage.0[i] = self.data.0[j];
+ stage.1[i] = self.data.1[j];
+ }
+ }
+ }
+
+ if let Some((ref x, ref y)) = self.stage {
+ Data(x, y)
+ } else {
+ unreachable!();
+ }
+ }
+}
diff --git a/src/stats/float.rs b/src/stats/float.rs
new file mode 100755
index 0000000..b7748dd
--- /dev/null
+++ b/src/stats/float.rs
@@ -0,0 +1,15 @@
+//! Float trait
+
+use cast::From;
+use num_traits::float;
+
+/// This is an extension of `num_traits::float::Float` that adds safe
+/// casting and Sync + Send. Once `num_traits` has these features this
+/// can be removed.
+pub trait Float:
+ float::Float + From<usize, Output = Self> + From<f32, Output = Self> + Sync + Send
+{
+}
+
+impl Float for f32 {}
+impl Float for f64 {}
diff --git a/src/stats/mod.rs b/src/stats/mod.rs
new file mode 100755
index 0000000..4f926de
--- /dev/null
+++ b/src/stats/mod.rs
@@ -0,0 +1,112 @@
+//! [Criterion]'s statistics library.
+//!
+//! [Criterion]: https://github.com/bheisler/criterion.rs
+//!
+//! **WARNING** This library is criterion's implementation detail and there no plans to stabilize
+//! it. In other words, the API may break at any time without notice.
+
+#[cfg(test)]
+mod test;
+
+pub mod bivariate;
+pub mod tuple;
+pub mod univariate;
+
+mod float;
+mod rand_util;
+
+use std::mem;
+use std::ops::Deref;
+
+use crate::stats::float::Float;
+use crate::stats::univariate::Sample;
+
+/// The bootstrap distribution of some parameter
+#[derive(Clone)]
+pub struct Distribution<A>(Box<[A]>);
+
+impl<A> Distribution<A>
+where
+ A: Float,
+{
+ /// Create a distribution from the given values
+ pub fn from(values: Box<[A]>) -> Distribution<A> {
+ Distribution(values)
+ }
+
+ /// Computes the confidence interval of the population parameter using percentiles
+ ///
+ /// # Panics
+ ///
+ /// Panics if the `confidence_level` is not in the `(0, 1)` range.
+ pub fn confidence_interval(&self, confidence_level: A) -> (A, A)
+ where
+ usize: cast::From<A, Output = Result<usize, cast::Error>>,
+ {
+ let _0 = A::cast(0);
+ let _1 = A::cast(1);
+ let _50 = A::cast(50);
+
+ assert!(confidence_level > _0 && confidence_level < _1);
+
+ let percentiles = self.percentiles();
+
+ // FIXME(privacy) this should use the `at_unchecked()` method
+ (
+ percentiles.at(_50 * (_1 - confidence_level)),
+ percentiles.at(_50 * (_1 + confidence_level)),
+ )
+ }
+
+ /// Computes the "likelihood" of seeing the value `t` or "more extreme" values in the
+ /// distribution.
+ pub fn p_value(&self, t: A, tails: &Tails) -> A {
+ use std::cmp;
+
+ let n = self.0.len();
+ let hits = self.0.iter().filter(|&&x| x < t).count();
+
+ let tails = A::cast(match *tails {
+ Tails::One => 1,
+ Tails::Two => 2,
+ });
+
+ A::cast(cmp::min(hits, n - hits)) / A::cast(n) * tails
+ }
+}
+
+impl<A> Deref for Distribution<A> {
+ type Target = Sample<A>;
+
+ fn deref(&self) -> &Sample<A> {
+ let slice: &[_] = &self.0;
+
+ unsafe { mem::transmute(slice) }
+ }
+}
+
+/// Number of tails for significance testing
+pub enum Tails {
+ /// One tailed test
+ One,
+ /// Two tailed test
+ Two,
+}
+
+fn dot<A>(xs: &[A], ys: &[A]) -> A
+where
+ A: Float,
+{
+ xs.iter()
+ .zip(ys)
+ .fold(A::cast(0), |acc, (&x, &y)| acc + x * y)
+}
+
+fn sum<A>(xs: &[A]) -> A
+where
+ A: Float,
+{
+ use std::ops::Add;
+
+ xs.iter().cloned().fold(A::cast(0), Add::add)
+}
diff --git a/src/stats/rand_util.rs b/src/stats/rand_util.rs
new file mode 100755
index 0000000..ed374cf
--- /dev/null
+++ b/src/stats/rand_util.rs
@@ -0,0 +1,21 @@
+use oorandom::Rand64;
+use std::cell::RefCell;
+use std::time::{SystemTime, UNIX_EPOCH};
+
+pub type Rng = Rand64;
+
+thread_local! {
+ static SEED_RAND: RefCell<Rand64> = RefCell::new(Rand64::new(
+ SystemTime::now().duration_since(UNIX_EPOCH)
+ .expect("Time went backwards")
+ .as_millis()
+ ));
+}
+
+pub fn new_rng() -> Rng {
+ SEED_RAND.with(|r| {
+ let mut r = r.borrow_mut();
+ let seed = ((r.rand_u64() as u128) << 64) | (r.rand_u64() as u128);
+ Rand64::new(seed)
+ })
+}
diff --git a/src/stats/test.rs b/src/stats/test.rs
new file mode 100755
index 0000000..9e13f30
--- /dev/null
+++ b/src/stats/test.rs
@@ -0,0 +1,16 @@
+use rand::distributions::{Distribution, Standard};
+use rand::prelude::*;
+use rand::rngs::StdRng;
+
+pub fn vec<T>(size: usize, start: usize) -> Option<Vec<T>>
+where
+ Standard: Distribution<T>,
+{
+ if size > start + 2 {
+ let mut rng = StdRng::from_entropy();
+
+ Some((0..size).map(|_| rng.gen()).collect())
+ } else {
+ None
+ }
+}
diff --git a/src/stats/tuple.rs b/src/stats/tuple.rs
new file mode 100755
index 0000000..1c07515
--- /dev/null
+++ b/src/stats/tuple.rs
@@ -0,0 +1,253 @@
+//! Helper traits for tupling/untupling
+
+use crate::stats::Distribution;
+
+/// Any tuple: `(A, B, ..)`
+pub trait Tuple: Sized {
+ /// A tuple of distributions associated with this tuple
+ type Distributions: TupledDistributions<Item = Self>;
+
+ /// A tuple of vectors associated with this tuple
+ type Builder: TupledDistributionsBuilder<Item = Self>;
+}
+
+/// A tuple of distributions: `(Distribution<A>, Distribution<B>, ..)`
+pub trait TupledDistributions: Sized {
+ /// A tuple that can be pushed/inserted into the tupled distributions
+ type Item: Tuple<Distributions = Self>;
+}
+
+/// A tuple of vecs used to build distributions.
+pub trait TupledDistributionsBuilder: Sized {
+ /// A tuple that can be pushed/inserted into the tupled distributions
+ type Item: Tuple<Builder = Self>;
+
+ /// Creates a new tuple of vecs
+ fn new(size: usize) -> Self;
+
+ /// Push one element into each of the vecs
+ fn push(&mut self, tuple: Self::Item);
+
+ /// Append one tuple of vecs to this one, leaving the vecs in the other tuple empty
+ fn extend(&mut self, other: &mut Self);
+
+ /// Convert the tuple of vectors into a tuple of distributions
+ fn complete(self) -> <Self::Item as Tuple>::Distributions;
+}
+
+impl<A> Tuple for (A,)
+where
+ A: Copy,
+{
+ type Distributions = (Distribution<A>,);
+ type Builder = (Vec<A>,);
+}
+
+impl<A> TupledDistributions for (Distribution<A>,)
+where
+ A: Copy,
+{
+ type Item = (A,);
+}
+impl<A> TupledDistributionsBuilder for (Vec<A>,)
+where
+ A: Copy,
+{
+ type Item = (A,);
+
+ fn new(size: usize) -> (Vec<A>,) {
+ (Vec::with_capacity(size),)
+ }
+
+ fn push(&mut self, tuple: (A,)) {
+ (self.0).push(tuple.0);
+ }
+
+ fn extend(&mut self, other: &mut (Vec<A>,)) {
+ (self.0).append(&mut other.0);
+ }
+
+ fn complete(self) -> (Distribution<A>,) {
+ (Distribution(self.0.into_boxed_slice()),)
+ }
+}
+
+impl<A, B> Tuple for (A, B)
+where
+ A: Copy,
+ B: Copy,
+{
+ type Distributions = (Distribution<A>, Distribution<B>);
+ type Builder = (Vec<A>, Vec<B>);
+}
+
+impl<A, B> TupledDistributions for (Distribution<A>, Distribution<B>)
+where
+ A: Copy,
+ B: Copy,
+{
+ type Item = (A, B);
+}
+impl<A, B> TupledDistributionsBuilder for (Vec<A>, Vec<B>)
+where
+ A: Copy,
+ B: Copy,
+{
+ type Item = (A, B);
+
+ fn new(size: usize) -> (Vec<A>, Vec<B>) {
+ (Vec::with_capacity(size), Vec::with_capacity(size))
+ }
+
+ fn push(&mut self, tuple: (A, B)) {
+ (self.0).push(tuple.0);
+ (self.1).push(tuple.1);
+ }
+
+ fn extend(&mut self, other: &mut (Vec<A>, Vec<B>)) {
+ (self.0).append(&mut other.0);
+ (self.1).append(&mut other.1);
+ }
+
+ fn complete(self) -> (Distribution<A>, Distribution<B>) {
+ (
+ Distribution(self.0.into_boxed_slice()),
+ Distribution(self.1.into_boxed_slice()),
+ )
+ }
+}
+
+impl<A, B, C> Tuple for (A, B, C)
+where
+ A: Copy,
+ B: Copy,
+ C: Copy,
+{
+ type Distributions = (Distribution<A>, Distribution<B>, Distribution<C>);
+ type Builder = (Vec<A>, Vec<B>, Vec<C>);
+}
+
+impl<A, B, C> TupledDistributions for (Distribution<A>, Distribution<B>, Distribution<C>)
+where
+ A: Copy,
+ B: Copy,
+ C: Copy,
+{
+ type Item = (A, B, C);
+}
+impl<A, B, C> TupledDistributionsBuilder for (Vec<A>, Vec<B>, Vec<C>)
+where
+ A: Copy,
+ B: Copy,
+ C: Copy,
+{
+ type Item = (A, B, C);
+
+ fn new(size: usize) -> (Vec<A>, Vec<B>, Vec<C>) {
+ (
+ Vec::with_capacity(size),
+ Vec::with_capacity(size),
+ Vec::with_capacity(size),
+ )
+ }
+
+ fn push(&mut self, tuple: (A, B, C)) {
+ (self.0).push(tuple.0);
+ (self.1).push(tuple.1);
+ (self.2).push(tuple.2);
+ }
+
+ fn extend(&mut self, other: &mut (Vec<A>, Vec<B>, Vec<C>)) {
+ (self.0).append(&mut other.0);
+ (self.1).append(&mut other.1);
+ (self.2).append(&mut other.2);
+ }
+
+ fn complete(self) -> (Distribution<A>, Distribution<B>, Distribution<C>) {
+ (
+ Distribution(self.0.into_boxed_slice()),
+ Distribution(self.1.into_boxed_slice()),
+ Distribution(self.2.into_boxed_slice()),
+ )
+ }
+}
+
+impl<A, B, C, D> Tuple for (A, B, C, D)
+where
+ A: Copy,
+ B: Copy,
+ C: Copy,
+ D: Copy,
+{
+ type Distributions = (
+ Distribution<A>,
+ Distribution<B>,
+ Distribution<C>,
+ Distribution<D>,
+ );
+ type Builder = (Vec<A>, Vec<B>, Vec<C>, Vec<D>);
+}
+
+impl<A, B, C, D> TupledDistributions
+ for (
+ Distribution<A>,
+ Distribution<B>,
+ Distribution<C>,
+ Distribution<D>,
+ )
+where
+ A: Copy,
+ B: Copy,
+ C: Copy,
+ D: Copy,
+{
+ type Item = (A, B, C, D);
+}
+impl<A, B, C, D> TupledDistributionsBuilder for (Vec<A>, Vec<B>, Vec<C>, Vec<D>)
+where
+ A: Copy,
+ B: Copy,
+ C: Copy,
+ D: Copy,
+{
+ type Item = (A, B, C, D);
+
+ fn new(size: usize) -> (Vec<A>, Vec<B>, Vec<C>, Vec<D>) {
+ (
+ Vec::with_capacity(size),
+ Vec::with_capacity(size),
+ Vec::with_capacity(size),
+ Vec::with_capacity(size),
+ )
+ }
+
+ fn push(&mut self, tuple: (A, B, C, D)) {
+ (self.0).push(tuple.0);
+ (self.1).push(tuple.1);
+ (self.2).push(tuple.2);
+ (self.3).push(tuple.3);
+ }
+
+ fn extend(&mut self, other: &mut (Vec<A>, Vec<B>, Vec<C>, Vec<D>)) {
+ (self.0).append(&mut other.0);
+ (self.1).append(&mut other.1);
+ (self.2).append(&mut other.2);
+ (self.3).append(&mut other.3);
+ }
+
+ fn complete(
+ self,
+ ) -> (
+ Distribution<A>,
+ Distribution<B>,
+ Distribution<C>,
+ Distribution<D>,
+ ) {
+ (
+ Distribution(self.0.into_boxed_slice()),
+ Distribution(self.1.into_boxed_slice()),
+ Distribution(self.2.into_boxed_slice()),
+ Distribution(self.3.into_boxed_slice()),
+ )
+ }
+}
diff --git a/src/stats/univariate/bootstrap.rs b/src/stats/univariate/bootstrap.rs
new file mode 100755
index 0000000..dbb52f5
--- /dev/null
+++ b/src/stats/univariate/bootstrap.rs
@@ -0,0 +1,161 @@
+#[cfg(test)]
+macro_rules! test {
+ ($ty:ident) => {
+ mod $ty {
+ use approx::relative_eq;
+ use quickcheck::quickcheck;
+ use quickcheck::TestResult;
+
+ use crate::stats::univariate::{Sample, mixed, self};
+
+ quickcheck!{
+ fn mean(size: usize, start: usize, nresamples: usize) -> TestResult {
+ if let Some(v) = crate::stats::test::vec::<$ty>(size, start) {
+ let sample = Sample::new(&v[start..]);
+
+ let means = if nresamples > 0 {
+ sample.bootstrap(nresamples, |s| (s.mean(),)).0
+ } else {
+ return TestResult::discard();
+ };
+
+ let min = sample.min();
+ let max = sample.max();
+
+ TestResult::from_bool(
+ // Computed the correct number of resamples
+ means.len() == nresamples &&
+ // No uninitialized values
+ means.iter().all(|&x| {
+ (x > min || relative_eq!(x, min)) &&
+ (x < max || relative_eq!(x, max))
+ })
+ )
+ } else {
+ TestResult::discard()
+ }
+ }
+ }
+
+ quickcheck!{
+ fn mean_median(size: usize, start: usize, nresamples: usize) -> TestResult {
+ if let Some(v) = crate::stats::test::vec::<$ty>(size, start) {
+ let sample = Sample::new(&v[start..]);
+
+ let (means, medians) = if nresamples > 0 {
+ sample.bootstrap(nresamples, |s| (s.mean(), s.median()))
+ } else {
+ return TestResult::discard();
+ };
+
+ let min = sample.min();
+ let max = sample.max();
+
+ TestResult::from_bool(
+ // Computed the correct number of resamples
+ means.len() == nresamples &&
+ medians.len() == nresamples &&
+ // No uninitialized values
+ means.iter().all(|&x| {
+ (x > min || relative_eq!(x, min)) &&
+ (x < max || relative_eq!(x, max))
+ }) &&
+ medians.iter().all(|&x| {
+ (x > min || relative_eq!(x, min)) &&
+ (x < max || relative_eq!(x, max))
+ })
+ )
+ } else {
+ TestResult::discard()
+ }
+ }
+ }
+
+ quickcheck!{
+ fn mixed_two_sample(
+ a_size: usize, a_start: usize,
+ b_size: usize, b_start: usize,
+ nresamples: usize
+ ) -> TestResult {
+ if let (Some(a), Some(b)) =
+ (crate::stats::test::vec::<$ty>(a_size, a_start), crate::stats::test::vec::<$ty>(b_size, b_start))
+ {
+ let a = Sample::new(&a);
+ let b = Sample::new(&b);
+
+ let distribution = if nresamples > 0 {
+ mixed::bootstrap(a, b, nresamples, |a, b| (a.mean() - b.mean(),)).0
+ } else {
+ return TestResult::discard();
+ };
+
+ let min = <$ty>::min(a.min() - b.max(), b.min() - a.max());
+ let max = <$ty>::max(a.max() - b.min(), b.max() - a.min());
+
+ TestResult::from_bool(
+ // Computed the correct number of resamples
+ distribution.len() == nresamples &&
+ // No uninitialized values
+ distribution.iter().all(|&x| {
+ (x > min || relative_eq!(x, min)) &&
+ (x < max || relative_eq!(x, max))
+ })
+ )
+ } else {
+ TestResult::discard()
+ }
+ }
+ }
+
+ quickcheck!{
+ fn two_sample(
+ a_size: usize, a_start: usize,
+ b_size: usize, b_start: usize,
+ nresamples: usize
+ ) -> TestResult {
+ if let (Some(a), Some(b)) =
+ (crate::stats::test::vec::<$ty>(a_size, a_start), crate::stats::test::vec::<$ty>(b_size, b_start))
+ {
+ let a = Sample::new(&a[a_start..]);
+ let b = Sample::new(&b[b_start..]);
+
+ let distribution = if nresamples > 0 {
+ univariate::bootstrap(a, b, nresamples, |a, b| (a.mean() - b.mean(),)).0
+ } else {
+ return TestResult::discard();
+ };
+
+ let min = <$ty>::min(a.min() - b.max(), b.min() - a.max());
+ let max = <$ty>::max(a.max() - b.min(), b.max() - a.min());
+
+ // Computed the correct number of resamples
+ let pass = distribution.len() == nresamples &&
+ // No uninitialized values
+ distribution.iter().all(|&x| {
+ (x > min || relative_eq!(x, min)) &&
+ (x < max || relative_eq!(x, max))
+ });
+
+ if !pass {
+ println!("A: {:?} (len={})", a.as_ref(), a.len());
+ println!("B: {:?} (len={})", b.as_ref(), b.len());
+ println!("Dist: {:?} (len={})", distribution.as_ref(), distribution.len());
+ println!("Min: {}, Max: {}, nresamples: {}",
+ min, max, nresamples);
+ }
+
+ TestResult::from_bool(pass)
+ } else {
+ TestResult::discard()
+ }
+ }
+ }
+ }
+ }
+}
+
+#[cfg(test)]
+mod test {
+ test!(f32);
+ test!(f64);
+}
diff --git a/src/stats/univariate/kde/kernel.rs b/src/stats/univariate/kde/kernel.rs
new file mode 100755
index 0000000..b4204f5
--- /dev/null
+++ b/src/stats/univariate/kde/kernel.rs
@@ -0,0 +1,82 @@
+//! Kernels
+
+use crate::stats::float::Float;
+
+/// Kernel function
+pub trait Kernel<A>: Copy + Sync
+where
+ A: Float,
+{
+ /// Apply the kernel function to the given x-value.
+ fn evaluate(&self, x: A) -> A;
+}
+
+/// Gaussian kernel
+#[derive(Clone, Copy)]
+pub struct Gaussian;
+
+impl<A> Kernel<A> for Gaussian
+where
+ A: Float,
+{
+ fn evaluate(&self, x: A) -> A {
+ use std::f32::consts::PI;
+
+ (x.powi(2).exp() * A::cast(2. * PI)).sqrt().recip()
+ }
+}
+
+#[cfg(test)]
+macro_rules! test {
+ ($ty:ident) => {
+ mod $ty {
+ mod gaussian {
+ use approx::relative_eq;
+ use quickcheck::quickcheck;
+ use quickcheck::TestResult;
+
+ use crate::stats::univariate::kde::kernel::{Gaussian, Kernel};
+
+ quickcheck! {
+ fn symmetric(x: $ty) -> bool {
+ relative_eq!(Gaussian.evaluate(-x), Gaussian.evaluate(x))
+ }
+ }
+
+ // Any [a b] integral should be in the range [0 1]
+ quickcheck! {
+ fn integral(a: $ty, b: $ty) -> TestResult {
+ const DX: $ty = 1e-3;
+
+ if a > b {
+ TestResult::discard()
+ } else {
+ let mut acc = 0.;
+ let mut x = a;
+ let mut y = Gaussian.evaluate(a);
+
+ while x < b {
+ acc += DX * y / 2.;
+
+ x += DX;
+ y = Gaussian.evaluate(x);
+
+ acc += DX * y / 2.;
+ }
+
+ TestResult::from_bool(
+ (acc > 0. || relative_eq!(acc, 0.)) &&
+ (acc < 1. || relative_eq!(acc, 1.)))
+ }
+ }
+ }
+ }
+ }
+ };
+}
+
+#[cfg(test)]
+mod test {
+ test!(f32);
+ test!(f64);
+}
diff --git a/src/stats/univariate/kde/mod.rs b/src/stats/univariate/kde/mod.rs
new file mode 100755
index 0000000..efc27cd
--- /dev/null
+++ b/src/stats/univariate/kde/mod.rs
@@ -0,0 +1,140 @@
+//! Kernel density estimation
+
+pub mod kernel;
+
+use self::kernel::Kernel;
+use crate::stats::float::Float;
+use crate::stats::univariate::Sample;
+use rayon::prelude::*;
+
+/// Univariate kernel density estimator
+pub struct Kde<'a, A, K>
+where
+ A: Float,
+ K: Kernel<A>,
+{
+ bandwidth: A,
+ kernel: K,
+ sample: &'a Sample<A>,
+}
+
+impl<'a, A, K> Kde<'a, A, K>
+where
+ A: 'a + Float,
+ K: Kernel<A>,
+{
+ /// Creates a new kernel density estimator from the `sample`, using a kernel and estimating
+ /// the bandwidth using the method `bw`
+ pub fn new(sample: &'a Sample<A>, kernel: K, bw: Bandwidth) -> Kde<'a, A, K> {
+ Kde {
+ bandwidth: bw.estimate(sample),
+ kernel,
+ sample,
+ }
+ }
+
+ /// Returns the bandwidth used by the estimator
+ pub fn bandwidth(&self) -> A {
+ self.bandwidth
+ }
+
+ /// Maps the KDE over `xs`
+ ///
+ /// - Multihreaded
+ pub fn map(&self, xs: &[A]) -> Box<[A]> {
+ xs.par_iter()
+ .map(|&x| self.estimate(x))
+ .collect::<Vec<_>>()
+ .into_boxed_slice()
+ }
+
+ /// Estimates the probability density of `x`
+ pub fn estimate(&self, x: A) -> A {
+ let _0 = A::cast(0);
+ let slice = self.sample;
+ let h = self.bandwidth;
+ let n = A::cast(slice.len());
+
+ let sum = slice
+ .iter()
+ .fold(_0, |acc, &x_i| acc + self.kernel.evaluate((x - x_i) / h));
+
+ sum / (h * n)
+ }
+}
+
+/// Method to estimate the bandwidth
+pub enum Bandwidth {
+ /// Use Silverman's rule of thumb to estimate the bandwidth from the sample
+ Silverman,
+}
+
+impl Bandwidth {
+ fn estimate<A: Float>(self, sample: &Sample<A>) -> A {
+ match self {
+ Bandwidth::Silverman => {
+ let factor = A::cast(4. / 3.);
+ let exponent = A::cast(1. / 5.);
+ let n = A::cast(sample.len());
+ let sigma = sample.std_dev(None);
+
+ sigma * (factor / n).powf(exponent)
+ }
+ }
+ }
+}
+
+#[cfg(test)]
+macro_rules! test {
+ ($ty:ident) => {
+ mod $ty {
+ use approx::relative_eq;
+ use quickcheck::quickcheck;
+ use quickcheck::TestResult;
+
+ use crate::stats::univariate::kde::kernel::Gaussian;
+ use crate::stats::univariate::kde::{Bandwidth, Kde};
+ use crate::stats::univariate::Sample;
+
+ // The [-inf inf] integral of the estimated PDF should be one
+ quickcheck! {
+ fn integral(size: usize, start: usize) -> TestResult {
+ const DX: $ty = 1e-3;
+
+ if let Some(v) = crate::stats::test::vec::<$ty>(size, start) {
+ let slice = &v[start..];
+ let data = Sample::new(slice);
+ let kde = Kde::new(data, Gaussian, Bandwidth::Silverman);
+ let h = kde.bandwidth();
+ // NB Obviously a [-inf inf] integral is not feasible, but this range works
+ // quite well
+ let (a, b) = (data.min() - 5. * h, data.max() + 5. * h);
+
+ let mut acc = 0.;
+ let mut x = a;
+ let mut y = kde.estimate(a);
+
+ while x < b {
+ acc += DX * y / 2.;
+
+ x += DX;
+ y = kde.estimate(x);
+
+ acc += DX * y / 2.;
+ }
+
+ TestResult::from_bool(relative_eq!(acc, 1., epsilon = 2e-5))
+ } else {
+ TestResult::discard()
+ }
+ }
+ }
+ }
+ };
+}
+
+#[cfg(test)]
+mod test {
+ test!(f32);
+ test!(f64);
+}
diff --git a/src/stats/univariate/mixed.rs b/src/stats/univariate/mixed.rs
new file mode 100755
index 0000000..5c0a59f
--- /dev/null
+++ b/src/stats/univariate/mixed.rs
@@ -0,0 +1,57 @@
+//! Mixed bootstrap
+
+use crate::stats::float::Float;
+use crate::stats::tuple::{Tuple, TupledDistributionsBuilder};
+use crate::stats::univariate::Resamples;
+use crate::stats::univariate::Sample;
+use rayon::prelude::*;
+
+/// Performs a *mixed* two-sample bootstrap
+pub fn bootstrap<A, T, S>(
+ a: &Sample<A>,
+ b: &Sample<A>,
+ nresamples: usize,
+ statistic: S,
+) -> T::Distributions
+where
+ A: Float,
+ S: Fn(&Sample<A>, &Sample<A>) -> T + Sync,
+ T: Tuple + Send,
+ T::Distributions: Send,
+ T::Builder: Send,
+{
+ let n_a = a.len();
+ let n_b = b.len();
+ let mut c = Vec::with_capacity(n_a + n_b);
+ c.extend_from_slice(a);
+ c.extend_from_slice(b);
+ let c = Sample::new(&c);
+
+ (0..nresamples)
+ .into_par_iter()
+ .map_init(
+ || Resamples::new(c),
+ |resamples, _| {
+ let resample = resamples.next();
+ let a: &Sample<A> = Sample::new(&resample[..n_a]);
+ let b: &Sample<A> = Sample::new(&resample[n_a..]);
+
+ statistic(a, b)
+ },
+ )
+ .fold(
+ || T::Builder::new(0),
+ |mut sub_distributions, sample| {
+ sub_distributions.push(sample);
+ sub_distributions
+ },
+ )
+ .reduce(
+ || T::Builder::new(0),
+ |mut a, mut b| {
+ a.extend(&mut b);
+ a
+ },
+ )
+ .complete()
+}
diff --git a/src/stats/univariate/mod.rs b/src/stats/univariate/mod.rs
new file mode 100755
index 0000000..8dfb5f8
--- /dev/null
+++ b/src/stats/univariate/mod.rs
@@ -0,0 +1,72 @@
+//! Univariate analysis
+
+mod bootstrap;
+mod percentiles;
+mod resamples;
+mod sample;
+
+pub mod kde;
+pub mod mixed;
+pub mod outliers;
+
+use crate::stats::float::Float;
+use crate::stats::tuple::{Tuple, TupledDistributionsBuilder};
+use rayon::prelude::*;
+use std::cmp;
+
+use self::resamples::Resamples;
+
+pub use self::percentiles::Percentiles;
+pub use self::sample::Sample;
+
+/// Performs a two-sample bootstrap
+///
+/// - Multithreaded
+/// - Time: `O(nresamples)`
+/// - Memory: `O(nresamples)`
+#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_lossless))]
+pub fn bootstrap<A, B, T, S>(
+ a: &Sample<A>,
+ b: &Sample<B>,
+ nresamples: usize,
+ statistic: S,
+) -> T::Distributions
+where
+ A: Float,
+ B: Float,
+ S: Fn(&Sample<A>, &Sample<B>) -> T + Sync,
+ T: Tuple + Send,
+ T::Distributions: Send,
+ T::Builder: Send,
+{
+ let nresamples_sqrt = (nresamples as f64).sqrt().ceil() as usize;
+ let per_chunk = (nresamples + nresamples_sqrt - 1) / nresamples_sqrt;
+
+ (0..nresamples_sqrt)
+ .into_par_iter()
+ .map_init(
+ || (Resamples::new(a), Resamples::new(b)),
+ |(a_resamples, b_resamples), i| {
+ let start = i * per_chunk;
+ let end = cmp::min((i + 1) * per_chunk, nresamples);
+ let a_resample = a_resamples.next();
+
+ let mut sub_distributions: T::Builder =
+ TupledDistributionsBuilder::new(end - start);
+
+ for _ in start..end {
+ let b_resample = b_resamples.next();
+ sub_distributions.push(statistic(a_resample, b_resample));
+ }
+ sub_distributions
+ },
+ )
+ .reduce(
+ || T::Builder::new(0),
+ |mut a, mut b| {
+ a.extend(&mut b);
+ a
+ },
+ )
+ .complete()
+}
diff --git a/src/stats/univariate/outliers/mod.rs b/src/stats/univariate/outliers/mod.rs
new file mode 100755
index 0000000..b8ed7c7
--- /dev/null
+++ b/src/stats/univariate/outliers/mod.rs
@@ -0,0 +1,7 @@
+//! Classification of outliers
+//!
+//! WARNING: There's no formal/mathematical definition of what an outlier actually is. Therefore,
+//! all outlier classifiers are *subjective*, however some classifiers that have become *de facto*
+//! standard are provided here.
+
+pub mod tukey;
diff --git a/src/stats/univariate/outliers/tukey.rs b/src/stats/univariate/outliers/tukey.rs
new file mode 100755
index 0000000..bfd08f1
--- /dev/null
+++ b/src/stats/univariate/outliers/tukey.rs
@@ -0,0 +1,291 @@
+//! Tukey's method
+//!
+//! The original method uses two "fences" to classify the data. All the observations "inside" the
+//! fences are considered "normal", and the rest are considered outliers.
+//!
+//! The fences are computed from the quartiles of the sample, according to the following formula:
+//!
+//! ``` ignore
+//! // q1, q3 are the first and third quartiles
+//! let iqr = q3 - q1; // The interquartile range
+//! let (f1, f2) = (q1 - 1.5 * iqr, q3 + 1.5 * iqr); // the "fences"
+//!
+//! let is_outlier = |x| if x > f1 && x < f2 { true } else { false };
+//! ```
+//!
+//! The classifier provided here adds two extra outer fences:
+//!
+//! ``` ignore
+//! let (f3, f4) = (q1 - 3 * iqr, q3 + 3 * iqr); // the outer "fences"
+//! ```
+//!
+//! The extra fences add a sense of "severity" to the classification. Data points outside of the
+//! outer fences are considered "severe" outliers, whereas points outside the inner fences are just
+//! "mild" outliers, and, as the original method, everything inside the inner fences is considered
+//! "normal" data.
+//!
+//! Some ASCII art for the visually oriented people:
+//!
+//! ``` ignore
+//! LOW-ish NORMAL-ish HIGH-ish
+//! x | + | o o o o o o o | + | x
+//! f3 f1 f2 f4
+//!
+//! Legend:
+//! o: "normal" data (not an outlier)
+//! +: "mild" outlier
+//! x: "severe" outlier
+//! ```
+
+use std::iter::IntoIterator;
+use std::ops::{Deref, Index};
+use std::slice;
+
+use crate::stats::float::Float;
+use crate::stats::univariate::Sample;
+
+use self::Label::*;
+
+/// A classified/labeled sample.
+///
+/// The labeled data can be accessed using the indexing operator. The order of the data points is
+/// retained.
+///
+/// NOTE: Due to limitations in the indexing traits, only the label is returned. Once the
+/// `IndexGet` trait lands in stdlib, the indexing operation will return a `(data_point, label)`
+/// pair.
+#[derive(Clone, Copy)]
+pub struct LabeledSample<'a, A>
+where
+ A: Float,
+{
+ fences: (A, A, A, A),
+ sample: &'a Sample<A>,
+}
+
+impl<'a, A> LabeledSample<'a, A>
+where
+ A: Float,
+{
+ /// Returns the number of data points per label
+ ///
+ /// - Time: `O(length)`
+ #[cfg_attr(feature = "cargo-clippy", allow(clippy::similar_names))]
+ pub fn count(&self) -> (usize, usize, usize, usize, usize) {
+ let (mut los, mut lom, mut noa, mut him, mut his) = (0, 0, 0, 0, 0);
+
+ for (_, label) in self {
+ match label {
+ LowSevere => {
+ los += 1;
+ }
+ LowMild => {
+ lom += 1;
+ }
+ NotAnOutlier => {
+ noa += 1;
+ }
+ HighMild => {
+ him += 1;
+ }
+ HighSevere => {
+ his += 1;
+ }
+ }
+ }
+
+ (los, lom, noa, him, his)
+ }
+
+ /// Returns the fences used to classify the outliers
+ pub fn fences(&self) -> (A, A, A, A) {
+ self.fences
+ }
+
+ /// Returns an iterator over the labeled data
+ pub fn iter(&self) -> Iter<'a, A> {
+ Iter {
+ fences: self.fences,
+ iter: self.sample.iter(),
+ }
+ }
+}
+
+impl<'a, A> Deref for LabeledSample<'a, A>
+where
+ A: Float,
+{
+ type Target = Sample<A>;
+
+ fn deref(&self) -> &Sample<A> {
+ self.sample
+ }
+}
+
+// FIXME Use the `IndexGet` trait
+impl<'a, A> Index<usize> for LabeledSample<'a, A>
+where
+ A: Float,
+{
+ type Output = Label;
+
+ #[cfg_attr(feature = "cargo-clippy", allow(clippy::similar_names))]
+ fn index(&self, i: usize) -> &Label {
+ static LOW_SEVERE: Label = LowSevere;
+ static LOW_MILD: Label = LowMild;
+ static HIGH_MILD: Label = HighMild;
+ static HIGH_SEVERE: Label = HighSevere;
+ static NOT_AN_OUTLIER: Label = NotAnOutlier;
+
+ let x = self.sample[i];
+ let (lost, lomt, himt, hist) = self.fences;
+
+ if x < lost {
+ &LOW_SEVERE
+ } else if x > hist {
+ &HIGH_SEVERE
+ } else if x < lomt {
+ &LOW_MILD
+ } else if x > himt {
+ &HIGH_MILD
+ } else {
+ &NOT_AN_OUTLIER
+ }
+ }
+}
+
+impl<'a, 'b, A> IntoIterator for &'b LabeledSample<'a, A>
+where
+ A: Float,
+{
+ type Item = (A, Label);
+ type IntoIter = Iter<'a, A>;
+
+ fn into_iter(self) -> Iter<'a, A> {
+ self.iter()
+ }
+}
+
+/// Iterator over the labeled data
+pub struct Iter<'a, A>
+where
+ A: Float,
+{
+ fences: (A, A, A, A),
+ iter: slice::Iter<'a, A>,
+}
+
+impl<'a, A> Iterator for Iter<'a, A>
+where
+ A: Float,
+{
+ type Item = (A, Label);
+
+ #[cfg_attr(feature = "cargo-clippy", allow(clippy::similar_names))]
+ fn next(&mut self) -> Option<(A, Label)> {
+ self.iter.next().map(|&x| {
+ let (lost, lomt, himt, hist) = self.fences;
+
+ let label = if x < lost {
+ LowSevere
+ } else if x > hist {
+ HighSevere
+ } else if x < lomt {
+ LowMild
+ } else if x > himt {
+ HighMild
+ } else {
+ NotAnOutlier
+ };
+
+ (x, label)
+ })
+ }
+
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ self.iter.size_hint()
+ }
+}
+
+/// Labels used to classify outliers
+pub enum Label {
+ /// A "mild" outlier in the "high" spectrum
+ HighMild,
+ /// A "severe" outlier in the "high" spectrum
+ HighSevere,
+ /// A "mild" outlier in the "low" spectrum
+ LowMild,
+ /// A "severe" outlier in the "low" spectrum
+ LowSevere,
+ /// A normal data point
+ NotAnOutlier,
+}
+
+impl Label {
+ /// Checks if the data point has an "unusually" high value
+ pub fn is_high(&self) -> bool {
+ match *self {
+ HighMild | HighSevere => true,
+ _ => false,
+ }
+ }
+
+ /// Checks if the data point is labeled as a "mild" outlier
+ pub fn is_mild(&self) -> bool {
+ match *self {
+ HighMild | LowMild => true,
+ _ => false,
+ }
+ }
+
+ /// Checks if the data point has an "unusually" low value
+ pub fn is_low(&self) -> bool {
+ match *self {
+ LowMild | LowSevere => true,
+ _ => false,
+ }
+ }
+
+ /// Checks if the data point is labeled as an outlier
+ pub fn is_outlier(&self) -> bool {
+ match *self {
+ NotAnOutlier => false,
+ _ => true,
+ }
+ }
+
+ /// Checks if the data point is labeled as a "severe" outlier
+ pub fn is_severe(&self) -> bool {
+ match *self {
+ HighSevere | LowSevere => true,
+ _ => false,
+ }
+ }
+}
+
+/// Classifies the sample, and returns a labeled sample.
+///
+/// - Time: `O(N log N) where N = length`
+pub fn classify<A>(sample: &Sample<A>) -> LabeledSample<'_, A>
+where
+ A: Float,
+ usize: cast::From<A, Output = Result<usize, cast::Error>>,
+{
+ let (q1, _, q3) = sample.percentiles().quartiles();
+ let iqr = q3 - q1;
+
+ // Mild
+ let k_m = A::cast(1.5_f32);
+ // Severe
+ let k_s = A::cast(3);
+
+ LabeledSample {
+ fences: (
+ q1 - k_s * iqr,
+ q1 - k_m * iqr,
+ q3 + k_m * iqr,
+ q3 + k_s * iqr,
+ ),
+ sample,
+ }
+}
diff --git a/src/stats/univariate/percentiles.rs b/src/stats/univariate/percentiles.rs
new file mode 100755
index 0000000..be6bcf3
--- /dev/null
+++ b/src/stats/univariate/percentiles.rs
@@ -0,0 +1,80 @@
+use crate::stats::float::Float;
+use cast::{self, usize};
+
+/// A "view" into the percentiles of a sample
+pub struct Percentiles<A>(Box<[A]>)
+where
+ A: Float;
+
+// TODO(rust-lang/rfcs#735) move this `impl` into a private percentiles module
+impl<A> Percentiles<A>
+where
+ A: Float,
+ usize: cast::From<A, Output = Result<usize, cast::Error>>,
+{
+ /// Returns the percentile at `p`%
+ ///
+ /// Safety:
+ ///
+ /// - Make sure that `p` is in the range `[0, 100]`
+ unsafe fn at_unchecked(&self, p: A) -> A {
+ let _100 = A::cast(100);
+ debug_assert!(p >= A::cast(0) && p <= _100);
+ debug_assert!(self.0.len() > 0);
+ let len = self.0.len() - 1;
+
+ if p == _100 {
+ self.0[len]
+ } else {
+ let rank = (p / _100) * A::cast(len);
+ let integer = rank.floor();
+ let fraction = rank - integer;
+ let n = usize(integer).unwrap();
+ let &floor = self.0.get_unchecked(n);
+ let &ceiling = self.0.get_unchecked(n + 1);
+
+ floor + (ceiling - floor) * fraction
+ }
+ }
+
+ /// Returns the percentile at `p`%
+ ///
+ /// # Panics
+ ///
+ /// Panics if `p` is outside the closed `[0, 100]` range
+ pub fn at(&self, p: A) -> A {
+ let _0 = A::cast(0);
+ let _100 = A::cast(100);
+
+ assert!(p >= _0 && p <= _100);
+ assert!(self.0.len() > 0);
+
+ unsafe { self.at_unchecked(p) }
+ }
+
+ /// Returns the interquartile range
+ pub fn iqr(&self) -> A {
+ unsafe {
+ let q1 = self.at_unchecked(A::cast(25));
+ let q3 = self.at_unchecked(A::cast(75));
+
+ q3 - q1
+ }
+ }
+
+ /// Returns the 50th percentile
+ pub fn median(&self) -> A {
+ unsafe { self.at_unchecked(A::cast(50)) }
+ }
+
+ /// Returns the 25th, 50th and 75th percentiles
+ pub fn quartiles(&self) -> (A, A, A) {
+ unsafe {
+ (
+ self.at_unchecked(A::cast(25)),
+ self.at_unchecked(A::cast(50)),
+ self.at_unchecked(A::cast(75)),
+ )
+ }
+ }
+}
diff --git a/src/stats/univariate/resamples.rs b/src/stats/univariate/resamples.rs
new file mode 100755
index 0000000..831bc7a
--- /dev/null
+++ b/src/stats/univariate/resamples.rs
@@ -0,0 +1,117 @@
+use std::mem;
+
+use crate::stats::float::Float;
+use crate::stats::rand_util::{new_rng, Rng};
+use crate::stats::univariate::Sample;
+
+pub struct Resamples<'a, A>
+where
+ A: Float,
+{
+ rng: Rng,
+ sample: &'a [A],
+ stage: Option<Vec<A>>,
+}
+
+#[cfg_attr(feature = "cargo-clippy", allow(clippy::should_implement_trait))]
+impl<'a, A> Resamples<'a, A>
+where
+ A: 'a + Float,
+{
+ pub fn new(sample: &'a Sample<A>) -> Resamples<'a, A> {
+ let slice = sample;
+
+ Resamples {
+ rng: new_rng(),
+ sample: slice,
+ stage: None,
+ }
+ }
+
+ pub fn next(&mut self) -> &Sample<A> {
+ let n = self.sample.len();
+ let rng = &mut self.rng;
+
+ match self.stage {
+ None => {
+ let mut stage = Vec::with_capacity(n);
+
+ for _ in 0..n {
+ let idx = rng.rand_range(0u64..(self.sample.len() as u64));
+ stage.push(self.sample[idx as usize])
+ }
+
+ self.stage = Some(stage);
+ }
+ Some(ref mut stage) => {
+ for elem in stage.iter_mut() {
+ let idx = rng.rand_range(0u64..(self.sample.len() as u64));
+ *elem = self.sample[idx as usize]
+ }
+ }
+ }
+
+ if let Some(ref v) = self.stage {
+ unsafe { mem::transmute::<&[_], _>(v) }
+ } else {
+ unreachable!();
+ }
+ }
+}
+
+#[cfg(test)]
+mod test {
+ use quickcheck::quickcheck;
+ use quickcheck::TestResult;
+ use std::collections::HashSet;
+
+ use crate::stats::univariate::resamples::Resamples;
+ use crate::stats::univariate::Sample;
+
+ // Check that the resample is a subset of the sample
+ quickcheck! {
+ fn subset(size: usize, nresamples: usize) -> TestResult {
+ if size > 1 {
+ let v: Vec<_> = (0..size).map(|i| i as f32).collect();
+ let sample = Sample::new(&v);
+ let mut resamples = Resamples::new(sample);
+ let sample = v.iter().map(|&x| x as i64).collect::<HashSet<_>>();
+
+ TestResult::from_bool((0..nresamples).all(|_| {
+ let resample = resamples.next()
+
+ .iter()
+ .map(|&x| x as i64)
+ .collect::<HashSet<_>>();
+
+ resample.is_subset(&sample)
+ }))
+ } else {
+ TestResult::discard()
+ }
+ }
+ }
+
+ #[test]
+ fn different_subsets() {
+ let size = 1000;
+ let v: Vec<_> = (0..size).map(|i| i as f32).collect();
+ let sample = Sample::new(&v);
+ let mut resamples = Resamples::new(sample);
+
+ // Hypothetically, we might see one duplicate, but more than one is likely to be a bug.
+ let mut num_duplicated = 0;
+ for _ in 0..1000 {
+ let sample_1 = resamples.next().iter().cloned().collect::<Vec<_>>();
+ let sample_2 = resamples.next().iter().cloned().collect::<Vec<_>>();
+
+ if sample_1 == sample_2 {
+ num_duplicated += 1;
+ }
+ }
+
+ if num_duplicated > 1 {
+ panic!("Found {} duplicate samples", num_duplicated);
+ }
+ }
+}
diff --git a/src/stats/univariate/sample.rs b/src/stats/univariate/sample.rs
new file mode 100755
index 0000000..8f10db7
--- /dev/null
+++ b/src/stats/univariate/sample.rs
@@ -0,0 +1,255 @@
+use std::{mem, ops};
+
+use crate::stats::float::Float;
+use crate::stats::tuple::{Tuple, TupledDistributionsBuilder};
+use crate::stats::univariate::Percentiles;
+use crate::stats::univariate::Resamples;
+use rayon::prelude::*;
+
+/// A collection of data points drawn from a population
+///
+/// Invariants:
+///
+/// - The sample contains at least 2 data points
+/// - The sample contains no `NaN`s
+pub struct Sample<A>([A]);
+
+// TODO(rust-lang/rfcs#735) move this `impl` into a private percentiles module
+impl<A> Sample<A>
+where
+ A: Float,
+{
+ /// Creates a new sample from an existing slice
+ ///
+ /// # Panics
+ ///
+ /// Panics if `slice` contains any `NaN` or if `slice` has less than two elements
+ #[cfg_attr(feature = "cargo-clippy", allow(clippy::new_ret_no_self))]
+ pub fn new(slice: &[A]) -> &Sample<A> {
+ assert!(slice.len() > 1 && slice.iter().all(|x| !x.is_nan()));
+
+ unsafe { mem::transmute(slice) }
+ }
+
+ /// Returns the biggest element in the sample
+ ///
+ /// - Time: `O(length)`
+ pub fn max(&self) -> A {
+ let mut elems = self.iter();
+
+ match elems.next() {
+ Some(&head) => elems.fold(head, |a, &b| a.max(b)),
+ // NB `unreachable!` because `Sample` is guaranteed to have at least one data point
+ None => unreachable!(),
+ }
+ }
+
+ /// Returns the arithmetic average of the sample
+ ///
+ /// - Time: `O(length)`
+ pub fn mean(&self) -> A {
+ let n = self.len();
+
+ self.sum() / A::cast(n)
+ }
+
+ /// Returns the median absolute deviation
+ ///
+ /// The `median` can be optionally passed along to speed up (2X) the computation
+ ///
+ /// - Time: `O(length)`
+ /// - Memory: `O(length)`
+ pub fn median_abs_dev(&self, median: Option<A>) -> A
+ where
+ usize: cast::From<A, Output = Result<usize, cast::Error>>,
+ {
+ let median = median.unwrap_or_else(|| self.percentiles().median());
+
+ // NB Although this operation can be SIMD accelerated, the gain is negligible because the
+ // bottle neck is the sorting operation which is part of the computation of the median
+ let abs_devs = self.iter().map(|&x| (x - median).abs()).collect::<Vec<_>>();
+
+ let abs_devs: &Self = Self::new(&abs_devs);
+
+ abs_devs.percentiles().median() * A::cast(1.4826)
+ }
+
+ /// Returns the median absolute deviation as a percentage of the median
+ ///
+ /// - Time: `O(length)`
+ /// - Memory: `O(length)`
+ pub fn median_abs_dev_pct(&self) -> A
+ where
+ usize: cast::From<A, Output = Result<usize, cast::Error>>,
+ {
+ let _100 = A::cast(100);
+ let median = self.percentiles().median();
+ let mad = self.median_abs_dev(Some(median));
+
+ (mad / median) * _100
+ }
+
+ /// Returns the smallest element in the sample
+ ///
+ /// - Time: `O(length)`
+ pub fn min(&self) -> A {
+ let mut elems = self.iter();
+
+ match elems.next() {
+ Some(&elem) => elems.fold(elem, |a, &b| a.min(b)),
+ // NB `unreachable!` because `Sample` is guaranteed to have at least one data point
+ None => unreachable!(),
+ }
+ }
+
+ /// Returns a "view" into the percentiles of the sample
+ ///
+ /// This "view" makes consecutive computations of percentiles much faster (`O(1)`)
+ ///
+ /// - Time: `O(N log N) where N = length`
+ /// - Memory: `O(length)`
+ pub fn percentiles(&self) -> Percentiles<A>
+ where
+ usize: cast::From<A, Output = Result<usize, cast::Error>>,
+ {
+ use std::cmp::Ordering;
+
+ // NB This function assumes that there are no `NaN`s in the sample
+ fn cmp<T>(a: &T, b: &T) -> Ordering
+ where
+ T: PartialOrd,
+ {
+ match a.partial_cmp(b) {
+ Some(o) => o,
+ // Arbitrary way to handle NaNs that should never happen
+ None => Ordering::Equal,
+ }
+ }
+
+ let mut v = self.to_vec().into_boxed_slice();
+ v.par_sort_unstable_by(cmp);
+
+ // NB :-1: to intra-crate privacy rules
+ unsafe { mem::transmute(v) }
+ }
+
+ /// Returns the standard deviation of the sample
+ ///
+ /// The `mean` can be optionally passed along to speed up (2X) the computation
+ ///
+ /// - Time: `O(length)`
+ pub fn std_dev(&self, mean: Option<A>) -> A {
+ self.var(mean).sqrt()
+ }
+
+ /// Returns the standard deviation as a percentage of the mean
+ ///
+ /// - Time: `O(length)`
+ pub fn std_dev_pct(&self) -> A {
+ let _100 = A::cast(100);
+ let mean = self.mean();
+ let std_dev = self.std_dev(Some(mean));
+
+ (std_dev / mean) * _100
+ }
+
+ /// Returns the sum of all the elements of the sample
+ ///
+ /// - Time: `O(length)`
+ pub fn sum(&self) -> A {
+ crate::stats::sum(self)
+ }
+
+ /// Returns the t score between these two samples
+ ///
+ /// - Time: `O(length)`
+ pub fn t(&self, other: &Sample<A>) -> A {
+ let (x_bar, y_bar) = (self.mean(), other.mean());
+ let (s2_x, s2_y) = (self.var(Some(x_bar)), other.var(Some(y_bar)));
+ let n_x = A::cast(self.len());
+ let n_y = A::cast(other.len());
+ let num = x_bar - y_bar;
+ let den = (s2_x / n_x + s2_y / n_y).sqrt();
+
+ num / den
+ }
+
+ /// Returns the variance of the sample
+ ///
+ /// The `mean` can be optionally passed along to speed up (2X) the computation
+ ///
+ /// - Time: `O(length)`
+ pub fn var(&self, mean: Option<A>) -> A {
+ use std::ops::Add;
+
+ let mean = mean.unwrap_or_else(|| self.mean());
+ let slice = self;
+
+ let sum = slice
+ .iter()
+ .map(|&x| (x - mean).powi(2))
+ .fold(A::cast(0), Add::add);
+
+ sum / A::cast(slice.len() - 1)
+ }
+
+ // TODO Remove the `T` parameter in favor of `S::Output`
+ /// Returns the bootstrap distributions of the parameters estimated by the 1-sample statistic
+ ///
+ /// - Multi-threaded
+ /// - Time: `O(nresamples)`
+ /// - Memory: `O(nresamples)`
+ pub fn bootstrap<T, S>(&self, nresamples: usize, statistic: S) -> T::Distributions
+ where
+ S: Fn(&Sample<A>) -> T + Sync,
+ T: Tuple + Send,
+ T::Distributions: Send,
+ T::Builder: Send,
+ {
+ (0..nresamples)
+ .into_par_iter()
+ .map_init(
+ || Resamples::new(self),
+ |resamples, _| statistic(resamples.next()),
+ )
+ .fold(
+ || T::Builder::new(0),
+ |mut sub_distributions, sample| {
+ sub_distributions.push(sample);
+ sub_distributions
+ },
+ )
+ .reduce(
+ || T::Builder::new(0),
+ |mut a, mut b| {
+ a.extend(&mut b);
+ a
+ },
+ )
+ .complete()
+ }
+
+ #[cfg(test)]
+ pub fn iqr(&self) -> A
+ where
+ usize: cast::From<A, Output = Result<usize, cast::Error>>,
+ {
+ self.percentiles().iqr()
+ }
+
+ #[cfg(test)]
+ pub fn median(&self) -> A
+ where
+ usize: cast::From<A, Output = Result<usize, cast::Error>>,
+ {
+ self.percentiles().median()
+ }
+}
+
+impl<A> ops::Deref for Sample<A> {
+ type Target = [A];
+
+ fn deref(&self) -> &[A] {
+ &self.0
+ }
+}