diff options
Diffstat (limited to 'src/stats/univariate/outliers/tukey.rs')
-rwxr-xr-x | src/stats/univariate/outliers/tukey.rs | 291 |
1 files changed, 291 insertions, 0 deletions
diff --git a/src/stats/univariate/outliers/tukey.rs b/src/stats/univariate/outliers/tukey.rs new file mode 100755 index 0000000..bfd08f1 --- /dev/null +++ b/src/stats/univariate/outliers/tukey.rs @@ -0,0 +1,291 @@ +//! Tukey's method +//! +//! The original method uses two "fences" to classify the data. All the observations "inside" the +//! fences are considered "normal", and the rest are considered outliers. +//! +//! The fences are computed from the quartiles of the sample, according to the following formula: +//! +//! ``` ignore +//! // q1, q3 are the first and third quartiles +//! let iqr = q3 - q1; // The interquartile range +//! let (f1, f2) = (q1 - 1.5 * iqr, q3 + 1.5 * iqr); // the "fences" +//! +//! let is_outlier = |x| if x > f1 && x < f2 { true } else { false }; +//! ``` +//! +//! The classifier provided here adds two extra outer fences: +//! +//! ``` ignore +//! let (f3, f4) = (q1 - 3 * iqr, q3 + 3 * iqr); // the outer "fences" +//! ``` +//! +//! The extra fences add a sense of "severity" to the classification. Data points outside of the +//! outer fences are considered "severe" outliers, whereas points outside the inner fences are just +//! "mild" outliers, and, as the original method, everything inside the inner fences is considered +//! "normal" data. +//! +//! Some ASCII art for the visually oriented people: +//! +//! ``` ignore +//! LOW-ish NORMAL-ish HIGH-ish +//! x | + | o o o o o o o | + | x +//! f3 f1 f2 f4 +//! +//! Legend: +//! o: "normal" data (not an outlier) +//! +: "mild" outlier +//! x: "severe" outlier +//! ``` + +use std::iter::IntoIterator; +use std::ops::{Deref, Index}; +use std::slice; + +use crate::stats::float::Float; +use crate::stats::univariate::Sample; + +use self::Label::*; + +/// A classified/labeled sample. +/// +/// The labeled data can be accessed using the indexing operator. The order of the data points is +/// retained. +/// +/// NOTE: Due to limitations in the indexing traits, only the label is returned. Once the +/// `IndexGet` trait lands in stdlib, the indexing operation will return a `(data_point, label)` +/// pair. +#[derive(Clone, Copy)] +pub struct LabeledSample<'a, A> +where + A: Float, +{ + fences: (A, A, A, A), + sample: &'a Sample<A>, +} + +impl<'a, A> LabeledSample<'a, A> +where + A: Float, +{ + /// Returns the number of data points per label + /// + /// - Time: `O(length)` + #[cfg_attr(feature = "cargo-clippy", allow(clippy::similar_names))] + pub fn count(&self) -> (usize, usize, usize, usize, usize) { + let (mut los, mut lom, mut noa, mut him, mut his) = (0, 0, 0, 0, 0); + + for (_, label) in self { + match label { + LowSevere => { + los += 1; + } + LowMild => { + lom += 1; + } + NotAnOutlier => { + noa += 1; + } + HighMild => { + him += 1; + } + HighSevere => { + his += 1; + } + } + } + + (los, lom, noa, him, his) + } + + /// Returns the fences used to classify the outliers + pub fn fences(&self) -> (A, A, A, A) { + self.fences + } + + /// Returns an iterator over the labeled data + pub fn iter(&self) -> Iter<'a, A> { + Iter { + fences: self.fences, + iter: self.sample.iter(), + } + } +} + +impl<'a, A> Deref for LabeledSample<'a, A> +where + A: Float, +{ + type Target = Sample<A>; + + fn deref(&self) -> &Sample<A> { + self.sample + } +} + +// FIXME Use the `IndexGet` trait +impl<'a, A> Index<usize> for LabeledSample<'a, A> +where + A: Float, +{ + type Output = Label; + + #[cfg_attr(feature = "cargo-clippy", allow(clippy::similar_names))] + fn index(&self, i: usize) -> &Label { + static LOW_SEVERE: Label = LowSevere; + static LOW_MILD: Label = LowMild; + static HIGH_MILD: Label = HighMild; + static HIGH_SEVERE: Label = HighSevere; + static NOT_AN_OUTLIER: Label = NotAnOutlier; + + let x = self.sample[i]; + let (lost, lomt, himt, hist) = self.fences; + + if x < lost { + &LOW_SEVERE + } else if x > hist { + &HIGH_SEVERE + } else if x < lomt { + &LOW_MILD + } else if x > himt { + &HIGH_MILD + } else { + &NOT_AN_OUTLIER + } + } +} + +impl<'a, 'b, A> IntoIterator for &'b LabeledSample<'a, A> +where + A: Float, +{ + type Item = (A, Label); + type IntoIter = Iter<'a, A>; + + fn into_iter(self) -> Iter<'a, A> { + self.iter() + } +} + +/// Iterator over the labeled data +pub struct Iter<'a, A> +where + A: Float, +{ + fences: (A, A, A, A), + iter: slice::Iter<'a, A>, +} + +impl<'a, A> Iterator for Iter<'a, A> +where + A: Float, +{ + type Item = (A, Label); + + #[cfg_attr(feature = "cargo-clippy", allow(clippy::similar_names))] + fn next(&mut self) -> Option<(A, Label)> { + self.iter.next().map(|&x| { + let (lost, lomt, himt, hist) = self.fences; + + let label = if x < lost { + LowSevere + } else if x > hist { + HighSevere + } else if x < lomt { + LowMild + } else if x > himt { + HighMild + } else { + NotAnOutlier + }; + + (x, label) + }) + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.iter.size_hint() + } +} + +/// Labels used to classify outliers +pub enum Label { + /// A "mild" outlier in the "high" spectrum + HighMild, + /// A "severe" outlier in the "high" spectrum + HighSevere, + /// A "mild" outlier in the "low" spectrum + LowMild, + /// A "severe" outlier in the "low" spectrum + LowSevere, + /// A normal data point + NotAnOutlier, +} + +impl Label { + /// Checks if the data point has an "unusually" high value + pub fn is_high(&self) -> bool { + match *self { + HighMild | HighSevere => true, + _ => false, + } + } + + /// Checks if the data point is labeled as a "mild" outlier + pub fn is_mild(&self) -> bool { + match *self { + HighMild | LowMild => true, + _ => false, + } + } + + /// Checks if the data point has an "unusually" low value + pub fn is_low(&self) -> bool { + match *self { + LowMild | LowSevere => true, + _ => false, + } + } + + /// Checks if the data point is labeled as an outlier + pub fn is_outlier(&self) -> bool { + match *self { + NotAnOutlier => false, + _ => true, + } + } + + /// Checks if the data point is labeled as a "severe" outlier + pub fn is_severe(&self) -> bool { + match *self { + HighSevere | LowSevere => true, + _ => false, + } + } +} + +/// Classifies the sample, and returns a labeled sample. +/// +/// - Time: `O(N log N) where N = length` +pub fn classify<A>(sample: &Sample<A>) -> LabeledSample<'_, A> +where + A: Float, + usize: cast::From<A, Output = Result<usize, cast::Error>>, +{ + let (q1, _, q3) = sample.percentiles().quartiles(); + let iqr = q3 - q1; + + // Mild + let k_m = A::cast(1.5_f32); + // Severe + let k_s = A::cast(3); + + LabeledSample { + fences: ( + q1 - k_s * iqr, + q1 - k_m * iqr, + q3 + k_m * iqr, + q3 + k_s * iqr, + ), + sample, + } +} |