aboutsummaryrefslogtreecommitdiff
path: root/src/search/prefilter.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/search/prefilter.rs')
-rw-r--r--src/search/prefilter.rs424
1 files changed, 0 insertions, 424 deletions
diff --git a/src/search/prefilter.rs b/src/search/prefilter.rs
deleted file mode 100644
index 00e6acf..0000000
--- a/src/search/prefilter.rs
+++ /dev/null
@@ -1,424 +0,0 @@
-use core::mem;
-
-use ext_slice::ByteSlice;
-use search::byte_frequencies::BYTE_FREQUENCIES;
-
-/// PrefilterState tracks state associated with the effectiveness of a
-/// prefilter. It is used to track how many bytes, on average, are skipped by
-/// the prefilter. If this average dips below a certain threshold over time,
-/// then the state renders the prefilter inert and stops using it.
-///
-/// A prefilter state should be created for each search. (Where creating an
-/// iterator via, e.g., `find_iter`, is treated as a single search.)
-#[derive(Clone, Debug)]
-pub struct PrefilterState {
- /// The number of skips that has been executed.
- skips: usize,
- /// The total number of bytes that have been skipped.
- skipped: usize,
- /// The maximum length of a match. This is used to help determine how many
- /// bytes on average should be skipped in order for a prefilter to be
- /// effective.
- max_match_len: usize,
- /// Once this heuristic has been deemed ineffective, it will be inert
- /// throughout the rest of its lifetime. This serves as a cheap way to
- /// check inertness.
- inert: bool,
-}
-
-impl PrefilterState {
- /// The minimum number of skip attempts to try before considering whether
- /// a prefilter is effective or not.
- const MIN_SKIPS: usize = 50;
-
- /// The minimum amount of bytes that skipping must average.
- ///
- /// This value was chosen based on varying it and checking the bstr/find/
- /// microbenchmarks. In particular, this can impact the
- /// pathological/repeated-{huge,small} benchmarks quite a bit if it's
- /// set too low.
- const MIN_SKIP_BYTES: usize = 8;
-
- /// Create a fresh prefilter state.
- pub fn new(max_match_len: usize) -> PrefilterState {
- if max_match_len == 0 {
- return PrefilterState::inert();
- }
- PrefilterState { skips: 0, skipped: 0, max_match_len, inert: false }
- }
-
- /// Create a fresh prefilter state that is always inert.
- fn inert() -> PrefilterState {
- PrefilterState { skips: 0, skipped: 0, max_match_len: 0, inert: true }
- }
-
- /// Update this state with the number of bytes skipped on the last
- /// invocation of the prefilter.
- #[inline]
- pub fn update(&mut self, skipped: usize) {
- self.skips += 1;
- self.skipped += skipped;
- }
-
- /// Return true if and only if this state indicates that a prefilter is
- /// still effective.
- #[inline]
- pub fn is_effective(&mut self) -> bool {
- if self.inert {
- return false;
- }
- if self.skips < PrefilterState::MIN_SKIPS {
- return true;
- }
- if self.skipped >= PrefilterState::MIN_SKIP_BYTES * self.skips {
- return true;
- }
-
- // We're inert.
- self.inert = true;
- false
- }
-}
-
-/// A heuristic frequency based prefilter for searching a single needle.
-///
-/// This prefilter attempts to pick out the byte in a needle that is predicted
-/// to occur least frequently, and search for that using fast vectorized
-/// routines. If a rare enough byte could not be found, then this prefilter's
-/// constructors will return `None`.
-///
-/// This can be combined with `PrefilterState` to dynamically render this
-/// prefilter inert if it proves to ineffective.
-#[derive(Clone, Debug)]
-pub struct Freqy {
- /// Whether this prefilter should be used or not.
- inert: bool,
- /// The length of the needle we're searching for.
- needle_len: usize,
- /// The rarest byte in the needle, according to pre-computed frequency
- /// analysis.
- rare1: u8,
- /// The leftmost offset of the rarest byte in the needle.
- rare1i: usize,
- /// The second rarest byte in the needle, according to pre-computed
- /// frequency analysis. (This may be equivalent to the rarest byte.)
- ///
- /// The second rarest byte is used as a type of guard for quickly detecting
- /// a mismatch after memchr locates an instance of the rarest byte. This
- /// is a hedge against pathological cases where the pre-computed frequency
- /// analysis may be off. (But of course, does not prevent *all*
- /// pathological cases.)
- rare2: u8,
- /// The leftmost offset of the second rarest byte in the needle.
- rare2i: usize,
-}
-
-impl Freqy {
- /// The maximum frequency rank permitted. If the rarest byte in the needle
- /// has a frequency rank above this value, then Freqy is not used.
- const MAX_RANK: usize = 200;
-
- /// Return a fresh prefilter state that can be used with this prefilter. A
- /// prefilter state is used to track the effectiveness of a prefilter for
- /// speeding up searches. Therefore, the prefilter state should generally
- /// be reused on subsequent searches (such as in an iterator). For searches
- /// on a different haystack, then a new prefilter state should be used.
- pub fn prefilter_state(&self) -> PrefilterState {
- if self.inert {
- PrefilterState::inert()
- } else {
- PrefilterState::new(self.needle_len)
- }
- }
-
- /// Returns a valid but inert prefilter. This is valid for both the forward
- /// and reverse direction.
- ///
- /// It is never correct to use an inert prefilter. The results of finding
- /// the next (or previous) candidate are unspecified.
- fn inert() -> Freqy {
- Freqy {
- inert: true,
- needle_len: 0,
- rare1: 0,
- rare1i: 0,
- rare2: 0,
- rare2i: 0,
- }
- }
-
- /// Return search info for the given needle in the forward direction.
- pub fn forward(needle: &[u8]) -> Freqy {
- if needle.is_empty() {
- return Freqy::inert();
- }
-
- // Find the rarest two bytes. Try to make them distinct (but it's not
- // required).
- let (mut rare1, mut rare1i) = (needle[0], 0);
- let (mut rare2, mut rare2i) = (needle[0], 0);
- if needle.len() >= 2 {
- rare2 = needle[1];
- rare2i = 1;
- }
- if Freqy::rank(rare2) < Freqy::rank(rare1) {
- mem::swap(&mut rare1, &mut rare2);
- mem::swap(&mut rare1i, &mut rare2i);
- }
- for (i, b) in needle.bytes().enumerate().skip(2) {
- if Freqy::rank(b) < Freqy::rank(rare1) {
- rare2 = rare1;
- rare2i = rare1i;
- rare1 = b;
- rare1i = i;
- } else if b != rare1 && Freqy::rank(b) < Freqy::rank(rare2) {
- rare2 = b;
- rare2i = i;
- }
- }
- if Freqy::rank(rare1) > Freqy::MAX_RANK {
- return Freqy::inert();
- }
- let needle_len = needle.len();
- Freqy { inert: false, needle_len, rare1, rare1i, rare2, rare2i }
- }
-
- /// Return search info for the given needle in the reverse direction.
- pub fn reverse(needle: &[u8]) -> Freqy {
- if needle.is_empty() {
- return Freqy::inert();
- }
-
- // Find the rarest two bytes. Try to make them distinct (but it's not
- // required). In reverse, the offsets correspond to the number of bytes
- // from the end of the needle. So `0` is the last byte in the needle.
- let (mut rare1i, mut rare2i) = (0, 0);
- if needle.len() >= 2 {
- rare2i += 1;
- }
- let mut rare1 = needle[needle.len() - rare1i - 1];
- let mut rare2 = needle[needle.len() - rare2i - 1];
- if Freqy::rank(rare2) < Freqy::rank(rare1) {
- mem::swap(&mut rare1, &mut rare2);
- mem::swap(&mut rare1i, &mut rare2i);
- }
- for (i, b) in needle.bytes().rev().enumerate().skip(2) {
- if Freqy::rank(b) < Freqy::rank(rare1) {
- rare2 = rare1;
- rare2i = rare1i;
- rare1 = b;
- rare1i = i;
- } else if b != rare1 && Freqy::rank(b) < Freqy::rank(rare2) {
- rare2 = b;
- rare2i = i;
- }
- }
- if Freqy::rank(rare1) > Freqy::MAX_RANK {
- return Freqy::inert();
- }
- let needle_len = needle.len();
- Freqy { inert: false, needle_len, rare1, rare1i, rare2, rare2i }
- }
-
- /// Look for a possible occurrence of needle. The position returned
- /// corresponds to the beginning of the occurrence, if one exists.
- ///
- /// Callers may assume that this never returns false negatives (i.e., it
- /// never misses an actual occurrence), but must check that the returned
- /// position corresponds to a match. That is, it can return false
- /// positives.
- ///
- /// This should only be used when Freqy is constructed for forward
- /// searching.
- pub fn find_candidate(
- &self,
- prestate: &mut PrefilterState,
- haystack: &[u8],
- ) -> Option<usize> {
- debug_assert!(!self.inert);
-
- let mut i = 0;
- while prestate.is_effective() {
- // Use a fast vectorized implementation to skip to the next
- // occurrence of the rarest byte (heuristically chosen) in the
- // needle.
- i += match haystack[i..].find_byte(self.rare1) {
- None => return None,
- Some(found) => {
- prestate.update(found);
- found
- }
- };
-
- // If we can't align our first match with the haystack, then a
- // match is impossible.
- if i < self.rare1i {
- i += 1;
- continue;
- }
-
- // Align our rare2 byte with the haystack. A mismatch means that
- // a match is impossible.
- let aligned_rare2i = i - self.rare1i + self.rare2i;
- if haystack.get(aligned_rare2i) != Some(&self.rare2) {
- i += 1;
- continue;
- }
-
- // We've done what we can. There might be a match here.
- return Some(i - self.rare1i);
- }
- // The only way we get here is if we believe our skipping heuristic
- // has become ineffective. We're allowed to return false positives,
- // so return the position at which we advanced to, aligned to the
- // haystack.
- Some(i.saturating_sub(self.rare1i))
- }
-
- /// Look for a possible occurrence of needle, in reverse, starting from the
- /// end of the given haystack. The position returned corresponds to the
- /// position immediately after the end of the occurrence, if one exists.
- ///
- /// Callers may assume that this never returns false negatives (i.e., it
- /// never misses an actual occurrence), but must check that the returned
- /// position corresponds to a match. That is, it can return false
- /// positives.
- ///
- /// This should only be used when Freqy is constructed for reverse
- /// searching.
- pub fn rfind_candidate(
- &self,
- prestate: &mut PrefilterState,
- haystack: &[u8],
- ) -> Option<usize> {
- debug_assert!(!self.inert);
-
- let mut i = haystack.len();
- while prestate.is_effective() {
- // Use a fast vectorized implementation to skip to the next
- // occurrence of the rarest byte (heuristically chosen) in the
- // needle.
- i = match haystack[..i].rfind_byte(self.rare1) {
- None => return None,
- Some(found) => {
- prestate.update(i - found);
- found
- }
- };
-
- // If we can't align our first match with the haystack, then a
- // match is impossible.
- if i + self.rare1i + 1 > haystack.len() {
- continue;
- }
-
- // Align our rare2 byte with the haystack. A mismatch means that
- // a match is impossible.
- let aligned = match (i + self.rare1i).checked_sub(self.rare2i) {
- None => continue,
- Some(aligned) => aligned,
- };
- if haystack.get(aligned) != Some(&self.rare2) {
- continue;
- }
-
- // We've done what we can. There might be a match here.
- return Some(i + self.rare1i + 1);
- }
- // The only way we get here is if we believe our skipping heuristic
- // has become ineffective. We're allowed to return false positives,
- // so return the position at which we advanced to, aligned to the
- // haystack.
- Some(i + self.rare1i + 1)
- }
-
- /// Return the heuristical frequency rank of the given byte. A lower rank
- /// means the byte is believed to occur less frequently.
- fn rank(b: u8) -> usize {
- BYTE_FREQUENCIES[b as usize] as usize
- }
-}
-
-#[cfg(test)]
-mod tests {
- use super::*;
- use ext_slice::B;
-
- #[test]
- fn freqy_forward() {
- // N.B. We sometimes use uppercase here since that mostly ensures freqy
- // will be constructable. Lowercase letters may be too common for freqy
- // to work.
-
- let s = Freqy::forward(B("BAR"));
- let mut pre = s.prefilter_state();
- assert_eq!(Some(0), s.find_candidate(&mut pre, B("BARFOO")));
-
- let s = Freqy::forward(B("BAR"));
- let mut pre = s.prefilter_state();
- assert_eq!(Some(3), s.find_candidate(&mut pre, B("FOOBAR")));
-
- let s = Freqy::forward(B("zyzy"));
- let mut pre = s.prefilter_state();
- assert_eq!(Some(0), s.find_candidate(&mut pre, B("zyzz")));
-
- let s = Freqy::forward(B("zyzy"));
- let mut pre = s.prefilter_state();
- assert_eq!(Some(2), s.find_candidate(&mut pre, B("zzzy")));
-
- let s = Freqy::forward(B("zyzy"));
- let mut pre = s.prefilter_state();
- assert_eq!(None, s.find_candidate(&mut pre, B("zazb")));
-
- let s = Freqy::forward(B("yzyz"));
- let mut pre = s.prefilter_state();
- assert_eq!(Some(0), s.find_candidate(&mut pre, B("yzyy")));
-
- let s = Freqy::forward(B("yzyz"));
- let mut pre = s.prefilter_state();
- assert_eq!(Some(2), s.find_candidate(&mut pre, B("yyyz")));
-
- let s = Freqy::forward(B("yzyz"));
- let mut pre = s.prefilter_state();
- assert_eq!(None, s.find_candidate(&mut pre, B("yayb")));
- }
-
- #[test]
- fn freqy_reverse() {
- // N.B. We sometimes use uppercase here since that mostly ensures freqy
- // will be constructable. Lowercase letters may be too common for freqy
- // to work.
-
- let s = Freqy::reverse(B("BAR"));
- let mut pre = s.prefilter_state();
- assert_eq!(Some(3), s.rfind_candidate(&mut pre, B("BARFOO")));
-
- let s = Freqy::reverse(B("BAR"));
- let mut pre = s.prefilter_state();
- assert_eq!(Some(6), s.rfind_candidate(&mut pre, B("FOOBAR")));
-
- let s = Freqy::reverse(B("zyzy"));
- let mut pre = s.prefilter_state();
- assert_eq!(Some(2), s.rfind_candidate(&mut pre, B("zyzz")));
-
- let s = Freqy::reverse(B("zyzy"));
- let mut pre = s.prefilter_state();
- assert_eq!(Some(4), s.rfind_candidate(&mut pre, B("zzzy")));
-
- let s = Freqy::reverse(B("zyzy"));
- let mut pre = s.prefilter_state();
- assert_eq!(None, s.rfind_candidate(&mut pre, B("zazb")));
-
- let s = Freqy::reverse(B("yzyz"));
- let mut pre = s.prefilter_state();
- assert_eq!(Some(2), s.rfind_candidate(&mut pre, B("yzyy")));
-
- let s = Freqy::reverse(B("yzyz"));
- let mut pre = s.prefilter_state();
- assert_eq!(Some(4), s.rfind_candidate(&mut pre, B("yyyz")));
-
- let s = Freqy::reverse(B("yzyz"));
- let mut pre = s.prefilter_state();
- assert_eq!(None, s.rfind_candidate(&mut pre, B("yayb")));
- }
-}