diff options
Diffstat (limited to 'src/search/prefilter.rs')
-rw-r--r-- | src/search/prefilter.rs | 424 |
1 files changed, 0 insertions, 424 deletions
diff --git a/src/search/prefilter.rs b/src/search/prefilter.rs deleted file mode 100644 index 00e6acf..0000000 --- a/src/search/prefilter.rs +++ /dev/null @@ -1,424 +0,0 @@ -use core::mem; - -use ext_slice::ByteSlice; -use search::byte_frequencies::BYTE_FREQUENCIES; - -/// PrefilterState tracks state associated with the effectiveness of a -/// prefilter. It is used to track how many bytes, on average, are skipped by -/// the prefilter. If this average dips below a certain threshold over time, -/// then the state renders the prefilter inert and stops using it. -/// -/// A prefilter state should be created for each search. (Where creating an -/// iterator via, e.g., `find_iter`, is treated as a single search.) -#[derive(Clone, Debug)] -pub struct PrefilterState { - /// The number of skips that has been executed. - skips: usize, - /// The total number of bytes that have been skipped. - skipped: usize, - /// The maximum length of a match. This is used to help determine how many - /// bytes on average should be skipped in order for a prefilter to be - /// effective. - max_match_len: usize, - /// Once this heuristic has been deemed ineffective, it will be inert - /// throughout the rest of its lifetime. This serves as a cheap way to - /// check inertness. - inert: bool, -} - -impl PrefilterState { - /// The minimum number of skip attempts to try before considering whether - /// a prefilter is effective or not. - const MIN_SKIPS: usize = 50; - - /// The minimum amount of bytes that skipping must average. - /// - /// This value was chosen based on varying it and checking the bstr/find/ - /// microbenchmarks. In particular, this can impact the - /// pathological/repeated-{huge,small} benchmarks quite a bit if it's - /// set too low. - const MIN_SKIP_BYTES: usize = 8; - - /// Create a fresh prefilter state. - pub fn new(max_match_len: usize) -> PrefilterState { - if max_match_len == 0 { - return PrefilterState::inert(); - } - PrefilterState { skips: 0, skipped: 0, max_match_len, inert: false } - } - - /// Create a fresh prefilter state that is always inert. - fn inert() -> PrefilterState { - PrefilterState { skips: 0, skipped: 0, max_match_len: 0, inert: true } - } - - /// Update this state with the number of bytes skipped on the last - /// invocation of the prefilter. - #[inline] - pub fn update(&mut self, skipped: usize) { - self.skips += 1; - self.skipped += skipped; - } - - /// Return true if and only if this state indicates that a prefilter is - /// still effective. - #[inline] - pub fn is_effective(&mut self) -> bool { - if self.inert { - return false; - } - if self.skips < PrefilterState::MIN_SKIPS { - return true; - } - if self.skipped >= PrefilterState::MIN_SKIP_BYTES * self.skips { - return true; - } - - // We're inert. - self.inert = true; - false - } -} - -/// A heuristic frequency based prefilter for searching a single needle. -/// -/// This prefilter attempts to pick out the byte in a needle that is predicted -/// to occur least frequently, and search for that using fast vectorized -/// routines. If a rare enough byte could not be found, then this prefilter's -/// constructors will return `None`. -/// -/// This can be combined with `PrefilterState` to dynamically render this -/// prefilter inert if it proves to ineffective. -#[derive(Clone, Debug)] -pub struct Freqy { - /// Whether this prefilter should be used or not. - inert: bool, - /// The length of the needle we're searching for. - needle_len: usize, - /// The rarest byte in the needle, according to pre-computed frequency - /// analysis. - rare1: u8, - /// The leftmost offset of the rarest byte in the needle. - rare1i: usize, - /// The second rarest byte in the needle, according to pre-computed - /// frequency analysis. (This may be equivalent to the rarest byte.) - /// - /// The second rarest byte is used as a type of guard for quickly detecting - /// a mismatch after memchr locates an instance of the rarest byte. This - /// is a hedge against pathological cases where the pre-computed frequency - /// analysis may be off. (But of course, does not prevent *all* - /// pathological cases.) - rare2: u8, - /// The leftmost offset of the second rarest byte in the needle. - rare2i: usize, -} - -impl Freqy { - /// The maximum frequency rank permitted. If the rarest byte in the needle - /// has a frequency rank above this value, then Freqy is not used. - const MAX_RANK: usize = 200; - - /// Return a fresh prefilter state that can be used with this prefilter. A - /// prefilter state is used to track the effectiveness of a prefilter for - /// speeding up searches. Therefore, the prefilter state should generally - /// be reused on subsequent searches (such as in an iterator). For searches - /// on a different haystack, then a new prefilter state should be used. - pub fn prefilter_state(&self) -> PrefilterState { - if self.inert { - PrefilterState::inert() - } else { - PrefilterState::new(self.needle_len) - } - } - - /// Returns a valid but inert prefilter. This is valid for both the forward - /// and reverse direction. - /// - /// It is never correct to use an inert prefilter. The results of finding - /// the next (or previous) candidate are unspecified. - fn inert() -> Freqy { - Freqy { - inert: true, - needle_len: 0, - rare1: 0, - rare1i: 0, - rare2: 0, - rare2i: 0, - } - } - - /// Return search info for the given needle in the forward direction. - pub fn forward(needle: &[u8]) -> Freqy { - if needle.is_empty() { - return Freqy::inert(); - } - - // Find the rarest two bytes. Try to make them distinct (but it's not - // required). - let (mut rare1, mut rare1i) = (needle[0], 0); - let (mut rare2, mut rare2i) = (needle[0], 0); - if needle.len() >= 2 { - rare2 = needle[1]; - rare2i = 1; - } - if Freqy::rank(rare2) < Freqy::rank(rare1) { - mem::swap(&mut rare1, &mut rare2); - mem::swap(&mut rare1i, &mut rare2i); - } - for (i, b) in needle.bytes().enumerate().skip(2) { - if Freqy::rank(b) < Freqy::rank(rare1) { - rare2 = rare1; - rare2i = rare1i; - rare1 = b; - rare1i = i; - } else if b != rare1 && Freqy::rank(b) < Freqy::rank(rare2) { - rare2 = b; - rare2i = i; - } - } - if Freqy::rank(rare1) > Freqy::MAX_RANK { - return Freqy::inert(); - } - let needle_len = needle.len(); - Freqy { inert: false, needle_len, rare1, rare1i, rare2, rare2i } - } - - /// Return search info for the given needle in the reverse direction. - pub fn reverse(needle: &[u8]) -> Freqy { - if needle.is_empty() { - return Freqy::inert(); - } - - // Find the rarest two bytes. Try to make them distinct (but it's not - // required). In reverse, the offsets correspond to the number of bytes - // from the end of the needle. So `0` is the last byte in the needle. - let (mut rare1i, mut rare2i) = (0, 0); - if needle.len() >= 2 { - rare2i += 1; - } - let mut rare1 = needle[needle.len() - rare1i - 1]; - let mut rare2 = needle[needle.len() - rare2i - 1]; - if Freqy::rank(rare2) < Freqy::rank(rare1) { - mem::swap(&mut rare1, &mut rare2); - mem::swap(&mut rare1i, &mut rare2i); - } - for (i, b) in needle.bytes().rev().enumerate().skip(2) { - if Freqy::rank(b) < Freqy::rank(rare1) { - rare2 = rare1; - rare2i = rare1i; - rare1 = b; - rare1i = i; - } else if b != rare1 && Freqy::rank(b) < Freqy::rank(rare2) { - rare2 = b; - rare2i = i; - } - } - if Freqy::rank(rare1) > Freqy::MAX_RANK { - return Freqy::inert(); - } - let needle_len = needle.len(); - Freqy { inert: false, needle_len, rare1, rare1i, rare2, rare2i } - } - - /// Look for a possible occurrence of needle. The position returned - /// corresponds to the beginning of the occurrence, if one exists. - /// - /// Callers may assume that this never returns false negatives (i.e., it - /// never misses an actual occurrence), but must check that the returned - /// position corresponds to a match. That is, it can return false - /// positives. - /// - /// This should only be used when Freqy is constructed for forward - /// searching. - pub fn find_candidate( - &self, - prestate: &mut PrefilterState, - haystack: &[u8], - ) -> Option<usize> { - debug_assert!(!self.inert); - - let mut i = 0; - while prestate.is_effective() { - // Use a fast vectorized implementation to skip to the next - // occurrence of the rarest byte (heuristically chosen) in the - // needle. - i += match haystack[i..].find_byte(self.rare1) { - None => return None, - Some(found) => { - prestate.update(found); - found - } - }; - - // If we can't align our first match with the haystack, then a - // match is impossible. - if i < self.rare1i { - i += 1; - continue; - } - - // Align our rare2 byte with the haystack. A mismatch means that - // a match is impossible. - let aligned_rare2i = i - self.rare1i + self.rare2i; - if haystack.get(aligned_rare2i) != Some(&self.rare2) { - i += 1; - continue; - } - - // We've done what we can. There might be a match here. - return Some(i - self.rare1i); - } - // The only way we get here is if we believe our skipping heuristic - // has become ineffective. We're allowed to return false positives, - // so return the position at which we advanced to, aligned to the - // haystack. - Some(i.saturating_sub(self.rare1i)) - } - - /// Look for a possible occurrence of needle, in reverse, starting from the - /// end of the given haystack. The position returned corresponds to the - /// position immediately after the end of the occurrence, if one exists. - /// - /// Callers may assume that this never returns false negatives (i.e., it - /// never misses an actual occurrence), but must check that the returned - /// position corresponds to a match. That is, it can return false - /// positives. - /// - /// This should only be used when Freqy is constructed for reverse - /// searching. - pub fn rfind_candidate( - &self, - prestate: &mut PrefilterState, - haystack: &[u8], - ) -> Option<usize> { - debug_assert!(!self.inert); - - let mut i = haystack.len(); - while prestate.is_effective() { - // Use a fast vectorized implementation to skip to the next - // occurrence of the rarest byte (heuristically chosen) in the - // needle. - i = match haystack[..i].rfind_byte(self.rare1) { - None => return None, - Some(found) => { - prestate.update(i - found); - found - } - }; - - // If we can't align our first match with the haystack, then a - // match is impossible. - if i + self.rare1i + 1 > haystack.len() { - continue; - } - - // Align our rare2 byte with the haystack. A mismatch means that - // a match is impossible. - let aligned = match (i + self.rare1i).checked_sub(self.rare2i) { - None => continue, - Some(aligned) => aligned, - }; - if haystack.get(aligned) != Some(&self.rare2) { - continue; - } - - // We've done what we can. There might be a match here. - return Some(i + self.rare1i + 1); - } - // The only way we get here is if we believe our skipping heuristic - // has become ineffective. We're allowed to return false positives, - // so return the position at which we advanced to, aligned to the - // haystack. - Some(i + self.rare1i + 1) - } - - /// Return the heuristical frequency rank of the given byte. A lower rank - /// means the byte is believed to occur less frequently. - fn rank(b: u8) -> usize { - BYTE_FREQUENCIES[b as usize] as usize - } -} - -#[cfg(test)] -mod tests { - use super::*; - use ext_slice::B; - - #[test] - fn freqy_forward() { - // N.B. We sometimes use uppercase here since that mostly ensures freqy - // will be constructable. Lowercase letters may be too common for freqy - // to work. - - let s = Freqy::forward(B("BAR")); - let mut pre = s.prefilter_state(); - assert_eq!(Some(0), s.find_candidate(&mut pre, B("BARFOO"))); - - let s = Freqy::forward(B("BAR")); - let mut pre = s.prefilter_state(); - assert_eq!(Some(3), s.find_candidate(&mut pre, B("FOOBAR"))); - - let s = Freqy::forward(B("zyzy")); - let mut pre = s.prefilter_state(); - assert_eq!(Some(0), s.find_candidate(&mut pre, B("zyzz"))); - - let s = Freqy::forward(B("zyzy")); - let mut pre = s.prefilter_state(); - assert_eq!(Some(2), s.find_candidate(&mut pre, B("zzzy"))); - - let s = Freqy::forward(B("zyzy")); - let mut pre = s.prefilter_state(); - assert_eq!(None, s.find_candidate(&mut pre, B("zazb"))); - - let s = Freqy::forward(B("yzyz")); - let mut pre = s.prefilter_state(); - assert_eq!(Some(0), s.find_candidate(&mut pre, B("yzyy"))); - - let s = Freqy::forward(B("yzyz")); - let mut pre = s.prefilter_state(); - assert_eq!(Some(2), s.find_candidate(&mut pre, B("yyyz"))); - - let s = Freqy::forward(B("yzyz")); - let mut pre = s.prefilter_state(); - assert_eq!(None, s.find_candidate(&mut pre, B("yayb"))); - } - - #[test] - fn freqy_reverse() { - // N.B. We sometimes use uppercase here since that mostly ensures freqy - // will be constructable. Lowercase letters may be too common for freqy - // to work. - - let s = Freqy::reverse(B("BAR")); - let mut pre = s.prefilter_state(); - assert_eq!(Some(3), s.rfind_candidate(&mut pre, B("BARFOO"))); - - let s = Freqy::reverse(B("BAR")); - let mut pre = s.prefilter_state(); - assert_eq!(Some(6), s.rfind_candidate(&mut pre, B("FOOBAR"))); - - let s = Freqy::reverse(B("zyzy")); - let mut pre = s.prefilter_state(); - assert_eq!(Some(2), s.rfind_candidate(&mut pre, B("zyzz"))); - - let s = Freqy::reverse(B("zyzy")); - let mut pre = s.prefilter_state(); - assert_eq!(Some(4), s.rfind_candidate(&mut pre, B("zzzy"))); - - let s = Freqy::reverse(B("zyzy")); - let mut pre = s.prefilter_state(); - assert_eq!(None, s.rfind_candidate(&mut pre, B("zazb"))); - - let s = Freqy::reverse(B("yzyz")); - let mut pre = s.prefilter_state(); - assert_eq!(Some(2), s.rfind_candidate(&mut pre, B("yzyy"))); - - let s = Freqy::reverse(B("yzyz")); - let mut pre = s.prefilter_state(); - assert_eq!(Some(4), s.rfind_candidate(&mut pre, B("yyyz"))); - - let s = Freqy::reverse(B("yzyz")); - let mut pre = s.prefilter_state(); - assert_eq!(None, s.rfind_candidate(&mut pre, B("yayb"))); - } -} |