aboutsummaryrefslogtreecommitdiff
path: root/src/nfa.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/nfa.rs')
-rw-r--r--src/nfa.rs321
1 files changed, 79 insertions, 242 deletions
diff --git a/src/nfa.rs b/src/nfa.rs
index e29bb27..05c5cfb 100644
--- a/src/nfa.rs
+++ b/src/nfa.rs
@@ -65,7 +65,7 @@ pub struct NFA<S> {
/// A set of equivalence classes in terms of bytes. We compute this while
/// building the NFA, but don't use it in the NFA's states. Instead, we
/// use this for building the DFA. We store it on the NFA since it's easy
- /// to compute while visiting the the patterns.
+ /// to compute while visiting the patterns.
byte_classes: ByteClasses,
/// A set of states. Each state defines its own transitions, a fail
/// transition and a set of indices corresponding to matches.
@@ -313,17 +313,6 @@ impl<S: StateID> State<S> {
!self.matches.is_empty()
}
- fn get_longest_match_len(&self) -> Option<usize> {
- // Why is this true? Because the first match in any matching state
- // will always correspond to the match added to it during trie
- // construction (since when we copy matches due to failure transitions,
- // we always append them). Therefore, it follows that the first match
- // must always be longest since any subsequent match must be from a
- // failure transition, and a failure transition by construction points
- // to a proper suffix. A proper suffix is, by definition, smaller.
- self.matches.get(0).map(|&(_, len)| len)
- }
-
fn next_state(&self, input: u8) -> S {
self.trans.next_state(input)
}
@@ -649,11 +638,7 @@ impl<'a, S: StateID> Compiler<'a, S> {
self.add_start_state_loop();
self.add_dead_state_loop();
if !self.builder.anchored {
- if self.match_kind().is_leftmost() {
- self.fill_failure_transitions_leftmost();
- } else {
- self.fill_failure_transitions_standard();
- }
+ self.fill_failure_transitions();
}
self.close_start_state_loop();
self.nfa.byte_classes = self.byte_classes.build();
@@ -739,7 +724,8 @@ impl<'a, S: StateID> Compiler<'a, S> {
}
/// This routine creates failure transitions according to the standard
- /// textbook formulation of the Aho-Corasick algorithm.
+ /// textbook formulation of the Aho-Corasick algorithm, with a couple small
+ /// tweaks to support "leftmost" semantics.
///
/// Building failure transitions is the most interesting part of building
/// the Aho-Corasick automaton, because they are what allow searches to
@@ -807,11 +793,15 @@ impl<'a, S: StateID> Compiler<'a, S> {
/// 'abcd', 'b', 'bcd' and 'cd':
///
/// ```ignore
- /// - a - S1 - b - S2 - c - S3 - d - S4*
- /// /
- /// S0 - b - S5* - c - S6 - d - S7*
- /// \
- /// - c - S8 - d - S9*
+ /// - a - S1 - b - S2* - c - S3 - d - S4*
+ /// / / /
+ /// / ------- -------
+ /// / / /
+ /// S0 --- b - S5* - c - S6 - d - S7*
+ /// \ /
+ /// \ --------
+ /// \ /
+ /// - c - S8 - d - S9*
/// ```
///
/// The failure transitions for this trie are defined from S2 to S5,
@@ -839,20 +829,50 @@ impl<'a, S: StateID> Compiler<'a, S> {
/// We don't actually use recursion to implement this, but instead, use a
/// breadth first search of the automaton. Our base case is the start
/// state, whose failure transition is just a transition to itself.
- fn fill_failure_transitions_standard(&mut self) {
+ ///
+ /// When building a leftmost automaton, we proceed as above, but only
+ /// include a subset of failure transitions. Namely, we omit any failure
+ /// transitions that appear after a match state in the trie. This is
+ /// because failure transitions always point back to a proper suffix of
+ /// what has been seen so far. Thus, following a failure transition after
+ /// a match implies looking for a match that starts after the one that has
+ /// already been seen, which is of course therefore not the leftmost match.
+ ///
+ /// N.B. I came up with this algorithm on my own, and after scouring all of
+ /// the other AC implementations I know of (Perl, Snort, many on GitHub).
+ /// I couldn't find any that implement leftmost semantics like this.
+ /// Perl of course needs leftmost-first semantics, but they implement it
+ /// with a seeming hack at *search* time instead of encoding it into the
+ /// automaton. There are also a couple Java libraries that support leftmost
+ /// longest semantics, but they do it by building a queue of matches at
+ /// search time, which is even worse than what Perl is doing. ---AG
+ fn fill_failure_transitions(&mut self) {
+ let kind = self.match_kind();
// Initialize the queue for breadth first search with all transitions
// out of the start state. We handle the start state specially because
// we only want to follow non-self transitions. If we followed self
// transitions, then this would never terminate.
let mut queue = VecDeque::new();
let mut seen = self.queued_set();
- for b in AllBytesIter::new() {
- let next = self.nfa.start().next_state(b);
- if next != self.nfa.start_id {
- if !seen.contains(next) {
- queue.push_back(next);
- seen.insert(next);
- }
+ let mut it = self.nfa.iter_transitions_mut(self.nfa.start_id);
+ while let Some((_, next)) = it.next() {
+ // Skip anything we've seen before and any self-transitions on the
+ // start state.
+ if next == it.nfa().start_id || seen.contains(next) {
+ continue;
+ }
+ queue.push_back(next);
+ seen.insert(next);
+ // Under leftmost semantics, if a state immediately following
+ // the start state is a match state, then we never want to
+ // follow its failure transition since the failure transition
+ // necessarily leads back to the start state, which we never
+ // want to do for leftmost matching after a match has been
+ // found.
+ //
+ // We apply the same logic to non-start states below as well.
+ if kind.is_leftmost() && it.nfa().state(next).is_match() {
+ it.nfa().state_mut(next).fail = dead_id();
}
}
while let Some(id) = queue.pop_front() {
@@ -870,6 +890,31 @@ impl<'a, S: StateID> Compiler<'a, S> {
queue.push_back(next);
seen.insert(next);
+ // As above for start states, under leftmost semantics, once
+ // we see a match all subsequent states should have no failure
+ // transitions because failure transitions always imply looking
+ // for a match that is a suffix of what has been seen so far
+ // (where "seen so far" corresponds to the string formed by
+ // following the transitions from the start state to the
+ // current state). Under leftmost semantics, we specifically do
+ // not want to allow this to happen because we always want to
+ // report the match found at the leftmost position.
+ //
+ // The difference between leftmost-first and leftmost-longest
+ // occurs previously while we build the trie. For
+ // leftmost-first, we simply omit any entries that would
+ // otherwise require passing through a match state.
+ //
+ // Note that for correctness, the failure transition has to be
+ // set to the dead state for ALL states following a match, not
+ // just the match state itself. However, by setting the failure
+ // transition to the dead state on all match states, the dead
+ // state will automatically propagate to all subsequent states
+ // via the failure state computation below.
+ if kind.is_leftmost() && it.nfa().state(next).is_match() {
+ it.nfa().state_mut(next).fail = dead_id();
+ continue;
+ }
let mut fail = it.nfa().state(id).fail;
while it.nfa().state(fail).next_state(b) == fail_id() {
fail = it.nfa().state(fail).fail;
@@ -887,217 +932,9 @@ impl<'a, S: StateID> Compiler<'a, S> {
// in addition to its own. For the non-overlapping case, such
// states only report the first match, which is never empty since
// it isn't a start state.
- it.nfa().copy_empty_matches(id);
- }
- }
-
- /// This routine is just like fill_failure_transitions_standard, except
- /// it adds failure transitions in a way that preserves leftmost match
- /// semantics (for both leftmost-first and leftmost-longest).
- ///
- /// The algorithms are so similar that it would be possible to write it
- /// generically. But doing so without overhead would require a bit of
- /// ceremony, so we just copy it and add in the extra leftmost logic.
- /// Moreover, the standard algorithm above is so simple that it feels like
- /// a crime to disturb it.
- ///
- /// In effect, this proceeds just like the standard approach, but we
- /// specifically add only a subset of all failure transitions. Namely, we
- /// only add failure transitions that either do not occur after a match
- /// or failure transitions that do occur after a match but preserve the
- /// match. The comments in the implementation below should help.
- ///
- /// N.B. The only differences in the automaton between leftmost-first and
- /// leftmost-longest are in trie construction. Otherwise, both have exactly
- /// the same set of failure transitions. leftmost-longest adds everything
- /// to the trie, where as leftmost-first skips any patterns for which there
- /// exists a prefix of it that was added earlier.
- ///
- /// N.B. I came up with this algorithm on my own, and after scouring all of
- /// the other AC implementations I know of (Perl, Snort, many on GitHub).
- /// I couldn't find any that implement leftmost semantics like this.
- /// Perl of course needs leftmost-first semantics, but they implement it
- /// with a seeming hack at *search* time instead of encoding it into the
- /// automaton. There are also a couple Java libraries that support leftmost
- /// longest semantics, but they do it by building a queue of matches at
- /// search time, which is even worse than what Perl is doing. ---AG
- fn fill_failure_transitions_leftmost(&mut self) {
- /// Represents an item in our queue of states to process.
- ///
- /// Fundamentally, this queue serves the same purpose as the queue
- /// for filling failure transitions using the standard formulation.
- /// In the leftmost case, though, we need to track a bit more
- /// information. See comments below.
- #[derive(Clone, Copy, Debug)]
- struct QueuedState<S> {
- /// The id of the state to visit.
- id: S,
- /// The depth at which the first match was observed in the path
- /// to this state. Note that this corresponds to the depth at
- /// which the beginning of the match was detected. If no match
- /// has been seen, then this is None.
- match_at_depth: Option<usize>,
- }
-
- impl<S: StateID> QueuedState<S> {
- /// Create a queued state corresponding to the given NFA's start
- /// state.
- fn start(nfa: &NFA<S>) -> QueuedState<S> {
- let match_at_depth =
- if nfa.start().is_match() { Some(0) } else { None };
- QueuedState { id: nfa.start_id, match_at_depth }
- }
-
- /// Return the next state to queue up. The given id must be a state
- /// corresponding to a single transition from this queued state.
- fn next_queued_state(
- &self,
- nfa: &NFA<S>,
- id: S,
- ) -> QueuedState<S> {
- let match_at_depth = self.next_match_at_depth(nfa, id);
- QueuedState { id, match_at_depth }
- }
-
- /// Return the earliest depth at which a match has occurred for
- /// the given state. The given state must correspond to a single
- /// transition from this queued state.
- fn next_match_at_depth(
- &self,
- nfa: &NFA<S>,
- next: S,
- ) -> Option<usize> {
- // This is a little tricky. If the previous state has already
- // seen a match or if `next` isn't a match state, then nothing
- // needs to change since a later state cannot find an earlier
- // match.
- match self.match_at_depth {
- Some(x) => return Some(x),
- None if nfa.state(next).is_match() => {}
- None => return None,
- }
- let depth = nfa.state(next).depth
- - nfa.state(next).get_longest_match_len().unwrap()
- + 1;
- Some(depth)
- }
- }
-
- // Initialize the queue for breadth first search with all transitions
- // out of the start state. We handle the start state specially because
- // we only want to follow non-self transitions. If we followed self
- // transitions, then this would never terminate.
- let mut queue: VecDeque<QueuedState<S>> = VecDeque::new();
- let mut seen = self.queued_set();
- let start = QueuedState::start(&self.nfa);
- for b in AllBytesIter::new() {
- let next_id = self.nfa.start().next_state(b);
- if next_id != start.id {
- let next = start.next_queued_state(&self.nfa, next_id);
- if !seen.contains(next.id) {
- queue.push_back(next);
- seen.insert(next.id);
- }
- // If a state immediately following the start state is a match
- // state, then we never want to follow its failure transition
- // since the failure transition necessarily leads back to the
- // start state, which we never want to do for leftmost matching
- // after a match has been found.
- //
- // N.B. This is a special case of the more general handling
- // found below.
- if self.nfa.state(next_id).is_match() {
- self.nfa.state_mut(next_id).fail = dead_id();
- }
- }
- }
- while let Some(item) = queue.pop_front() {
- let mut any_trans = false;
- let mut it = self.nfa.iter_transitions_mut(item.id);
- while let Some((b, next_id)) = it.next() {
- any_trans = true;
-
- // Queue up the next state.
- let next = item.next_queued_state(it.nfa(), next_id);
- if seen.contains(next.id) {
- // The only way to visit a duplicate state in a transition
- // list is when ASCII case insensitivity is enabled. In
- // this case, we want to skip it since it's redundant work.
- // But it would also end up duplicating matches, which
- // results in reporting duplicate matches in some cases.
- // See the 'acasei010' regression test.
- continue;
- }
- queue.push_back(next);
- seen.insert(next.id);
-
- // Find the failure state for next. Same as standard.
- let mut fail = it.nfa().state(item.id).fail;
- while it.nfa().state(fail).next_state(b) == fail_id() {
- fail = it.nfa().state(fail).fail;
- }
- fail = it.nfa().state(fail).next_state(b);
-
- // This is the key difference from the standard formulation.
- // Namely, if we've seen a match, then we only want a failure
- // transition if the failure transition preserves the match
- // we've seen. In general, this is not true of all failure
- // transitions since they can point back to any suffix of what
- // we've seen so far. Instead, we only want to point back to
- // suffixes that contain any match we've seen.
- //
- // We achieve this by comparing the depth of the failure
- // transition with the number of states between this state
- // and the beginning of the earliest match detected. If the
- // depth of the failure state is smaller than this difference,
- // then it cannot contain the match. If it's bigger or equal
- // to the difference, then it necessarily includes the match
- // we've seen since all failure transitions correspond to a
- // suffix.
- //
- // If we've determined that we don't want the failure
- // transition, then we set this state's failure transition to
- // the dead state. In other words, when a search hits this
- // state, it will not continue and correctly stop. (N.B. A
- // dead state is different than a fail state. A dead state
- // MUST be preceded by a match and acts as a sentinel to search
- // routines to terminate.)
- //
- // Understanding this is tricky, and it took me several days
- // to think through this and get it right. If you want to grok
- // it, then I'd recommend: 1) switch the implementation to
- // always use the standard algorithm for filling in failure
- // transitions, 2) run the test suite and 3) examine the test
- // failures. Write out the automatons for them and try to work
- // backwards by figuring out which failure transitions should
- // be removed. You should arrive at the same rule used below.
- if let Some(match_depth) = next.match_at_depth {
- let fail_depth = it.nfa().state(fail).depth;
- let next_depth = it.nfa().state(next.id).depth;
- if next_depth - match_depth + 1 > fail_depth {
- it.nfa().state_mut(next.id).fail = dead_id();
- continue;
- }
- assert_ne!(
- start.id,
- it.nfa().state(next.id).fail,
- "states that are match states or follow match \
- states should never have a failure transition \
- back to the start state in leftmost searching",
- );
- }
- it.nfa().state_mut(next.id).fail = fail;
- it.nfa().copy_matches(fail, next.id);
- }
- // If there are no transitions for this state and if it's a match
- // state, then we must set its failure transition to the dead
- // state since we never want it to restart the search.
- if !any_trans && it.nfa().state(item.id).is_match() {
- it.nfa().state_mut(item.id).fail = dead_id();
+ if !kind.is_leftmost() {
+ it.nfa().copy_empty_matches(id);
}
- // We don't need to copy empty matches from the start state here
- // because that's only necessary for overlapping matches and
- // leftmost match kinds don't support overlapping matches.
}
}
@@ -1176,7 +1013,7 @@ impl<'a, S: StateID> Compiler<'a, S> {
fn calculate_size(&mut self) {
let mut size = 0;
for state in &self.nfa.states {
- size += state.heap_bytes();
+ size += size_of::<State<S>>() + state.heap_bytes();
}
self.nfa.heap_bytes = size;
}