src/nfa/mod.rs


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252

use std::fmt;

use classes::ByteClasses;
pub use nfa::compiler::Builder;

mod compiler;
mod map;
mod range_trie;

/// The representation for an NFA state identifier.
pub type StateID = usize;

/// A final compiled NFA.
///
/// The states of the NFA are indexed by state IDs, which are how transitions
/// are expressed.
#[derive(Clone)]
pub struct NFA {
    /// Whether this NFA can only match at the beginning of input or not.
    ///
    /// When true, a match should only be reported if it begins at the 0th
    /// index of the haystack.
    anchored: bool,
    /// The starting state of this NFA.
    start: StateID,
    /// The state list. This list is guaranteed to be indexable by the starting
    /// state ID, and it is also guaranteed to contain exactly one `Match`
    /// state.
    states: Vec<State>,
    /// A mapping from any byte value to its corresponding equivalence class
    /// identifier. Two bytes in the same equivalence class cannot discriminate
    /// between a match or a non-match. This map can be used to shrink the
    /// total size of a DFA's transition table with a small match-time cost.
    ///
    /// Note that the NFA's transitions are *not* defined in terms of these
    /// equivalence classes. The NFA's transitions are defined on the original
    /// byte values. For the most part, this is because they wouldn't really
    /// help the NFA much since the NFA already uses a sparse representation
    /// to represent transitions. Byte classes are most effective in a dense
    /// representation.
    byte_classes: ByteClasses,
}

impl NFA {
    /// Returns an NFA that always matches at every position.
    pub fn always_match() -> NFA {
        NFA {
            anchored: false,
            start: 0,
            states: vec![State::Match],
            byte_classes: ByteClasses::empty(),
        }
    }

    /// Returns an NFA that never matches at any position.
    pub fn never_match() -> NFA {
        NFA {
            anchored: false,
            start: 0,
            states: vec![State::Fail],
            byte_classes: ByteClasses::empty(),
        }
    }

    /// Returns true if and only if this NFA is anchored.
    pub fn is_anchored(&self) -> bool {
        self.anchored
    }

    /// Return the number of states in this NFA.
    pub fn len(&self) -> usize {
        self.states.len()
    }

    /// Return the ID of the initial state of this NFA.
    pub fn start(&self) -> StateID {
        self.start
    }

    /// Return the NFA state corresponding to the given ID.
    pub fn state(&self, id: StateID) -> &State {
        &self.states[id]
    }

    /// Return the set of equivalence classes for this NFA. The slice returned
    /// always has length 256 and maps each possible byte value to its
    /// corresponding equivalence class ID (which is never more than 255).
    pub fn byte_classes(&self) -> &ByteClasses {
        &self.byte_classes
    }
}

impl fmt::Debug for NFA {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        for (i, state) in self.states.iter().enumerate() {
            let status = if i == self.start { '>' } else { ' ' };
            writeln!(f, "{}{:06}: {:?}", status, i, state)?;
        }
        Ok(())
    }
}

/// A state in a final compiled NFA.
#[derive(Clone, Eq, PartialEq)]
pub enum State {
    /// A state that transitions to `next` if and only if the current input
    /// byte is in the range `[start, end]` (inclusive).
    ///
    /// This is a special case of Sparse in that it encodes only one transition
    /// (and therefore avoids the allocation).
    Range { range: Transition },
    /// A state with possibly many transitions, represented in a sparse
    /// fashion. Transitions are ordered lexicographically by input range.
    /// As such, this may only be used when every transition has equal
    /// priority. (In practice, this is only used for encoding large UTF-8
    /// automata.)
    Sparse { ranges: Box<[Transition]> },
    /// An alternation such that there exists an epsilon transition to all
    /// states in `alternates`, where matches found via earlier transitions
    /// are preferred over later transitions.
    Union { alternates: Box<[StateID]> },
    /// A fail state. When encountered, the automaton is guaranteed to never
    /// reach a match state.
    Fail,
    /// A match state. There is exactly one such occurrence of this state in
    /// an NFA.
    Match,
}

/// A transition to another state, only if the given byte falls in the
/// inclusive range specified.
#[derive(Clone, Copy, Eq, Hash, PartialEq)]
pub struct Transition {
    pub start: u8,
    pub end: u8,
    pub next: StateID,
}

impl State {
    /// Returns true if and only if this state contains one or more epsilon
    /// transitions.
    pub fn is_epsilon(&self) -> bool {
        match *self {
            State::Range { .. }
            | State::Sparse { .. }
            | State::Fail
            | State::Match => false,
            State::Union { .. } => true,
        }
    }

    /// Remap the transitions in this state using the given map. Namely, the
    /// given map should be indexed according to the transitions currently
    /// in this state.
    ///
    /// This is used during the final phase of the NFA compiler, which turns
    /// its intermediate NFA into the final NFA.
    fn remap(&mut self, remap: &[StateID]) {
        match *self {
            State::Range { ref mut range } => range.next = remap[range.next],
            State::Sparse { ref mut ranges } => {
                for r in ranges.iter_mut() {
                    r.next = remap[r.next];
                }
            }
            State::Union { ref mut alternates } => {
                for alt in alternates.iter_mut() {
                    *alt = remap[*alt];
                }
            }
            State::Fail => {}
            State::Match => {}
        }
    }
}

impl fmt::Debug for State {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match *self {
            State::Range { ref range } => range.fmt(f),
            State::Sparse { ref ranges } => {
                let rs = ranges
                    .iter()
                    .map(|t| format!("{:?}", t))
                    .collect::<Vec<String>>()
                    .join(", ");
                write!(f, "sparse({})", rs)
            }
            State::Union { ref alternates } => {
                let alts = alternates
                    .iter()
                    .map(|id| format!("{}", id))
                    .collect::<Vec<String>>()
                    .join(", ");
                write!(f, "alt({})", alts)
            }
            State::Fail => write!(f, "FAIL"),
            State::Match => write!(f, "MATCH"),
        }
    }
}

impl fmt::Debug for Transition {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let Transition { start, end, next } = *self;
        if self.start == self.end {
            write!(f, "{} => {}", escape(start), next)
        } else {
            write!(f, "{}-{} => {}", escape(start), escape(end), next)
        }
    }
}

/// Return the given byte as its escaped string form.
fn escape(b: u8) -> String {
    use std::ascii;

    String::from_utf8(ascii::escape_default(b).collect::<Vec<_>>()).unwrap()
}

#[cfg(test)]
mod tests {
    use super::*;
    use dense;
    use dfa::DFA;

    #[test]
    fn always_match() {
        let nfa = NFA::always_match();
        let dfa = dense::Builder::new().build_from_nfa::<usize>(&nfa).unwrap();

        assert_eq!(Some(0), dfa.find_at(b"", 0));
        assert_eq!(Some(0), dfa.find_at(b"a", 0));
        assert_eq!(Some(1), dfa.find_at(b"a", 1));
        assert_eq!(Some(0), dfa.find_at(b"ab", 0));
        assert_eq!(Some(1), dfa.find_at(b"ab", 1));
        assert_eq!(Some(2), dfa.find_at(b"ab", 2));
    }

    #[test]
    fn never_match() {
        let nfa = NFA::never_match();
        let dfa = dense::Builder::new().build_from_nfa::<usize>(&nfa).unwrap();

        assert_eq!(None, dfa.find_at(b"", 0));
        assert_eq!(None, dfa.find_at(b"a", 0));
        assert_eq!(None, dfa.find_at(b"a", 1));
        assert_eq!(None, dfa.find_at(b"ab", 0));
        assert_eq!(None, dfa.find_at(b"ab", 1));
        assert_eq!(None, dfa.find_at(b"ab", 2));
    }
}