diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/compile.rs | 2 | ||||
-rw-r--r-- | src/dfa.rs | 12 | ||||
-rw-r--r-- | src/expand.rs | 50 | ||||
-rw-r--r-- | src/lib.rs | 8 | ||||
-rw-r--r-- | src/pikevm.rs | 2 | ||||
-rw-r--r-- | src/re_bytes.rs | 19 | ||||
-rw-r--r-- | src/re_set.rs | 18 | ||||
-rw-r--r-- | src/re_trait.rs | 1 | ||||
-rw-r--r-- | src/re_unicode.rs | 21 |
9 files changed, 92 insertions, 41 deletions
diff --git a/src/compile.rs b/src/compile.rs index ad54040..cdc583c 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -222,7 +222,7 @@ impl Compiler { /// hole /// ``` /// - /// To compile two expressions, e1 and e2, concatinated together we + /// To compile two expressions, e1 and e2, concatenated together we /// would do: /// /// ```ignore @@ -679,7 +679,7 @@ impl<'a> Fsm<'a> { } } else if next_si & STATE_START > 0 { // A start state isn't in the common case because we may - // what to do quick prefix scanning. If the program doesn't + // want to do quick prefix scanning. If the program doesn't // have a detected prefix, then start states are actually // considered common and this case is never reached. debug_assert!(self.has_prefix()); @@ -725,7 +725,7 @@ impl<'a> Fsm<'a> { } } - // Run the DFA once more on the special EOF senitnel value. + // Run the DFA once more on the special EOF sentinel value. // We don't care about the special bits in the state pointer any more, // so get rid of them. prev_si &= STATE_MAX; @@ -830,7 +830,7 @@ impl<'a> Fsm<'a> { } } - // Run the DFA once more on the special EOF senitnel value. + // Run the DFA once more on the special EOF sentinel value. prev_si = match self.next_state(qcur, qnext, prev_si, Byte::eof()) { None => return Result::Quit, Some(STATE_DEAD) => return result.set_non_match(0), @@ -913,8 +913,8 @@ impl<'a> Fsm<'a> { if self.state(si).flags().has_empty() { // Compute the flags immediately preceding the current byte. // This means we only care about the "end" or "end line" flags. - // (The "start" flags are computed immediately proceding the - // current byte and is handled below.) + // (The "start" flags are computed immediately following the + // current byte and are handled below.) let mut flags = EmptyFlags::default(); if b.is_eof() { flags.end = true; @@ -1048,7 +1048,7 @@ impl<'a> Fsm<'a> { /// /// If matching starts after the beginning of the input, then only start /// line should be set if the preceding byte is `\n`. End line should never - /// be set in this case. (Even if the proceding byte is a `\n`, it will + /// be set in this case. (Even if the following byte is a `\n`, it will /// be handled in a subsequent DFA state.) fn follow_epsilons( &mut self, diff --git a/src/expand.rs b/src/expand.rs index 528f55e..fd2ab03 100644 --- a/src/expand.rs +++ b/src/expand.rs @@ -24,7 +24,7 @@ pub fn expand_str( continue; } debug_assert!(!replacement.is_empty()); - let cap_ref = match find_cap_ref(replacement) { + let cap_ref = match find_cap_ref(replacement.as_bytes()) { Some(cap_ref) => cap_ref, None => { dst.push_str("$"); @@ -125,19 +125,15 @@ impl From<usize> for Ref<'static> { /// starting at the beginning of `replacement`. /// /// If no such valid reference could be found, None is returned. -fn find_cap_ref<T: ?Sized + AsRef<[u8]>>( - replacement: &T, -) -> Option<CaptureRef> { +fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef> { let mut i = 0; let rep: &[u8] = replacement.as_ref(); if rep.len() <= 1 || rep[0] != b'$' { return None; } - let mut brace = false; i += 1; if rep[i] == b'{' { - brace = true; - i += 1; + return find_cap_ref_braced(rep, i + 1); } let mut cap_end = i; while rep.get(cap_end).map_or(false, is_valid_cap_letter) { @@ -151,12 +147,6 @@ fn find_cap_ref<T: ?Sized + AsRef<[u8]>>( // check with either unsafe or by parsing the number straight from &[u8]. let cap = str::from_utf8(&rep[i..cap_end]).expect("valid UTF-8 capture name"); - if brace { - if !rep.get(cap_end).map_or(false, |&b| b == b'}') { - return None; - } - cap_end += 1; - } Some(CaptureRef { cap: match cap.parse::<u32>() { Ok(i) => Ref::Number(i as usize), @@ -166,6 +156,31 @@ fn find_cap_ref<T: ?Sized + AsRef<[u8]>>( }) } +fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef> { + let start = i; + while rep.get(i).map_or(false, |&b| b != b'}') { + i += 1; + } + if !rep.get(i).map_or(false, |&b| b == b'}') { + return None; + } + // When looking at braced names, we don't put any restrictions on the name, + // so it's possible it could be invalid UTF-8. But a capture group name + // can never be invalid UTF-8, so if we have invalid UTF-8, then we can + // safely return None. + let cap = match str::from_utf8(&rep[start..i]) { + Err(_) => return None, + Ok(cap) => cap, + }; + Some(CaptureRef { + cap: match cap.parse::<u32>() { + Ok(i) => Ref::Number(i as usize), + Err(_) => Ref::Named(cap), + }, + end: i + 1, + }) +} + /// Returns true if and only if the given byte is allowed in a capture name. fn is_valid_cap_letter(b: &u8) -> bool { match *b { @@ -182,13 +197,13 @@ mod tests { ($name:ident, $text:expr) => { #[test] fn $name() { - assert_eq!(None, find_cap_ref($text)); + assert_eq!(None, find_cap_ref($text.as_bytes())); } }; ($name:ident, $text:expr, $capref:expr) => { #[test] fn $name() { - assert_eq!(Some($capref), find_cap_ref($text)); + assert_eq!(Some($capref), find_cap_ref($text.as_bytes())); } }; } @@ -204,7 +219,8 @@ mod tests { find!(find_cap_ref3, "$0", c!(0, 2)); find!(find_cap_ref4, "$5", c!(5, 2)); find!(find_cap_ref5, "$10", c!(10, 3)); - // see https://github.com/rust-lang/regex/pull/585 for more on characters following numbers + // See https://github.com/rust-lang/regex/pull/585 + // for more on characters following numbers find!(find_cap_ref6, "$42a", c!("42a", 4)); find!(find_cap_ref7, "${42}a", c!(42, 5)); find!(find_cap_ref8, "${42"); @@ -217,4 +233,6 @@ mod tests { find!(find_cap_ref15, "$1_$2", c!("1_", 3)); find!(find_cap_ref16, "$x-$y", c!("x", 2)); find!(find_cap_ref17, "$x_$y", c!("x_", 3)); + find!(find_cap_ref18, "${#}", c!("#", 4)); + find!(find_cap_ref19, "${Z[}", c!("Z[", 5)); } @@ -365,7 +365,7 @@ $ the end of text (or end-of-line with multi-line mode) <pre class="rust"> (exp) numbered capture group (indexed by opening parenthesis) -(?P<name>exp) named (also numbered) capture group (allowed chars: [_0-9a-zA-Z]) +(?P<name>exp) named (also numbered) capture group (allowed chars: [_0-9a-zA-Z.\[\]]) (?:exp) non-capturing group (?flags) set flags within current group (?flags:exp) set flags for exp (non-capturing) @@ -562,7 +562,7 @@ All features below are enabled by default. [Unicode's "simple loose matches" specification](https://www.unicode.org/reports/tr18/#Simple_Loose_Matches). * **unicode-gencat** - Provide the data for - [Uncode general categories](https://www.unicode.org/reports/tr44/tr44-24.html#General_Category_Values). + [Unicode general categories](https://www.unicode.org/reports/tr44/tr44-24.html#General_Category_Values). This includes, but is not limited to, `Decimal_Number`, `Letter`, `Math_Symbol`, `Number` and `Punctuation`. * **unicode-perl** - @@ -731,8 +731,8 @@ Unicode codepoints. For example, in ASCII compatible mode, `\xFF` matches the literal byte `\xFF`, while in Unicode mode, `\xFF` is a Unicode codepoint that matches its UTF-8 encoding of `\xC3\xBF`. Similarly for octal notation when enabled. -6. `.` matches any *byte* except for `\n` instead of any Unicode scalar value. -When the `s` flag is enabled, `.` matches any byte. +6. In ASCII compatible mode, `.` matches any *byte* except for `\n`. When the +`s` flag is additionally enabled, `.` matches any byte. # Performance diff --git a/src/pikevm.rs b/src/pikevm.rs index c106c76..299087d 100644 --- a/src/pikevm.rs +++ b/src/pikevm.rs @@ -8,7 +8,7 @@ // // It can do more than the DFA can (specifically, record capture locations // and execute Unicode word boundary assertions), but at a slower speed. -// Specifically, the Pike VM exectues a DFA implicitly by repeatedly expanding +// Specifically, the Pike VM executes a DFA implicitly by repeatedly expanding // epsilon transitions. That is, the Pike VM engine can be in multiple states // at once where as the DFA is only ever in one state at a time. // diff --git a/src/re_bytes.rs b/src/re_bytes.rs index 69f0b33..ca01e0e 100644 --- a/src/re_bytes.rs +++ b/src/re_bytes.rs @@ -119,7 +119,8 @@ impl Regex { RegexBuilder::new(re).build() } - /// Returns true if and only if the regex matches the string given. + /// Returns true if and only if there is a match for the regex in the + /// string given. /// /// It is recommended to use this method if all you need to do is test /// a match, since the underlying matching engine may be able to do less @@ -930,17 +931,22 @@ impl<'t> Captures<'t> { /// Expands all instances of `$name` in `replacement` to the corresponding /// capture group `name`, and writes them to the `dst` buffer given. /// - /// `name` may be an integer corresponding to the index of the - /// capture group (counted by order of opening parenthesis where `0` is the + /// `name` may be an integer corresponding to the index of the capture + /// group (counted by order of opening parenthesis where `0` is the /// entire match) or it can be a name (consisting of letters, digits or /// underscores) corresponding to a named capture group. /// /// If `name` isn't a valid capture group (whether the name doesn't exist /// or isn't a valid index), then it is replaced with the empty string. /// - /// The longest possible name is used. e.g., `$1a` looks up the capture - /// group named `1a` and not the capture group at index `1`. To exert more - /// precise control over the name, use braces, e.g., `${1}a`. + /// The longest possible name consisting of the characters `[_0-9A-Za-z]` + /// is used. e.g., `$1a` looks up the capture group named `1a` and not the + /// capture group at index `1`. To exert more precise control over the + /// name, or to refer to a capture group name that uses characters outside + /// of `[_0-9A-Za-z]`, use braces, e.g., `${1}a` or `${foo[bar].baz}`. When + /// using braces, any sequence of valid UTF-8 bytes is permitted. If the + /// sequence does not refer to a capture group name in the corresponding + /// regex, then it is replaced with an empty string. /// /// To write a literal `$` use `$$`. pub fn expand(&self, replacement: &[u8], dst: &mut Vec<u8>) { @@ -1051,6 +1057,7 @@ impl<'t, 'i> Index<&'i str> for Captures<'t> { /// /// The lifetime `'c` corresponds to the lifetime of the `Captures` value, and /// the lifetime `'t` corresponds to the originally matched text. +#[derive(Clone)] pub struct SubCaptureMatches<'c, 't: 'c> { caps: &'c Captures<'t>, it: SubCapturesPosIter<'c>, diff --git a/src/re_set.rs b/src/re_set.rs index fc2b61a..b8954be 100644 --- a/src/re_set.rs +++ b/src/re_set.rs @@ -96,6 +96,19 @@ impl RegexSet { RegexSetBuilder::new(exprs).build() } + /// Create a new empty regex set. + /// + /// # Example + /// + /// ```rust + /// # use regex::RegexSet; + /// let set = RegexSet::empty(); + /// assert!(set.is_empty()); + /// ``` + pub fn empty() -> RegexSet { + RegexSetBuilder::new(&[""; 0]).build().unwrap() + } + /// Returns true if and only if one of the regexes in this set matches /// the text given. /// @@ -207,6 +220,11 @@ impl RegexSet { self.0.regex_strings().len() } + /// Returns `true` if this set contains no regular expressions. + pub fn is_empty(&self) -> bool { + self.0.regex_strings().is_empty() + } + /// Returns the patterns that this set will match on. /// /// This function can be used to determine the pattern for a match. The diff --git a/src/re_trait.rs b/src/re_trait.rs index b56804e..d14a9f7 100644 --- a/src/re_trait.rs +++ b/src/re_trait.rs @@ -51,6 +51,7 @@ impl Locations { /// Positions are byte indices in terms of the original string matched. /// /// `'c` is the lifetime of the captures. +#[derive(Clone)] pub struct SubCapturesPosIter<'c> { idx: usize, locs: &'c Locations, diff --git a/src/re_unicode.rs b/src/re_unicode.rs index b746599..ea95c1b 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -175,7 +175,8 @@ impl Regex { RegexBuilder::new(re).build() } - /// Returns true if and only if the regex matches the string given. + /// Returns true if and only if there is a match for the regex in the + /// string given. /// /// It is recommended to use this method if all you need to do is test /// a match, since the underlying matching engine may be able to do less @@ -947,17 +948,22 @@ impl<'t> Captures<'t> { /// Expands all instances of `$name` in `replacement` to the corresponding /// capture group `name`, and writes them to the `dst` buffer given. /// - /// `name` may be an integer corresponding to the index of the - /// capture group (counted by order of opening parenthesis where `0` is the + /// `name` may be an integer corresponding to the index of the capture + /// group (counted by order of opening parenthesis where `0` is the /// entire match) or it can be a name (consisting of letters, digits or /// underscores) corresponding to a named capture group. /// /// If `name` isn't a valid capture group (whether the name doesn't exist /// or isn't a valid index), then it is replaced with the empty string. /// - /// The longest possible name is used. e.g., `$1a` looks up the capture - /// group named `1a` and not the capture group at index `1`. To exert more - /// precise control over the name, use braces, e.g., `${1}a`. + /// The longest possible name consisting of the characters `[_0-9A-Za-z]` + /// is used. e.g., `$1a` looks up the capture group named `1a` and not the + /// capture group at index `1`. To exert more precise control over the + /// name, or to refer to a capture group name that uses characters outside + /// of `[_0-9A-Za-z]`, use braces, e.g., `${1}a` or `${foo[bar].baz}`. When + /// using braces, any sequence of characters is permitted. If the sequence + /// does not refer to a capture group name in the corresponding regex, then + /// it is replaced with an empty string. /// /// To write a literal `$` use `$$`. pub fn expand(&self, replacement: &str, dst: &mut String) { @@ -1053,6 +1059,7 @@ impl<'t, 'i> Index<&'i str> for Captures<'t> { /// /// The lifetime `'c` corresponds to the lifetime of the `Captures` value, and /// the lifetime `'t` corresponds to the originally matched text. +#[derive(Clone)] pub struct SubCaptureMatches<'c, 't: 'c> { caps: &'c Captures<'t>, it: SubCapturesPosIter<'c>, @@ -1122,7 +1129,7 @@ pub trait Replacer { /// have a match at capture group `0`. /// /// For example, a no-op replacement would be - /// `dst.extend(caps.get(0).unwrap().as_str())`. + /// `dst.push_str(caps.get(0).unwrap().as_str())`. fn replace_append(&mut self, caps: &Captures, dst: &mut String); /// Return a fixed unchanging replacement string. |