about | summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/compile.rs | 2
-rw-r--r-- src/dfa.rs | 12
-rw-r--r-- src/expand.rs | 50
-rw-r--r-- src/lib.rs | 8
-rw-r--r-- src/pikevm.rs | 2
-rw-r--r-- src/re_bytes.rs | 19
-rw-r--r-- src/re_set.rs | 18
-rw-r--r-- src/re_trait.rs | 1
-rw-r--r-- src/re_unicode.rs | 21
9 files changed, 92 insertions, 41 deletions
diff --git a/src/compile.rs b/src/compile.rs
index ad54040..cdc583c 100644
--- a/src/compile.rs
+++ b/src/compile.rs
@@ -222,7 +222,7 @@ impl Compiler {
/// hole
/// ```
///
- /// To compile two expressions, e1 and e2, concatinated together we
+ /// To compile two expressions, e1 and e2, concatenated together we
/// would do:
///
/// ```ignore
diff --git a/src/dfa.rs b/src/dfa.rs
index decc3b9..2a365ee 100644
--- a/src/dfa.rs
+++ b/src/dfa.rs
@@ -679,7 +679,7 @@ impl<'a> Fsm<'a> {
}
} else if next_si & STATE_START > 0 {
// A start state isn't in the common case because we may
- // what to do quick prefix scanning. If the program doesn't
+ // want to do quick prefix scanning. If the program doesn't
// have a detected prefix, then start states are actually
// considered common and this case is never reached.
debug_assert!(self.has_prefix());
@@ -725,7 +725,7 @@ impl<'a> Fsm<'a> {
}
}
- // Run the DFA once more on the special EOF senitnel value.
+ // Run the DFA once more on the special EOF sentinel value.
// We don't care about the special bits in the state pointer any more,
// so get rid of them.
prev_si &= STATE_MAX;
@@ -830,7 +830,7 @@ impl<'a> Fsm<'a> {
}
}
- // Run the DFA once more on the special EOF senitnel value.
+ // Run the DFA once more on the special EOF sentinel value.
prev_si = match self.next_state(qcur, qnext, prev_si, Byte::eof()) {
None => return Result::Quit,
Some(STATE_DEAD) => return result.set_non_match(0),
@@ -913,8 +913,8 @@ impl<'a> Fsm<'a> {
if self.state(si).flags().has_empty() {
// Compute the flags immediately preceding the current byte.
// This means we only care about the "end" or "end line" flags.
- // (The "start" flags are computed immediately proceding the
- // current byte and is handled below.)
+ // (The "start" flags are computed immediately following the
+ // current byte and are handled below.)
let mut flags = EmptyFlags::default();
if b.is_eof() {
flags.end = true;
@@ -1048,7 +1048,7 @@ impl<'a> Fsm<'a> {
///
/// If matching starts after the beginning of the input, then only start
/// line should be set if the preceding byte is `\n`. End line should never
- /// be set in this case. (Even if the proceding byte is a `\n`, it will
+ /// be set in this case. (Even if the following byte is a `\n`, it will
/// be handled in a subsequent DFA state.)
fn follow_epsilons(
&mut self,
diff --git a/src/expand.rs b/src/expand.rs
index 528f55e..fd2ab03 100644
--- a/src/expand.rs
+++ b/src/expand.rs
@@ -24,7 +24,7 @@ pub fn expand_str(
continue;
}
debug_assert!(!replacement.is_empty());
- let cap_ref = match find_cap_ref(replacement) {
+ let cap_ref = match find_cap_ref(replacement.as_bytes()) {
Some(cap_ref) => cap_ref,
None => {
dst.push_str("$");
@@ -125,19 +125,15 @@ impl From<usize> for Ref<'static> {
/// starting at the beginning of `replacement`.
///
/// If no such valid reference could be found, None is returned.
-fn find_cap_ref<T: ?Sized + AsRef<[u8]>>(
- replacement: &T,
-) -> Option<CaptureRef> {
+fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef> {
let mut i = 0;
let rep: &[u8] = replacement.as_ref();
if rep.len() <= 1 || rep[0] != b'$' {
return None;
}
- let mut brace = false;
i += 1;
if rep[i] == b'{' {
- brace = true;
- i += 1;
+ return find_cap_ref_braced(rep, i + 1);
}
let mut cap_end = i;
while rep.get(cap_end).map_or(false, is_valid_cap_letter) {
@@ -151,12 +147,6 @@ fn find_cap_ref<T: ?Sized + AsRef<[u8]>>(
// check with either unsafe or by parsing the number straight from &[u8].
let cap =
str::from_utf8(&rep[i..cap_end]).expect("valid UTF-8 capture name");
- if brace {
- if !rep.get(cap_end).map_or(false, |&b| b == b'}') {
- return None;
- }
- cap_end += 1;
- }
Some(CaptureRef {
cap: match cap.parse::<u32>() {
Ok(i) => Ref::Number(i as usize),
@@ -166,6 +156,31 @@ fn find_cap_ref<T: ?Sized + AsRef<[u8]>>(
})
}
+fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef> {
+ let start = i;
+ while rep.get(i).map_or(false, |&b| b != b'}') {
+ i += 1;
+ }
+ if !rep.get(i).map_or(false, |&b| b == b'}') {
+ return None;
+ }
+ // When looking at braced names, we don't put any restrictions on the name,
+ // so it's possible it could be invalid UTF-8. But a capture group name
+ // can never be invalid UTF-8, so if we have invalid UTF-8, then we can
+ // safely return None.
+ let cap = match str::from_utf8(&rep[start..i]) {
+ Err(_) => return None,
+ Ok(cap) => cap,
+ };
+ Some(CaptureRef {
+ cap: match cap.parse::<u32>() {
+ Ok(i) => Ref::Number(i as usize),
+ Err(_) => Ref::Named(cap),
+ },
+ end: i + 1,
+ })
+}
+
/// Returns true if and only if the given byte is allowed in a capture name.
fn is_valid_cap_letter(b: &u8) -> bool {
match *b {
@@ -182,13 +197,13 @@ mod tests {
($name:ident, $text:expr) => {
#[test]
fn $name() {
- assert_eq!(None, find_cap_ref($text));
+ assert_eq!(None, find_cap_ref($text.as_bytes()));
}
};
($name:ident, $text:expr, $capref:expr) => {
#[test]
fn $name() {
- assert_eq!(Some($capref), find_cap_ref($text));
+ assert_eq!(Some($capref), find_cap_ref($text.as_bytes()));
}
};
}
@@ -204,7 +219,8 @@ mod tests {
find!(find_cap_ref3, "$0", c!(0, 2));
find!(find_cap_ref4, "$5", c!(5, 2));
find!(find_cap_ref5, "$10", c!(10, 3));
- // see https://github.com/rust-lang/regex/pull/585 for more on characters following numbers
+ // See https://github.com/rust-lang/regex/pull/585
+ // for more on characters following numbers
find!(find_cap_ref6, "$42a", c!("42a", 4));
find!(find_cap_ref7, "${42}a", c!(42, 5));
find!(find_cap_ref8, "${42");
@@ -217,4 +233,6 @@ mod tests {
find!(find_cap_ref15, "$1_$2", c!("1_", 3));
find!(find_cap_ref16, "$x-$y", c!("x", 2));
find!(find_cap_ref17, "$x_$y", c!("x_", 3));
+ find!(find_cap_ref18, "${#}", c!("#", 4));
+ find!(find_cap_ref19, "${Z[}", c!("Z[", 5));
}
diff --git a/src/lib.rs b/src/lib.rs
index e0a0975..bdcebd4 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -365,7 +365,7 @@ $ the end of text (or end-of-line with multi-line mode)
<pre class="rust">
(exp) numbered capture group (indexed by opening parenthesis)
-(?P&lt;name&gt;exp) named (also numbered) capture group (allowed chars: [_0-9a-zA-Z])
+(?P&lt;name&gt;exp) named (also numbered) capture group (allowed chars: [_0-9a-zA-Z.\[\]])
(?:exp) non-capturing group
(?flags) set flags within current group
(?flags:exp) set flags for exp (non-capturing)
@@ -562,7 +562,7 @@ All features below are enabled by default.
[Unicode's "simple loose matches" specification](https://www.unicode.org/reports/tr18/#Simple_Loose_Matches).
* **unicode-gencat** -
Provide the data for
- [Uncode general categories](https://www.unicode.org/reports/tr44/tr44-24.html#General_Category_Values).
+ [Unicode general categories](https://www.unicode.org/reports/tr44/tr44-24.html#General_Category_Values).
This includes, but is not limited to, `Decimal_Number`, `Letter`,
`Math_Symbol`, `Number` and `Punctuation`.
* **unicode-perl** -
@@ -731,8 +731,8 @@ Unicode codepoints. For example, in ASCII compatible mode, `\xFF` matches the
literal byte `\xFF`, while in Unicode mode, `\xFF` is a Unicode codepoint that
matches its UTF-8 encoding of `\xC3\xBF`. Similarly for octal notation when
enabled.
-6. `.` matches any *byte* except for `\n` instead of any Unicode scalar value.
-When the `s` flag is enabled, `.` matches any byte.
+6. In ASCII compatible mode, `.` matches any *byte* except for `\n`. When the
+`s` flag is additionally enabled, `.` matches any byte.
# Performance
diff --git a/src/pikevm.rs b/src/pikevm.rs
index c106c76..299087d 100644
--- a/src/pikevm.rs
+++ b/src/pikevm.rs
@@ -8,7 +8,7 @@
//
// It can do more than the DFA can (specifically, record capture locations
// and execute Unicode word boundary assertions), but at a slower speed.
-// Specifically, the Pike VM exectues a DFA implicitly by repeatedly expanding
+// Specifically, the Pike VM executes a DFA implicitly by repeatedly expanding
// epsilon transitions. That is, the Pike VM engine can be in multiple states
// at once whereas the DFA is only ever in one state at a time.
//
diff --git a/src/re_bytes.rs b/src/re_bytes.rs
index 69f0b33..ca01e0e 100644
--- a/src/re_bytes.rs
+++ b/src/re_bytes.rs
@@ -119,7 +119,8 @@ impl Regex {
RegexBuilder::new(re).build()
}
- /// Returns true if and only if the regex matches the string given.
+ /// Returns true if and only if there is a match for the regex in the
+ /// string given.
///
/// It is recommended to use this method if all you need to do is test
/// a match, since the underlying matching engine may be able to do less
@@ -930,17 +931,22 @@ impl<'t> Captures<'t> {
/// Expands all instances of `$name` in `replacement` to the corresponding
/// capture group `name`, and writes them to the `dst` buffer given.
///
- /// `name` may be an integer corresponding to the index of the
- /// capture group (counted by order of opening parenthesis where `0` is the
+ /// `name` may be an integer corresponding to the index of the capture
+ /// group (counted by order of opening parenthesis where `0` is the
/// entire match) or it can be a name (consisting of letters, digits or
/// underscores) corresponding to a named capture group.
///
/// If `name` isn't a valid capture group (whether the name doesn't exist
/// or isn't a valid index), then it is replaced with the empty string.
///
- /// The longest possible name is used. e.g., `$1a` looks up the capture
- /// group named `1a` and not the capture group at index `1`. To exert more
- /// precise control over the name, use braces, e.g., `${1}a`.
+ /// The longest possible name consisting of the characters `[_0-9A-Za-z]`
+ /// is used. e.g., `$1a` looks up the capture group named `1a` and not the
+ /// capture group at index `1`. To exert more precise control over the
+ /// name, or to refer to a capture group name that uses characters outside
+ /// of `[_0-9A-Za-z]`, use braces, e.g., `${1}a` or `${foo[bar].baz}`. When
+ /// using braces, any sequence of valid UTF-8 bytes is permitted. If the
+ /// sequence does not refer to a capture group name in the corresponding
+ /// regex, then it is replaced with an empty string.
///
/// To write a literal `$` use `$$`.
pub fn expand(&self, replacement: &[u8], dst: &mut Vec<u8>) {
@@ -1051,6 +1057,7 @@ impl<'t, 'i> Index<&'i str> for Captures<'t> {
///
/// The lifetime `'c` corresponds to the lifetime of the `Captures` value, and
/// the lifetime `'t` corresponds to the originally matched text.
+#[derive(Clone)]
pub struct SubCaptureMatches<'c, 't: 'c> {
caps: &'c Captures<'t>,
it: SubCapturesPosIter<'c>,
diff --git a/src/re_set.rs b/src/re_set.rs
index fc2b61a..b8954be 100644
--- a/src/re_set.rs
+++ b/src/re_set.rs
@@ -96,6 +96,19 @@ impl RegexSet {
RegexSetBuilder::new(exprs).build()
}
+ /// Create a new empty regex set.
+ ///
+ /// # Example
+ ///
+ /// ```rust
+ /// # use regex::RegexSet;
+ /// let set = RegexSet::empty();
+ /// assert!(set.is_empty());
+ /// ```
+ pub fn empty() -> RegexSet {
+ RegexSetBuilder::new(&[""; 0]).build().unwrap()
+ }
+
/// Returns true if and only if one of the regexes in this set matches
/// the text given.
///
@@ -207,6 +220,11 @@ impl RegexSet {
self.0.regex_strings().len()
}
+ /// Returns `true` if this set contains no regular expressions.
+ pub fn is_empty(&self) -> bool {
+ self.0.regex_strings().is_empty()
+ }
+
/// Returns the patterns that this set will match on.
///
/// This function can be used to determine the pattern for a match. The
diff --git a/src/re_trait.rs b/src/re_trait.rs
index b56804e..d14a9f7 100644
--- a/src/re_trait.rs
+++ b/src/re_trait.rs
@@ -51,6 +51,7 @@ impl Locations {
/// Positions are byte indices in terms of the original string matched.
///
/// `'c` is the lifetime of the captures.
+#[derive(Clone)]
pub struct SubCapturesPosIter<'c> {
idx: usize,
locs: &'c Locations,
diff --git a/src/re_unicode.rs b/src/re_unicode.rs
index b746599..ea95c1b 100644
--- a/src/re_unicode.rs
+++ b/src/re_unicode.rs
@@ -175,7 +175,8 @@ impl Regex {
RegexBuilder::new(re).build()
}
- /// Returns true if and only if the regex matches the string given.
+ /// Returns true if and only if there is a match for the regex in the
+ /// string given.
///
/// It is recommended to use this method if all you need to do is test
/// a match, since the underlying matching engine may be able to do less
@@ -947,17 +948,22 @@ impl<'t> Captures<'t> {
/// Expands all instances of `$name` in `replacement` to the corresponding
/// capture group `name`, and writes them to the `dst` buffer given.
///
- /// `name` may be an integer corresponding to the index of the
- /// capture group (counted by order of opening parenthesis where `0` is the
+ /// `name` may be an integer corresponding to the index of the capture
+ /// group (counted by order of opening parenthesis where `0` is the
/// entire match) or it can be a name (consisting of letters, digits or
/// underscores) corresponding to a named capture group.
///
/// If `name` isn't a valid capture group (whether the name doesn't exist
/// or isn't a valid index), then it is replaced with the empty string.
///
- /// The longest possible name is used. e.g., `$1a` looks up the capture
- /// group named `1a` and not the capture group at index `1`. To exert more
- /// precise control over the name, use braces, e.g., `${1}a`.
+ /// The longest possible name consisting of the characters `[_0-9A-Za-z]`
+ /// is used. e.g., `$1a` looks up the capture group named `1a` and not the
+ /// capture group at index `1`. To exert more precise control over the
+ /// name, or to refer to a capture group name that uses characters outside
+ /// of `[_0-9A-Za-z]`, use braces, e.g., `${1}a` or `${foo[bar].baz}`. When
+ /// using braces, any sequence of characters is permitted. If the sequence
+ /// does not refer to a capture group name in the corresponding regex, then
+ /// it is replaced with an empty string.
///
/// To write a literal `$` use `$$`.
pub fn expand(&self, replacement: &str, dst: &mut String) {
@@ -1053,6 +1059,7 @@ impl<'t, 'i> Index<&'i str> for Captures<'t> {
///
/// The lifetime `'c` corresponds to the lifetime of the `Captures` value, and
/// the lifetime `'t` corresponds to the originally matched text.
+#[derive(Clone)]
pub struct SubCaptureMatches<'c, 't: 'c> {
caps: &'c Captures<'t>,
it: SubCapturesPosIter<'c>,
@@ -1122,7 +1129,7 @@ pub trait Replacer {
/// have a match at capture group `0`.
///
/// For example, a no-op replacement would be
- /// `dst.extend(caps.get(0).unwrap().as_str())`.
+ /// `dst.push_str(caps.get(0).unwrap().as_str())`.
fn replace_append(&mut self, caps: &Captures, dst: &mut String);
/// Return a fixed unchanging replacement string.