diff options
author | Android Build Coastguard Worker <android-build-coastguard-worker@google.com> | 2023-07-07 04:56:16 +0000 |
---|---|---|
committer | Android Build Coastguard Worker <android-build-coastguard-worker@google.com> | 2023-07-07 04:56:16 +0000 |
commit | e8b5199da18c5dd36f098d949c5c48848608479d (patch) | |
tree | d050b095d3d08bc7800c98bf5fe9e2ec15a35921 | |
parent | 8b9fd94e03a63accec93e745868ff863f2d01f07 (diff) | |
parent | 14bea0e36ba5a038a98be2269a0450fe90c9d229 (diff) | |
download | regex-android14-mainline-media-release.tar.gz |
Snap for 10453563 from 14bea0e36ba5a038a98be2269a0450fe90c9d229 to mainline-media-release (refs: aml_med_341711000, aml_med_341619000, aml_med_341513600, aml_med_341312300, aml_med_341312020, aml_med_341111000, aml_med_341011000, aml_med_340922010, android14-mainline-media-release)
Change-Id: I0f3dde497dbd40f50e6a8a6e222c144de99d58c9
-rw-r--r-- | .cargo_vcs_info.json | 7 | ||||
-rw-r--r-- | Android.bp | 24 | ||||
-rw-r--r-- | CHANGELOG.md | 97 | ||||
-rw-r--r-- | Cargo.toml | 61 | ||||
-rw-r--r-- | Cargo.toml.orig | 4 | ||||
-rw-r--r-- | METADATA | 14 | ||||
-rw-r--r-- | README.md | 10 | ||||
-rw-r--r-- | TEST_MAPPING | 31 | ||||
-rw-r--r-- | cargo2android.json | 3 | ||||
-rw-r--r-- | src/backtrack.rs | 12 | ||||
-rw-r--r-- | src/compile.rs | 84 | ||||
-rw-r--r-- | src/dfa.rs | 40 | ||||
-rw-r--r-- | src/exec.rs | 18 | ||||
-rw-r--r-- | src/expand.rs | 8 | ||||
-rw-r--r-- | src/input.rs | 4 | ||||
-rw-r--r-- | src/lib.rs | 4 | ||||
-rw-r--r-- | src/literal/imp.rs | 4 | ||||
-rw-r--r-- | src/pattern.rs | 2 | ||||
-rw-r--r-- | src/pikevm.rs | 2 | ||||
-rw-r--r-- | src/prog.rs | 2 | ||||
-rw-r--r-- | src/re_bytes.rs | 24 | ||||
-rw-r--r-- | src/re_set.rs | 44 | ||||
-rw-r--r-- | src/re_trait.rs | 13 | ||||
-rw-r--r-- | src/re_unicode.rs | 38 | ||||
-rw-r--r-- | src/utf8.rs | 2 | ||||
-rw-r--r-- | tests/regression.rs | 3 | ||||
-rw-r--r-- | tests/replace.rs | 18 | ||||
-rw-r--r-- | tests/test_default.rs | 80 | ||||
-rw-r--r-- | tests/unicode.rs | 17 |
29 files changed, 490 insertions, 180 deletions
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json index 51b3cd6..a82e282 100644 --- a/.cargo_vcs_info.json +++ b/.cargo_vcs_info.json @@ -1,5 +1,6 @@ { "git": { - "sha1": "f2dc1b788f773a49f1b6633a6302054978344452" - } -} + "sha1": "9582040009820380a16819ca0d1ae262c7d454b0" + }, + "path_in_vcs": "" +}
\ No newline at end of file @@ -43,7 +43,7 @@ rust_library { host_supported: true, crate_name: "regex", cargo_env_compat: true, - cargo_pkg_version: "1.5.4", + cargo_pkg_version: "1.7.3", srcs: ["src/lib.rs"], edition: "2018", features: [ @@ -75,6 +75,8 @@ rust_library { "com.android.compos", "com.android.virt", ], + product_available: true, + vendor_available: true, } rust_test { @@ -82,7 +84,7 @@ rust_test { host_supported: true, crate_name: "regex", cargo_env_compat: true, - cargo_pkg_version: "1.5.4", + cargo_pkg_version: "1.7.3", srcs: ["src/lib.rs"], test_suites: ["general-tests"], auto_gen_config: true, @@ -124,7 +126,7 @@ rust_test { host_supported: true, crate_name: "backtrack", cargo_env_compat: true, - cargo_pkg_version: "1.5.4", + cargo_pkg_version: "1.7.3", srcs: ["tests/test_backtrack.rs"], test_suites: ["general-tests"], auto_gen_config: true, @@ -167,7 +169,7 @@ rust_test { host_supported: true, crate_name: "backtrack_bytes", cargo_env_compat: true, - cargo_pkg_version: "1.5.4", + cargo_pkg_version: "1.7.3", srcs: ["tests/test_backtrack_bytes.rs"], test_suites: ["general-tests"], auto_gen_config: true, @@ -210,7 +212,7 @@ rust_test { host_supported: true, crate_name: "backtrack_utf8bytes", cargo_env_compat: true, - cargo_pkg_version: "1.5.4", + cargo_pkg_version: "1.7.3", srcs: ["tests/test_backtrack_utf8bytes.rs"], test_suites: ["general-tests"], auto_gen_config: true, @@ -253,7 +255,7 @@ rust_test { host_supported: true, crate_name: "crates_regex", cargo_env_compat: true, - cargo_pkg_version: "1.5.4", + cargo_pkg_version: "1.7.3", srcs: ["tests/test_crates_regex.rs"], test_suites: ["general-tests"], auto_gen_config: true, @@ -296,7 +298,7 @@ rust_test { host_supported: true, crate_name: "default", cargo_env_compat: true, - cargo_pkg_version: "1.5.4", + cargo_pkg_version: "1.7.3", srcs: ["tests/test_default.rs"], test_suites: ["general-tests"], auto_gen_config: true, @@ -339,7 +341,7 @@ rust_test { host_supported: true, crate_name: "default_bytes", 
cargo_env_compat: true, - cargo_pkg_version: "1.5.4", + cargo_pkg_version: "1.7.3", srcs: ["tests/test_default_bytes.rs"], test_suites: ["general-tests"], auto_gen_config: true, @@ -382,7 +384,7 @@ rust_test { host_supported: true, crate_name: "nfa", cargo_env_compat: true, - cargo_pkg_version: "1.5.4", + cargo_pkg_version: "1.7.3", srcs: ["tests/test_nfa.rs"], test_suites: ["general-tests"], auto_gen_config: true, @@ -425,7 +427,7 @@ rust_test { host_supported: true, crate_name: "nfa_bytes", cargo_env_compat: true, - cargo_pkg_version: "1.5.4", + cargo_pkg_version: "1.7.3", srcs: ["tests/test_nfa_bytes.rs"], test_suites: ["general-tests"], auto_gen_config: true, @@ -468,7 +470,7 @@ rust_test { host_supported: true, crate_name: "nfa_utf8bytes", cargo_env_compat: true, - cargo_pkg_version: "1.5.4", + cargo_pkg_version: "1.7.3", srcs: ["tests/test_nfa_utf8bytes.rs"], test_suites: ["general-tests"], auto_gen_config: true, diff --git a/CHANGELOG.md b/CHANGELOG.md index 71d1963..44274ac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,98 @@ +1.7.3 (2023-03-24) +================== +This is a small release that fixes a bug in `Regex::shortest_match_at` that +could cause it to panic, even when the offset given is valid. + +Bug fixes: + +* [BUG #969](https://github.com/rust-lang/regex/issues/969): + Fix a bug in how the reverse DFA was called for `Regex::shortest_match_at`. + + +1.7.2 (2023-03-21) +================== +This is a small release that fixes a failing test on FreeBSD. + +Bug fixes: + +* [BUG #967](https://github.com/rust-lang/regex/issues/967): + Fix "no stack overflow" test which can fail due to the small stack size. + + +1.7.1 (2023-01-09) +================== +This release was done principally to try and fix the doc.rs rendering for the +regex crate. + +Performance improvements: + +* [PERF #930](https://github.com/rust-lang/regex/pull/930): + Optimize `replacen`. This also applies to `replace`, but not `replace_all`. 
+ +Bug fixes: + +* [BUG #945](https://github.com/rust-lang/regex/issues/945): + Maybe fix rustdoc rendering by just bumping a new release? + + +1.7.0 (2022-11-05) +================== +This release principally includes an upgrade to Unicode 15. + +New features: + +* [FEATURE #832](https://github.com/rust-lang/regex/issues/916): + Upgrade to Unicode 15. + + +1.6.0 (2022-07-05) +================== +This release principally includes an upgrade to Unicode 14. + +New features: + +* [FEATURE #832](https://github.com/rust-lang/regex/pull/832): + Clarify that `Captures::len` includes all groups, not just matching groups. +* [FEATURE #857](https://github.com/rust-lang/regex/pull/857): + Add an `ExactSizeIterator` impl for `SubCaptureMatches`. +* [FEATURE #861](https://github.com/rust-lang/regex/pull/861): + Improve `RegexSet` documentation examples. +* [FEATURE #877](https://github.com/rust-lang/regex/issues/877): + Upgrade to Unicode 14. + +Bug fixes: + +* [BUG #792](https://github.com/rust-lang/regex/issues/792): + Fix error message rendering bug. + + +1.5.6 (2022-05-20) +================== +This release includes a few bug fixes, including a bug that produced incorrect +matches when a non-greedy `?` operator was used. + +* [BUG #680](https://github.com/rust-lang/regex/issues/680): + Fixes a bug where `[[:alnum:][:^ascii:]]` dropped `[:alnum:]` from the class. +* [BUG #859](https://github.com/rust-lang/regex/issues/859): + Fixes a bug where `Hir::is_match_empty` returned `false` for `\b`. +* [BUG #862](https://github.com/rust-lang/regex/issues/862): + Fixes a bug where 'ab??' matches 'ab' instead of 'a' in 'ab'. + + +1.5.5 (2022-03-08) +================== +This releases fixes a security bug in the regex compiler. This bug permits a +vector for a denial-of-service attack in cases where the regex being compiled +is untrusted. There are no known problems where the regex is itself trusted, +including in cases of untrusted haystacks. 
+ +* [SECURITY #GHSA-m5pq-gvj9-9vr8](https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8): + Fixes a bug in the regex compiler where empty sub-expressions subverted the + existing mitigations in place to enforce a size limit on compiled regexes. + The Rust Security Response WG published an advisory about this: + https://groups.google.com/g/rustlang-security-announcements/c/NcNNL1Jq7Yw + + 1.5.4 (2021-05-06) ================== This release fixes another compilation failure when building regex. This time, @@ -669,7 +764,7 @@ New features: * Empty sub-expressions are now permitted in most places. That is, `()+` is now a valid regex. * Almost everything in regex-syntax now uses constant stack space, even when - performing anaylsis that requires structural induction. This reduces the risk + performing analysis that requires structural induction. This reduces the risk of a user provided regular expression causing a stack overflow. * [FEATURE #174](https://github.com/rust-lang/regex/issues/174): The `Ast` type in `regex-syntax` now contains span information. @@ -3,27 +3,33 @@ # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies -# to registry (e.g., crates.io) dependencies +# to registry (e.g., crates.io) dependencies. # -# If you believe there's an error in this file please file an -# issue against the rust-lang/cargo repository. If you're -# editing this file be aware that the upstream Cargo.toml -# will likely look very different (and much more reasonable) +# If you are reading this file be aware that the original Cargo.toml +# will likely look very different (and much more reasonable). +# See Cargo.toml.orig for the original contents. 
[package] edition = "2018" name = "regex" -version = "1.5.4" +version = "1.7.3" authors = ["The Rust Project Developers"] -exclude = ["/scripts/*", "/.github/*"] +exclude = [ + "/scripts/*", + "/.github/*", +] autotests = false -description = "An implementation of regular expressions for Rust. This implementation uses\nfinite automata and guarantees linear time matching on all inputs.\n" +description = """ +An implementation of regular expressions for Rust. This implementation uses +finite automata and guarantees linear time matching on all inputs. +""" homepage = "https://github.com/rust-lang/regex" documentation = "https://docs.rs/regex" readme = "README.md" categories = ["text-processing"] license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex" + [profile.bench] debug = true @@ -72,6 +78,7 @@ path = "tests/test_backtrack_bytes.rs" [[test]] name = "crates-regex" path = "tests/test_crates_regex.rs" + [dependencies.aho-corasick] version = "0.7.18" optional = true @@ -81,8 +88,9 @@ version = "2.4.0" optional = true [dependencies.regex-syntax] -version = "0.6.25" +version = "0.6.29" default-features = false + [dev-dependencies.lazy_static] version = "1" @@ -92,19 +100,44 @@ default-features = false [dev-dependencies.rand] version = "0.8.3" -features = ["getrandom", "small_rng"] +features = [ + "getrandom", + "small_rng", +] default-features = false [features] -default = ["std", "perf", "unicode", "regex-syntax/default"] +default = [ + "std", + "perf", + "unicode", + "regex-syntax/default", +] pattern = [] -perf = ["perf-cache", "perf-dfa", "perf-inline", "perf-literal"] +perf = [ + "perf-cache", + "perf-dfa", + "perf-inline", + "perf-literal", +] perf-cache = [] perf-dfa = [] perf-inline = [] -perf-literal = ["aho-corasick", "memchr"] +perf-literal = [ + "aho-corasick", + "memchr", +] std = [] -unicode = ["unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment", 
"regex-syntax/unicode"] +unicode = [ + "unicode-age", + "unicode-bool", + "unicode-case", + "unicode-gencat", + "unicode-perl", + "unicode-script", + "unicode-segment", + "regex-syntax/unicode", +] unicode-age = ["regex-syntax/unicode-age"] unicode-bool = ["regex-syntax/unicode-bool"] unicode-case = ["regex-syntax/unicode-case"] diff --git a/Cargo.toml.orig b/Cargo.toml.orig index 468230b..4c5bd1c 100644 --- a/Cargo.toml.orig +++ b/Cargo.toml.orig @@ -1,6 +1,6 @@ [package] name = "regex" -version = "1.5.4" #:version +version = "1.7.3" #:version authors = ["The Rust Project Developers"] license = "MIT OR Apache-2.0" readme = "README.md" @@ -117,7 +117,7 @@ optional = true # For parsing regular expressions. [dependencies.regex-syntax] path = "regex-syntax" -version = "0.6.25" +version = "0.6.29" default-features = false [dev-dependencies] @@ -1,3 +1,7 @@ +# This project was upgraded with external_updater. +# Usage: tools/external_updater/updater.sh update rust/crates/regex +# For more info, check https://cs.android.com/android/platform/superproject/+/master:tools/external_updater/README.md + name: "regex" description: "An implementation of regular expressions for Rust. This implementation uses finite automata and guarantees linear time matching on all inputs." third_party { @@ -7,13 +11,13 @@ third_party { } url { type: ARCHIVE - value: "https://static.crates.io/crates/regex/regex-1.5.4.crate" + value: "https://static.crates.io/crates/regex/regex-1.7.3.crate" } - version: "1.5.4" + version: "1.7.3" license_type: NOTICE last_upgrade_date { - year: 2021 - month: 5 - day: 19 + year: 2023 + month: 4 + day: 3 } } @@ -8,7 +8,7 @@ Much of the syntax and implementation is inspired by [RE2](https://github.com/google/re2). 
[![Build status](https://github.com/rust-lang/regex/workflows/ci/badge.svg)](https://github.com/rust-lang/regex/actions) -[![](https://meritbadge.herokuapp.com/regex)](https://crates.io/crates/regex) +[![Crates.io](https://img.shields.io/crates/v/regex.svg)](https://crates.io/crates/regex) [![Rust](https://img.shields.io/badge/rust-1.41.1%2B-blue.svg?maxAge=3600)](https://github.com/rust-lang/regex) ### Documentation @@ -23,12 +23,8 @@ can be found on the ### Usage -Add this to your `Cargo.toml`: - -```toml -[dependencies] -regex = "1.5" -``` +To bring this crate into your repository, either add `regex` to your +`Cargo.toml`, or run `cargo add regex`. Here's a simple example that matches a date in YYYY-MM-DD format and prints the year, month and day: diff --git a/TEST_MAPPING b/TEST_MAPPING index 6064d70..c99af76 100644 --- a/TEST_MAPPING +++ b/TEST_MAPPING @@ -5,6 +5,9 @@ "path": "external/rust/crates/base64" }, { + "path": "external/rust/crates/clap/2.33.3" + }, + { "path": "external/rust/crates/libsqlite3-sys" }, { @@ -18,16 +21,22 @@ }, { "path": "external/rust/crates/unicode-xid" - } - ], - "presubmit": [ + }, + { + "path": "packages/modules/Virtualization/virtualizationmanager" + }, { - "name": "keystore2_test" + "path": "system/keymint/hal" }, { - "name": "legacykeystore_test" + "path": "system/security/keystore2" }, { + "path": "system/security/keystore2/legacykeystore" + } + ], + "presubmit": [ + { "name": "regex_test_src_lib" }, { @@ -56,19 +65,10 @@ }, { "name": "regex_test_tests_test_nfa_utf8bytes" - }, - { - "name": "virtualizationservice_device_test" } ], "presubmit-rust": [ { - "name": "keystore2_test" - }, - { - "name": "legacykeystore_test" - }, - { "name": "regex_test_src_lib" }, { @@ -97,9 +97,6 @@ }, { "name": "regex_test_tests_test_nfa_utf8bytes" - }, - { - "name": "virtualizationservice_device_test" } ] } diff --git a/cargo2android.json b/cargo2android.json index 0e54308..bef74ca 100644 --- a/cargo2android.json +++ b/cargo2android.json @@ -7,5 
+7,6 @@ "dependencies": true, "device": true, "run": true, - "tests": true + "tests": true, + "vendor-available": true } diff --git a/src/backtrack.rs b/src/backtrack.rs index a3d25d6..4d83856 100644 --- a/src/backtrack.rs +++ b/src/backtrack.rs @@ -93,13 +93,7 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> { let mut cache = cache.borrow_mut(); let cache = &mut cache.backtrack; let start = input.at(start); - let mut b = Bounded { - prog: prog, - input: input, - matches: matches, - slots: slots, - m: cache, - }; + let mut b = Bounded { prog, input, matches, slots, m: cache }; b.exec_(start, end) } @@ -220,14 +214,14 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> { // job is popped and the old capture index is restored. self.m.jobs.push(Job::SaveRestore { slot: inst.slot, - old_pos: old_pos, + old_pos, }); self.slots[inst.slot] = Some(at.pos()); } ip = inst.goto; } Split(ref inst) => { - self.m.jobs.push(Job::Inst { ip: inst.goto2, at: at }); + self.m.jobs.push(Job::Inst { ip: inst.goto2, at }); ip = inst.goto1; } EmptyLook(ref inst) => { diff --git a/src/compile.rs b/src/compile.rs index 9a2ed5e..90ca250 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -38,6 +38,16 @@ pub struct Compiler { suffix_cache: SuffixCache, utf8_seqs: Option<Utf8Sequences>, byte_classes: ByteClassSet, + // This keeps track of extra bytes allocated while compiling the regex + // program. Currently, this corresponds to two things. First is the heap + // memory allocated by Unicode character classes ('InstRanges'). Second is + // a "fake" amount of memory used by empty sub-expressions, so that enough + // empty sub-expressions will ultimately trigger the compiler to bail + // because of a size limit restriction. (That empty sub-expressions don't + // add to heap memory usage is more-or-less an implementation detail.) 
In + // the second case, if we don't bail, then an excessively large repetition + // on an empty sub-expression can result in the compiler using a very large + // amount of CPU time. extra_inst_bytes: usize, } @@ -139,7 +149,8 @@ impl Compiler { self.compiled.start = dotstar_patch.entry; } self.compiled.captures = vec![None]; - let patch = self.c_capture(0, expr)?.unwrap_or(self.next_inst()); + let patch = + self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst()); if self.compiled.needs_dotstar() { self.fill(dotstar_patch.hole, patch.entry); } else { @@ -175,7 +186,7 @@ impl Compiler { self.fill_to_next(prev_hole); let split = self.push_split_hole(); let Patch { hole, entry } = - self.c_capture(0, expr)?.unwrap_or(self.next_inst()); + self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst()); self.fill_to_next(hole); self.compiled.matches.push(self.insts.len()); self.push_compiled(Inst::Match(i)); @@ -183,7 +194,7 @@ impl Compiler { } let i = exprs.len() - 1; let Patch { hole, entry } = - self.c_capture(0, &exprs[i])?.unwrap_or(self.next_inst()); + self.c_capture(0, &exprs[i])?.unwrap_or_else(|| self.next_inst()); self.fill(prev_hole, entry); self.fill_to_next(hole); self.compiled.matches.push(self.insts.len()); @@ -260,7 +271,7 @@ impl Compiler { self.check_size()?; match *expr.kind() { - Empty => Ok(None), + Empty => self.c_empty(), Literal(hir::Literal::Unicode(c)) => self.c_char(c), Literal(hir::Literal::Byte(b)) => { assert!(self.compiled.uses_bytes()); @@ -378,6 +389,19 @@ impl Compiler { } } + fn c_empty(&mut self) -> ResultOrEmpty { + // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8 + // See: CVE-2022-24713 + // + // Since 'empty' sub-expressions don't increase the size of + // the actual compiled object, we "fake" an increase in its + // size so that our 'check_size_limit' routine will eventually + // stop compilation if there are too many empty sub-expressions + // (e.g., via a large repetition). 
+ self.extra_inst_bytes += std::mem::size_of::<Inst>(); + Ok(None) + } + fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty { if self.num_exprs > 1 || self.compiled.is_dfa { // Don't ever compile Save instructions for regex sets because @@ -387,11 +411,11 @@ impl Compiler { } else { let entry = self.insts.len(); let hole = self.push_hole(InstHole::Save { slot: first_slot }); - let patch = self.c(expr)?.unwrap_or(self.next_inst()); + let patch = self.c(expr)?.unwrap_or_else(|| self.next_inst()); self.fill(hole, patch.entry); self.fill_to_next(patch.hole); let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 }); - Ok(Some(Patch { hole: hole, entry: entry })) + Ok(Some(Patch { hole, entry })) } } @@ -425,7 +449,7 @@ impl Compiler { self.c_class(&[hir::ClassUnicodeRange::new(c, c)]) } } else { - let hole = self.push_hole(InstHole::Char { c: c }); + let hole = self.push_hole(InstHole::Char { c }); Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) } } @@ -435,7 +459,7 @@ impl Compiler { assert!(!ranges.is_empty()); if self.compiled.uses_bytes() { - Ok(Some(CompileClass { c: self, ranges: ranges }.compile()?)) + Ok(Some(CompileClass { c: self, ranges }.compile()?)) } else { let ranges: Vec<(char, char)> = ranges.iter().map(|r| (r.start(), r.end())).collect(); @@ -444,9 +468,9 @@ impl Compiler { } else { self.extra_inst_bytes += ranges.len() * (size_of::<char>() * 2); - self.push_hole(InstHole::Ranges { ranges: ranges }) + self.push_hole(InstHole::Ranges { ranges }) }; - Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 })) + Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) } } @@ -485,8 +509,8 @@ impl Compiler { } fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty { - let hole = self.push_hole(InstHole::EmptyLook { look: look }); - Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 })) + let hole = self.push_hole(InstHole::EmptyLook { look }); + Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) } fn 
c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty @@ -496,7 +520,7 @@ impl Compiler { let mut exprs = exprs.into_iter(); let Patch { mut hole, entry } = loop { match exprs.next() { - None => return Ok(None), + None => return self.c_empty(), Some(e) => { if let Some(p) = self.c(e)? { break p; @@ -510,7 +534,7 @@ impl Compiler { hole = p.hole; } } - Ok(Some(Patch { hole: hole, entry: entry })) + Ok(Some(Patch { hole, entry })) } fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty { @@ -653,7 +677,7 @@ impl Compiler { // None). let patch_concat = self .c_concat(iter::repeat(expr).take(min))? - .unwrap_or(self.next_inst()); + .unwrap_or_else(|| self.next_inst()); if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? { self.fill(patch_concat.hole, patch_rep.entry); Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry })) @@ -677,7 +701,7 @@ impl Compiler { } // Same reasoning as in c_repeat_range_min_or_more (we know that min < // max at this point). - let patch_concat = patch_concat.unwrap_or(self.next_inst()); + let patch_concat = patch_concat.unwrap_or_else(|| self.next_inst()); let initial_entry = patch_concat.entry; // It is much simpler to compile, e.g., `a{2,5}` as: // @@ -856,14 +880,14 @@ impl MaybeInst { } MaybeInst::Split1(goto1) => { MaybeInst::Compiled(Inst::Split(InstSplit { - goto1: goto1, + goto1, goto2: goto, })) } MaybeInst::Split2(goto2) => { MaybeInst::Compiled(Inst::Split(InstSplit { goto1: goto, - goto2: goto2, + goto2, })) } _ => unreachable!( @@ -877,9 +901,7 @@ impl MaybeInst { fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) { let filled = match *self { - MaybeInst::Split => { - Inst::Split(InstSplit { goto1: goto1, goto2: goto2 }) - } + MaybeInst::Split => Inst::Split(InstSplit { goto1, goto2 }), _ => unreachable!( "must be called on Split instruction, \ instead it was called on: {:?}", @@ -937,19 +959,17 @@ enum InstHole { impl InstHole { fn fill(&self, goto: InstPtr) -> Inst { match *self { - 
InstHole::Save { slot } => { - Inst::Save(InstSave { goto: goto, slot: slot }) - } + InstHole::Save { slot } => Inst::Save(InstSave { goto, slot }), InstHole::EmptyLook { look } => { - Inst::EmptyLook(InstEmptyLook { goto: goto, look: look }) + Inst::EmptyLook(InstEmptyLook { goto, look }) } - InstHole::Char { c } => Inst::Char(InstChar { goto: goto, c: c }), + InstHole::Char { c } => Inst::Char(InstChar { goto, c }), InstHole::Ranges { ref ranges } => Inst::Ranges(InstRanges { - goto: goto, + goto, ranges: ranges.clone().into_boxed_slice(), }), InstHole::Bytes { start, end } => { - Inst::Bytes(InstBytes { goto: goto, start: start, end: end }) + Inst::Bytes(InstBytes { goto, start, end }) } } } @@ -1019,7 +1039,7 @@ impl<'a, 'b> CompileClass<'a, 'b> { let mut last_hole = Hole::None; for byte_range in seq { let key = SuffixCacheKey { - from_inst: from_inst, + from_inst, start: byte_range.start, end: byte_range.end, }; @@ -1109,7 +1129,7 @@ impl SuffixCache { } } *pos = self.dense.len(); - self.dense.push(SuffixCacheEntry { key: key, pc: pc }); + self.dense.push(SuffixCacheEntry { key, pc }); None } @@ -1120,8 +1140,8 @@ impl SuffixCache { fn hash(&self, suffix: &SuffixCacheKey) -> usize { // Basic FNV-1a hash as described: // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function - const FNV_PRIME: u64 = 1099511628211; - let mut h = 14695981039346656037; + const FNV_PRIME: u64 = 1_099_511_628_211; + let mut h = 14_695_981_039_346_656_037; h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME); h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME); h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME); @@ -31,7 +31,7 @@ considerably more complex than one might expect out of a DFA. A number of tricks are employed to make it fast. Tread carefully. N.B. 
While this implementation is heavily commented, Russ Cox's series of -articles on regexes is strongly recommended: https://swtch.com/~rsc/regexp/ +articles on regexes is strongly recommended: <https://swtch.com/~rsc/regexp/> (As is the DFA implementation in RE2, which heavily influenced this implementation.) */ @@ -454,10 +454,10 @@ impl<'a> Fsm<'a> { let mut cache = cache.borrow_mut(); let cache = &mut cache.dfa; let mut dfa = Fsm { - prog: prog, + prog, start: 0, // filled in below - at: at, - quit_after_match: quit_after_match, + at, + quit_after_match, last_match_si: STATE_UNKNOWN, last_cache_flush: at, cache: &mut cache.inner, @@ -484,10 +484,10 @@ impl<'a> Fsm<'a> { let mut cache = cache.borrow_mut(); let cache = &mut cache.dfa_reverse; let mut dfa = Fsm { - prog: prog, + prog, start: 0, // filled in below - at: at, - quit_after_match: quit_after_match, + at, + quit_after_match, last_match_si: STATE_UNKNOWN, last_cache_flush: at, cache: &mut cache.inner, @@ -515,9 +515,9 @@ impl<'a> Fsm<'a> { let mut cache = cache.borrow_mut(); let cache = &mut cache.dfa; let mut dfa = Fsm { - prog: prog, + prog, start: 0, // filled in below - at: at, + at, quit_after_match: false, last_match_si: STATE_UNKNOWN, last_cache_flush: at, @@ -1353,7 +1353,6 @@ impl<'a> Fsm<'a> { match self.cache.trans.next(si, self.byte_class(b)) { STATE_UNKNOWN => self.exec_byte(qcur, qnext, si, b), STATE_QUIT => None, - STATE_DEAD => Some(STATE_DEAD), nsi => Some(nsi), } } @@ -1387,7 +1386,6 @@ impl<'a> Fsm<'a> { }; match self.cache.start_states[flagi] { STATE_UNKNOWN => {} - STATE_DEAD => return Some(STATE_DEAD), si => return Some(si), } q.clear(); @@ -1608,11 +1606,7 @@ struct StateMap { impl StateMap { fn new(num_byte_classes: usize) -> StateMap { - StateMap { - map: HashMap::new(), - states: vec![], - num_byte_classes: num_byte_classes, - } + StateMap { map: HashMap::new(), states: vec![], num_byte_classes } } fn len(&self) -> usize { @@ -1648,7 +1642,7 @@ impl Transitions { /// The number of 
byte classes corresponds to the stride. Every state will /// have `num_byte_classes` slots for transitions. fn new(num_byte_classes: usize) -> Transitions { - Transitions { table: vec![], num_byte_classes: num_byte_classes } + Transitions { table: vec![], num_byte_classes } } /// Returns the total number of states currently in this table. @@ -1698,27 +1692,27 @@ impl Transitions { impl StateFlags { fn is_match(&self) -> bool { - self.0 & 0b0000000_1 > 0 + self.0 & 0b0000_0001 > 0 } fn set_match(&mut self) { - self.0 |= 0b0000000_1; + self.0 |= 0b0000_0001; } fn is_word(&self) -> bool { - self.0 & 0b000000_1_0 > 0 + self.0 & 0b0000_0010 > 0 } fn set_word(&mut self) { - self.0 |= 0b000000_1_0; + self.0 |= 0b0000_0010; } fn has_empty(&self) -> bool { - self.0 & 0b00000_1_00 > 0 + self.0 & 0b0000_0100 > 0 } fn set_empty(&mut self) { - self.0 |= 0b00000_1_00; + self.0 |= 0b0000_0100; } } diff --git a/src/exec.rs b/src/exec.rs index d5fad1c..b9abcdc 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -288,10 +288,10 @@ impl ExecBuilder { exprs.push(expr); } Ok(Parsed { - exprs: exprs, + exprs, prefixes: prefixes.unwrap_or_else(Literals::empty), suffixes: suffixes.unwrap_or_else(Literals::empty), - bytes: bytes, + bytes, }) } @@ -311,7 +311,7 @@ impl ExecBuilder { match_type: MatchType::Nothing, }); let pool = ExecReadOnly::new_pool(&ro); - return Ok(Exec { ro: ro, pool }); + return Ok(Exec { ro, pool }); } let parsed = self.parse()?; let mut nfa = Compiler::new() @@ -340,12 +340,12 @@ impl ExecBuilder { let mut ro = ExecReadOnly { res: self.options.pats, - nfa: nfa, - dfa: dfa, - dfa_reverse: dfa_reverse, + nfa, + dfa, + dfa_reverse, suffixes: LiteralSearcher::suffixes(parsed.suffixes), #[cfg(feature = "perf-literal")] - ac: ac, + ac, match_type: MatchType::Nothing, }; ro.match_type = ro.choose_match_type(self.match_type); @@ -459,7 +459,7 @@ impl<'c> RegularExpression for ExecNoSync<'c> { self.cache.value(), true, &text[start..], - text.len(), + text.len() - start, ) { 
dfa::Result::Match(_) => Some(text.len()), dfa::Result::NoMatch(_) => None, @@ -511,7 +511,7 @@ impl<'c> RegularExpression for ExecNoSync<'c> { self.cache.value(), true, &text[start..], - text.len(), + text.len() - start, ) { dfa::Result::Match(_) => true, dfa::Result::NoMatch(_) => false, diff --git a/src/expand.rs b/src/expand.rs index fd9c2d0..67b5149 100644 --- a/src/expand.rs +++ b/src/expand.rs @@ -127,7 +127,7 @@ impl From<usize> for Ref<'static> { /// If no such valid reference could be found, None is returned. fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef<'_>> { let mut i = 0; - let rep: &[u8] = replacement.as_ref(); + let rep: &[u8] = replacement; if rep.len() <= 1 || rep[0] != b'$' { return None; } @@ -136,7 +136,7 @@ fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef<'_>> { return find_cap_ref_braced(rep, i + 1); } let mut cap_end = i; - while rep.get(cap_end).map_or(false, is_valid_cap_letter) { + while rep.get(cap_end).copied().map_or(false, is_valid_cap_letter) { cap_end += 1; } if cap_end == i { @@ -183,8 +183,8 @@ fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef<'_>> { } /// Returns true if and only if the given byte is allowed in a capture name. -fn is_valid_cap_letter(b: &u8) -> bool { - match *b { +fn is_valid_cap_letter(b: u8) -> bool { + match b { b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' => true, _ => false, } diff --git a/src/input.rs b/src/input.rs index 5d50ee3..df6c3e0 100644 --- a/src/input.rs +++ b/src/input.rs @@ -160,7 +160,7 @@ impl<'t> Input for CharInput<'t> { InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 } } else { let c = decode_utf8(&self[i..]).map(|(c, _)| c).into(); - InputAt { pos: i, c: c, byte: None, len: c.len_utf8() } + InputAt { pos: i, c, byte: None, len: c.len_utf8() } } } @@ -231,7 +231,7 @@ pub struct ByteInput<'t> { impl<'t> ByteInput<'t> { /// Return a new byte-based input reader for the given string. 
pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> { - ByteInput { text: text, only_utf8: only_utf8 } + ByteInput { text, only_utf8 } } } @@ -353,6 +353,9 @@ $ the end of text (or end-of-line with multi-line mode) \B not a Unicode word boundary </pre> +The empty regex is valid and matches the empty string. For example, the empty +regex matches `abc` at positions `0`, `1`, `2` and `3`. + ## Grouping and flags <pre class="rust"> @@ -628,7 +631,6 @@ pub use crate::re_builder::unicode::*; #[cfg(feature = "std")] pub use crate::re_set::unicode::*; #[cfg(feature = "std")] -#[cfg(feature = "std")] pub use crate::re_unicode::{ escape, CaptureLocations, CaptureMatches, CaptureNames, Captures, Locations, Match, Matches, NoExpand, Regex, Replacer, ReplacerRef, Split, diff --git a/src/literal/imp.rs b/src/literal/imp.rs index 82f050a..90b2f11 100644 --- a/src/literal/imp.rs +++ b/src/literal/imp.rs @@ -57,10 +57,10 @@ impl LiteralSearcher { fn new(lits: Literals, matcher: Matcher) -> Self { let complete = lits.all_complete(); LiteralSearcher { - complete: complete, + complete, lcp: Memmem::new(lits.longest_common_prefix()), lcs: Memmem::new(lits.longest_common_suffix()), - matcher: matcher, + matcher, } } diff --git a/src/pattern.rs b/src/pattern.rs index b4ffd8e..00549e5 100644 --- a/src/pattern.rs +++ b/src/pattern.rs @@ -15,7 +15,7 @@ impl<'r, 't> Pattern<'t> for &'r Regex { fn into_searcher(self, haystack: &'t str) -> RegexSearcher<'r, 't> { RegexSearcher { - haystack: haystack, + haystack, it: self.find_iter(haystack), last_step_end: 0, next_match: None, diff --git a/src/pikevm.rs b/src/pikevm.rs index 9a14240..8c9eac2 100644 --- a/src/pikevm.rs +++ b/src/pikevm.rs @@ -100,7 +100,7 @@ impl<'r, I: Input> Fsm<'r, I> { cache.clist.resize(prog.len(), prog.captures.len()); cache.nlist.resize(prog.len(), prog.captures.len()); let at = input.at(start); - Fsm { prog: prog, stack: &mut cache.stack, input: input }.exec_( + Fsm { prog, stack: &mut cache.stack, input 
}.exec_( &mut cache.clist, &mut cache.nlist, matches, diff --git a/src/prog.rs b/src/prog.rs index 475a811..c211f71 100644 --- a/src/prog.rs +++ b/src/prog.rs @@ -233,7 +233,7 @@ impl fmt::Debug for Program { if pc == self.start { write!(f, " (start)")?; } - write!(f, "\n")?; + writeln!(f)?; } Ok(()) } diff --git a/src/re_bytes.rs b/src/re_bytes.rs index ae55d6d..07e9f98 100644 --- a/src/re_bytes.rs +++ b/src/re_bytes.rs @@ -53,7 +53,7 @@ impl<'t> Match<'t> { /// Creates a new match from the given haystack and byte offsets. #[inline] fn new(haystack: &'t [u8], start: usize, end: usize) -> Match<'t> { - Match { text: haystack, start: start, end: end } + Match { text: haystack, start, end } } } @@ -255,7 +255,7 @@ impl Regex { pub fn captures<'t>(&self, text: &'t [u8]) -> Option<Captures<'t>> { let mut locs = self.capture_locations(); self.captures_read_at(&mut locs, text, 0).map(move |_| Captures { - text: text, + text, locs: locs.0, named_groups: self.0.capture_name_idx().clone(), }) @@ -496,12 +496,12 @@ impl Regex { let mut new = Vec::with_capacity(text.len()); let mut last_match = 0; for (i, m) in it { - if limit > 0 && i >= limit { - break; - } new.extend_from_slice(&text[last_match..m.start()]); new.extend_from_slice(&rep); last_match = m.end(); + if limit > 0 && i >= limit - 1 { + break; + } } new.extend_from_slice(&text[last_match..]); return Cow::Owned(new); @@ -516,14 +516,14 @@ impl Regex { let mut new = Vec::with_capacity(text.len()); let mut last_match = 0; for (i, cap) in it { - if limit > 0 && i >= limit { - break; - } // unwrap on 0 is OK because captures only reports matches let m = cap.get(0).unwrap(); new.extend_from_slice(&text[last_match..m.start()]); rep.replace_append(&cap, &mut new); last_match = m.end(); + if limit > 0 && i >= limit - 1 { + break; + } } new.extend_from_slice(&text[last_match..]); Cow::Owned(new) @@ -578,7 +578,7 @@ impl Regex { /// context into consideration. For example, the `\A` anchor can only /// match when `start == 0`. 
pub fn is_match_at(&self, text: &[u8], start: usize) -> bool { - self.shortest_match_at(text, start).is_some() + self.0.searcher().is_match_at(text, start) } /// Returns the same as find, but starts the search at the given @@ -723,7 +723,7 @@ impl<'r, 't> Iterator for CaptureMatches<'r, 't> { fn next(&mut self) -> Option<Captures<'t>> { self.0.next().map(|locs| Captures { text: self.0.text(), - locs: locs, + locs, named_groups: self.0.regex().capture_name_idx().clone(), }) } @@ -877,7 +877,7 @@ impl CaptureLocations { self.0.pos(i) } - /// Returns the total number of capturing groups. + /// Returns the total number of capture groups (even if they didn't match). /// /// This is always at least `1` since every regex has at least `1` /// capturing group that corresponds to the entire match. @@ -979,7 +979,7 @@ impl<'t> Captures<'t> { expand_bytes(self, replacement, dst) } - /// Returns the number of captured groups. + /// Returns the total number of capture groups (even if they didn't match). /// /// This is always at least `1`, since every regex has at least one capture /// group that corresponds to the full match. diff --git a/src/re_set.rs b/src/re_set.rs index 73d5953..a6d886d 100644 --- a/src/re_set.rs +++ b/src/re_set.rs @@ -59,13 +59,45 @@ $(#[$doc_regexset_example])* /// 1. Does any regex in the set match? /// 2. If so, which regexes in the set match? /// -/// As with the main `Regex` type, it is cheaper to ask (1) instead of (2) -/// since the matching engines can stop after the first match is found. +/// As with the main [`Regex`][crate::Regex] type, it is cheaper to ask (1) +/// instead of (2) since the matching engines can stop after the first match +/// is found. /// -/// Other features like finding the location of successive matches or their -/// sub-captures aren't supported. 
If you need this functionality, the -/// recommended approach is to compile each regex in the set independently and -/// selectively match them based on which regexes in the set matched. +/// You cannot directly extract [`Match`][crate::Match] or +/// [`Captures`][crate::Captures] objects from a regex set. If you need these +/// operations, the recommended approach is to compile each pattern in the set +/// independently and scan the exact same input a second time with those +/// independently compiled patterns: +/// +/// ```rust +/// use regex::{Regex, RegexSet}; +/// +/// let patterns = ["foo", "bar"]; +/// // Both patterns will match different ranges of this string. +/// let text = "barfoo"; +/// +/// // Compile a set matching any of our patterns. +/// let set = RegexSet::new(&patterns).unwrap(); +/// // Compile each pattern independently. +/// let regexes: Vec<_> = set.patterns().iter() +/// .map(|pat| Regex::new(pat).unwrap()) +/// .collect(); +/// +/// // Match against the whole set first and identify the individual +/// // matching patterns. +/// let matches: Vec<&str> = set.matches(text).into_iter() +/// // Dereference the match index to get the corresponding +/// // compiled pattern. +/// .map(|match_idx| &regexes[match_idx]) +/// // To get match locations or any other info, we then have to search +/// // the exact same text again, using our separately-compiled pattern. +/// .map(|pat| pat.find(text).unwrap().as_str()) +/// .collect(); +/// +/// // Matches arrive in the order the constituent patterns were declared, +/// // not the order they appear in the input. 
+/// assert_eq!(vec!["foo", "bar"], matches); +/// ``` /// /// # Performance /// diff --git a/src/re_trait.rs b/src/re_trait.rs index 680aa54..d0c717d 100644 --- a/src/re_trait.rs +++ b/src/re_trait.rs @@ -74,8 +74,19 @@ impl<'c> Iterator for SubCapturesPosIter<'c> { self.idx += 1; x } + + fn size_hint(&self) -> (usize, Option<usize>) { + let len = self.locs.len() - self.idx; + (len, Some(len)) + } + + fn count(self) -> usize { + self.len() + } } +impl<'c> ExactSizeIterator for SubCapturesPosIter<'c> {} + impl<'c> FusedIterator for SubCapturesPosIter<'c> {} /// `RegularExpression` describes types that can implement regex searching. @@ -139,7 +150,7 @@ pub trait RegularExpression: Sized + fmt::Debug { /// Returns an iterator over all non-overlapping successive leftmost-first /// matches. fn find_iter(self, text: &Self::Text) -> Matches<'_, Self> { - Matches { re: self, text: text, last_end: 0, last_match: None } + Matches { re: self, text, last_end: 0, last_match: None } } /// Returns an iterator over all non-overlapping successive leftmost-first diff --git a/src/re_unicode.rs b/src/re_unicode.rs index 142c78f..197510e 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -61,7 +61,7 @@ impl<'t> Match<'t> { /// Creates a new match from the given haystack and byte offsets. 
#[inline] fn new(haystack: &'t str, start: usize, end: usize) -> Match<'t> { - Match { text: haystack, start: start, end: end } + Match { text: haystack, start, end } } } @@ -129,7 +129,7 @@ impl<'t> From<Match<'t>> for Range<usize> { /// assert!(haystack.contains(&re)); /// assert_eq!(haystack.find(&re), Some(1)); /// assert_eq!(haystack.match_indices(&re).collect::<Vec<_>>(), -/// vec![(1, 4), (5, 8)]); +/// vec![(1, "111"), (5, "222")]); /// assert_eq!(haystack.split(&re).collect::<Vec<_>>(), vec!["a", "b", "c"]); /// ``` #[derive(Clone)] @@ -311,7 +311,7 @@ impl Regex { pub fn captures<'t>(&self, text: &'t str) -> Option<Captures<'t>> { let mut locs = self.capture_locations(); self.captures_read_at(&mut locs, text, 0).map(move |_| Captures { - text: text, + text, locs: locs.0, named_groups: self.0.capture_name_idx().clone(), }) @@ -538,7 +538,7 @@ impl Regex { mut rep: R, ) -> Cow<'t, str> { // If we know that the replacement doesn't have any capture expansions, - // then we can fast path. The fast path can make a tremendous + // then we can use the fast path. The fast path can make a tremendous // difference: // // 1) We use `find_iter` instead of `captures_iter`. 
Not asking for @@ -554,12 +554,12 @@ impl Regex { let mut new = String::with_capacity(text.len()); let mut last_match = 0; for (i, m) in it { - if limit > 0 && i >= limit { - break; - } new.push_str(&text[last_match..m.start()]); new.push_str(&rep); last_match = m.end(); + if limit > 0 && i >= limit - 1 { + break; + } } new.push_str(&text[last_match..]); return Cow::Owned(new); @@ -574,14 +574,14 @@ impl Regex { let mut new = String::with_capacity(text.len()); let mut last_match = 0; for (i, cap) in it { - if limit > 0 && i >= limit { - break; - } // unwrap on 0 is OK because captures only reports matches let m = cap.get(0).unwrap(); new.push_str(&text[last_match..m.start()]); rep.replace_append(&cap, &mut new); last_match = m.end(); + if limit > 0 && i >= limit - 1 { + break; + } } new.push_str(&text[last_match..]); Cow::Owned(new) @@ -636,7 +636,7 @@ impl Regex { /// context into consideration. For example, the `\A` anchor can only /// match when `start == 0`. pub fn is_match_at(&self, text: &str, start: usize) -> bool { - self.shortest_match_at(text, start).is_some() + self.0.searcher_str().is_match_at(text, start) } /// Returns the same as find, but starts the search at the given @@ -887,7 +887,7 @@ impl CaptureLocations { self.0.pos(i) } - /// Returns the total number of capturing groups. + /// Returns the total number of capture groups (even if they didn't match). /// /// This is always at least `1` since every regex has at least `1` /// capturing group that corresponds to the entire match. @@ -989,7 +989,7 @@ impl<'t> Captures<'t> { expand_str(self, replacement, dst) } - /// Returns the number of captured groups. + /// Returns the total number of capture groups (even if they didn't match). /// /// This is always at least `1`, since every regex has at least one capture /// group that corresponds to the full match. 
@@ -1092,8 +1092,18 @@ impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> { .next() .map(|cap| cap.map(|(s, e)| Match::new(self.caps.text, s, e))) } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.it.size_hint() + } + + fn count(self) -> usize { + self.it.count() + } } +impl<'c, 't> ExactSizeIterator for SubCaptureMatches<'c, 't> {} + impl<'c, 't> FusedIterator for SubCaptureMatches<'c, 't> {} /// An iterator that yields all non-overlapping capture groups matching a @@ -1114,7 +1124,7 @@ impl<'r, 't> Iterator for CaptureMatches<'r, 't> { fn next(&mut self) -> Option<Captures<'t>> { self.0.next().map(|locs| Captures { text: self.0.text(), - locs: locs, + locs, named_groups: self.0.regex().capture_name_idx().clone(), }) } diff --git a/src/utf8.rs b/src/utf8.rs index 6e0608f..2dfd2c0 100644 --- a/src/utf8.rs +++ b/src/utf8.rs @@ -108,7 +108,7 @@ pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> { | ((b2 & !TAG_CONT) as u32) << 6 | ((b3 & !TAG_CONT) as u32); match cp { - 0x10000..=0x10FFFF => char::from_u32(cp).map(|cp| (cp, 4)), + 0x10000..=0x0010_FFFF => char::from_u32(cp).map(|cp| (cp, 4)), _ => None, } } diff --git a/tests/regression.rs b/tests/regression.rs index 44b9083..e8b2525 100644 --- a/tests/regression.rs +++ b/tests/regression.rs @@ -217,3 +217,6 @@ matiter!( // https://en.wikipedia.org/wiki/Je_(Cyrillic) ismatch!(empty_group_match, r"()Ј01", "zЈ01", true); matiter!(empty_group_find, r"()Ј01", "zЈ01", (1, 5)); + +// See: https://github.com/rust-lang/regex/issues/862 +mat!(non_greedy_question_literal, r"ab??", "ab", Some((0, 1))); diff --git a/tests/replace.rs b/tests/replace.rs index 1dc6106..d65be07 100644 --- a/tests/replace.rs +++ b/tests/replace.rs @@ -228,3 +228,21 @@ replace!( bytes!(&std::borrow::Cow::<'_, [u8]>::Owned(vec![b'Z'])), "age: Z6" ); + +#[test] +fn replacen_no_captures() { + let re = regex!(r"[0-9]"); + assert_eq!( + re.replacen(text!("age: 1234"), 2, t!("Z")), + text!("age: ZZ34") + ); +} + +#[test] +fn 
replacen_with_captures() { + let re = regex!(r"([0-9])"); + assert_eq!( + re.replacen(text!("age: 1234"), 2, t!("${1}Z")), + text!("age: 1Z2Z34") + ); +} diff --git a/tests/test_default.rs b/tests/test_default.rs index d4365fb..19a319a 100644 --- a/tests/test_default.rs +++ b/tests/test_default.rs @@ -150,3 +150,83 @@ fn regex_is_reasonably_small() { assert_eq!(16, size_of::<bytes::Regex>()); assert_eq!(16, size_of::<bytes::RegexSet>()); } + +// See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8 +// See: CVE-2022-24713 +// +// We test that our regex compiler will correctly return a "too big" error when +// we try to use a very large repetition on an *empty* sub-expression. +// +// At the time this test was written, the regex compiler does not represent +// empty sub-expressions with any bytecode instructions. In effect, it's an +// "optimization" to leave them out, since they would otherwise correspond +// to an unconditional JUMP in the regex bytecode (i.e., an unconditional +// epsilon transition in the NFA graph). Therefore, an empty sub-expression +// represents an interesting case for the compiler's size limits. Since it +// doesn't actually contribute any additional memory to the compiled regex +// instructions, the size limit machinery never detects it. Instead, it just +// dumbly tries to compile the empty sub-expression N times, where N is the +// repetition size. +// +// When N is very large, this will cause the compiler to essentially spin and +// do nothing for a decently large amount of time. It causes the regex to take +// quite a bit of time to compile, despite the concrete syntax of the regex +// being quite small. +// +// The degree to which this is actually a problem is somewhat of a judgment +// call. Some regexes simply take a long time to compile. But in general, you +// should be able to reasonably control this by setting lower or higher size +// limits on the compiled object size. 
But this mitigation doesn't work at all +// for this case. +// +// This particular test is somewhat narrow. It merely checks that regex +// compilation will, at some point, return a "too big" error. Before the +// fix landed, this test would eventually fail because the regex would be +// successfully compiled (after enough time elapsed). So while this test +// doesn't check that we exit in a reasonable amount of time, it does at least +// check that we are properly returning an error at some point. +#[test] +fn big_empty_regex_fails() { + use regex::Regex; + + let result = Regex::new("(?:){4294967295}"); + assert!(result.is_err()); +} + +// Below is a "billion laughs" variant of the previous test case. +#[test] +fn big_empty_reps_chain_regex_fails() { + use regex::Regex; + + let result = Regex::new("(?:){64}{64}{64}{64}{64}{64}"); + assert!(result.is_err()); +} + +// Below is another situation where a zero-length sub-expression can be +// introduced. +#[test] +fn big_zero_reps_regex_fails() { + use regex::Regex; + + let result = Regex::new(r"x{0}{4294967295}"); + assert!(result.is_err()); +} + +// Testing another case for completeness. 
+#[test] +fn empty_alt_regex_fails() { + use regex::Regex; + + let result = Regex::new(r"(?:|){4294967295}"); + assert!(result.is_err()); +} + +// Regression test for: https://github.com/rust-lang/regex/issues/969 +#[test] +fn regression_i969() { + use regex::Regex; + + let re = Regex::new(r"c.*d\z").unwrap(); + assert_eq!(Some(6), re.shortest_match_at("ababcd", 4)); + assert_eq!(Some(6), re.find_at("ababcd", 4).map(|m| m.end())); +} diff --git a/tests/unicode.rs b/tests/unicode.rs index 9f1cd0c..9b32286 100644 --- a/tests/unicode.rs +++ b/tests/unicode.rs @@ -232,3 +232,20 @@ mat!(uni_class_sb2, r"\p{sb=lower}", "\u{0469}", Some((0, 2))); mat!(uni_class_sb3, r"\p{sb=Close}", "\u{FF60}", Some((0, 3))); mat!(uni_class_sb4, r"\p{sb=Close}", "\u{1F677}", Some((0, 4))); mat!(uni_class_sb5, r"\p{sb=SContinue}", "\u{FF64}", Some((0, 3))); + +// Test 'Vithkuqi' support, which was added in Unicode 14. +// See: https://github.com/rust-lang/regex/issues/877 +mat!( + uni_vithkuqi_literal_upper, + r"(?i)^\u{10570}$", + "\u{10570}", + Some((0, 4)) +); +mat!( + uni_vithkuqi_literal_lower, + r"(?i)^\u{10570}$", + "\u{10597}", + Some((0, 4)) +); +mat!(uni_vithkuqi_word_upper, r"^\w$", "\u{10570}", Some((0, 4))); +mat!(uni_vithkuqi_word_lower, r"^\w$", "\u{10597}", Some((0, 4))); |