aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJakub Kotur <qtr@google.com>2021-03-16 18:39:58 +0000
committerAutomerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>2021-03-16 18:39:58 +0000
commit4c04961b3d3a17bb0a8468d2de46f52e65822e24 (patch)
tree4988f2477da88bee252cb61980423d31bb751ead
parent3a92559c10e243d63731e012f7548244d1f352b2 (diff)
parent3bceaeb547a86bf16a96522c89640523e79575cb (diff)
downloadregex-automata-4c04961b3d3a17bb0a8468d2de46f52e65822e24.tar.gz
Initial import of regex-automata-0.1.9. am: 3bceaeb547
Original change: https://android-review.googlesource.com/c/platform/external/rust/crates/regex-automata/+/1621086 Change-Id: Ic4305aa1022b31db5474f5fbde0a5ba729cb7f03
-rw-r--r--.cargo_vcs_info.json5
-rw-r--r--.github/workflows/ci.yml158
-rw-r--r--.gitignore7
-rw-r--r--Cargo.toml90
-rw-r--r--Cargo.toml.orig76
-rw-r--r--README.md222
-rw-r--r--data/fowler-tests/LICENSE19
-rw-r--r--data/fowler-tests/README17
-rw-r--r--data/fowler-tests/basic.dat221
-rw-r--r--data/fowler-tests/nullsubexpr.dat79
-rw-r--r--data/fowler-tests/repetition.dat163
-rw-r--r--data/tests/crazy.toml177
-rw-r--r--data/tests/flags.toml59
-rw-r--r--data/tests/fowler/LICENSE19
-rw-r--r--data/tests/fowler/README23
-rw-r--r--data/tests/fowler/basic.dat221
-rw-r--r--data/tests/fowler/basic.toml1428
-rwxr-xr-xdata/tests/fowler/fowler-to-toml76
-rw-r--r--data/tests/fowler/nullsubexpr.dat79
-rw-r--r--data/tests/fowler/nullsubexpr.toml350
-rw-r--r--data/tests/fowler/repetition-long.dat85
-rw-r--r--data/tests/fowler/repetition-long.toml294
-rw-r--r--data/tests/fowler/repetition.dat83
-rw-r--r--data/tests/fowler/repetition.toml343
-rw-r--r--data/tests/iter.toml92
-rw-r--r--data/tests/no-unicode.toml138
-rw-r--r--data/tests/unicode.toml489
-rw-r--r--rustfmt.toml2
-rw-r--r--src/classes.rs271
-rw-r--r--src/codegen.rs104
-rw-r--r--src/dense.rs2332
-rw-r--r--src/determinize.rs285
-rw-r--r--src/dfa.rs363
-rw-r--r--src/error.rs150
-rw-r--r--src/lib.rs360
-rw-r--r--src/minimize.rs373
-rw-r--r--src/nfa/compiler.rs1193
-rw-r--r--src/nfa/map.rs282
-rw-r--r--src/nfa/mod.rs252
-rw-r--r--src/nfa/range_trie.rs1048
-rw-r--r--src/regex.rs771
-rw-r--r--src/sparse.rs1256
-rw-r--r--src/sparse_set.rs60
-rw-r--r--src/state_id.rs291
-rw-r--r--src/transducer.rs107
-rw-r--r--tests/collection.rs461
-rw-r--r--tests/regression.rs42
-rw-r--r--tests/suite.rs250
-rw-r--r--tests/tests.rs25
-rw-r--r--tests/unescape.rs84
50 files changed, 15375 insertions, 0 deletions
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json
new file mode 100644
index 0000000..7aad2fd
--- /dev/null
+++ b/.cargo_vcs_info.json
@@ -0,0 +1,5 @@
+{
+ "git": {
+ "sha1": "4e0e8ec599e92b115c53ed8d760f7c38bf91891f"
+ }
+}
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..bc98cce
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,158 @@
+name: ci
+on:
+ pull_request:
+ push:
+ branches:
+ - master
+ schedule:
+ - cron: '00 01 * * *'
+jobs:
+ test:
+ name: test
+ env:
+ # For some builds, we use cross to test on 32-bit and big-endian
+ # systems.
+ CARGO: cargo
+ # When CARGO is set to CROSS, TARGET is set to `--target matrix.target`.
+ TARGET:
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ build:
+ - pinned
+ - stable
+ - stable-32
+ - stable-mips
+ - stable-thumb
+ - beta
+ - nightly
+ - macos
+ - win-msvc
+ - win-gnu
+ include:
+ - build: pinned
+ os: ubuntu-18.04
+ rust: 1.28.0
+ - build: stable
+ os: ubuntu-18.04
+ rust: stable
+ - build: stable-32
+ os: ubuntu-18.04
+ rust: stable
+ target: i686-unknown-linux-gnu
+ - build: stable-mips
+ os: ubuntu-18.04
+ rust: stable
+ target: mips64-unknown-linux-gnuabi64
+ - build: stable-thumb
+ os: ubuntu-18.04
+ rust: stable
+ target: thumbv7em-none-eabihf
+ - build: beta
+ os: ubuntu-18.04
+ rust: beta
+ - build: nightly
+ os: ubuntu-18.04
+ rust: nightly
+ - build: macos
+ os: macos-latest
+ rust: stable
+ - build: win-msvc
+ os: windows-2019
+ rust: stable
+ - build: win-gnu
+ os: windows-2019
+ rust: stable-x86_64-gnu
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v1
+ with:
+ fetch-depth: 1
+
+ - name: Install Rust
+ uses: actions-rs/toolchain@v1
+ with:
+ toolchain: ${{ matrix.rust }}
+ profile: minimal
+ override: true
+
+ - name: Use Cross
+ if: matrix.target != ''
+ run: |
+ # FIXME: to work around bugs in latest cross release, install master.
+ # See: https://github.com/rust-embedded/cross/issues/357
+ cargo install --git https://github.com/rust-embedded/cross
+ echo "::set-env name=CARGO::cross"
+ echo "::set-env name=TARGET::--target ${{ matrix.target }}"
+
+ - name: Show command used for Cargo
+ run: |
+ echo "cargo command is: ${{ env.CARGO }}"
+ echo "target flag is: ${{ env.TARGET }}"
+
+ - name: Build
+ if: matrix.build != 'stable-thumb'
+ run: ${{ env.CARGO }} build --verbose ${{ env.TARGET }}
+
+ - name: Build docs
+ if: matrix.build != 'stable-thumb'
+ run: ${{ env.CARGO }} doc --verbose ${{ env.TARGET }}
+
+ # Our dev dependencies are increasing their MSRV more quickly than we want
+ # to, so the following are only run on non-pinned targets.
+
+ - name: Build examples
+ if: matrix.build != 'pinned' && matrix.build != 'stable-thumb'
+ run: ${{ env.CARGO }} build --manifest-path examples/Cargo.toml --examples
+
+ - name: Run tests
+ if: matrix.build != 'pinned' && matrix.build != 'stable-thumb' && matrix.build != 'stable-mips'
+ run: ${{ env.CARGO }} test --verbose --features transducer ${{ env.TARGET }}
+
+ # The mips test runner is quite sluggish, so don't run the full test
+ # suite there. Unfortunate, but CI times balloon otherwise.
+ - name: Run tests
+ if: matrix.build == 'stable-mips'
+ run: ${{ env.CARGO }} test --verbose --features transducer --lib ${{ env.TARGET }}
+
+ - name: Build without default features
+ if: matrix.build != 'pinned'
+ run: ${{ env.CARGO }} build --verbose --no-default-features ${{ env.TARGET }}
+
+ - name: Build docs without default features
+ if: matrix.build != 'pinned'
+ run: ${{ env.CARGO }} doc --verbose --lib --no-default-features ${{ env.TARGET }}
+
+ - name: Run tests without default features
+ if: matrix.build != 'pinned' && matrix.build != 'stable-thumb'
+ run: ${{ env.CARGO }} test --verbose --lib --no-default-features ${{ env.TARGET }}
+
+ - name: Compile debug tool
+ if: matrix.build != 'pinned' && matrix.build != 'stable-thumb'
+ run: ${{ env.CARGO }} build --verbose --manifest-path regex-automata-debug/Cargo.toml ${{ env.TARGET }}
+
+ - name: Test benchmarks
+ if: matrix.build != 'pinned' && matrix.build != 'stable-thumb'
+ run: ${{ env.CARGO }} bench --manifest-path bench/Cargo.toml --verbose ${{ env.TARGET }} -- --test
+
+ rustfmt:
+ name: rustfmt
+ runs-on: ubuntu-18.04
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v1
+ with:
+ fetch-depth: 1
+ - name: Install Rust
+ uses: actions-rs/toolchain@v1
+ with:
+ toolchain: stable
+ override: true
+ profile: minimal
+ components: rustfmt
+ - name: Check formatting
+ run: |
+ cargo fmt --all -- --check
+ - name: Check formatting for debug tool
+ run: |
+ cargo fmt --manifest-path regex-automata-debug/Cargo.toml -- --check
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0f5a363
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+/target
+/examples/target
+/regex-automata-debug/target
+tags
+/Cargo.lock
+/examples/Cargo.lock
+BREADCRUMBS
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..7ef891a
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,90 @@
+# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
+#
+# When uploading crates to the registry Cargo will automatically
+# "normalize" Cargo.toml files for maximal compatibility
+# with all versions of Cargo and also rewrite `path` dependencies
+# to registry (e.g., crates.io) dependencies
+#
+# If you believe there's an error in this file please file an
+# issue against the rust-lang/cargo repository. If you're
+# editing this file be aware that the upstream Cargo.toml
+# will likely look very different (and much more reasonable)
+
+[package]
+name = "regex-automata"
+version = "0.1.9"
+authors = ["Andrew Gallant <jamslam@gmail.com>"]
+exclude = ["/.travis.yml", "/appveyor.yml", "/ci/*", "/scripts/*", "/regex-automata-debug"]
+autoexamples = false
+autotests = false
+description = "Automata construction and matching using regular expressions."
+homepage = "https://github.com/BurntSushi/regex-automata"
+documentation = "https://docs.rs/regex-automata"
+readme = "README.md"
+keywords = ["regex", "dfa", "automata", "automaton", "nfa"]
+categories = ["text-processing"]
+license = "Unlicense/MIT"
+repository = "https://github.com/BurntSushi/regex-automata"
+[profile.bench]
+debug = true
+
+[profile.dev]
+opt-level = 3
+debug = true
+
+[profile.release]
+debug = true
+
+[profile.test]
+opt-level = 3
+debug = true
+
+[lib]
+bench = false
+
+[[test]]
+name = "default"
+path = "tests/tests.rs"
+[dependencies.byteorder]
+version = "1.2.7"
+default-features = false
+
+[dependencies.fst]
+version = "0.4.0"
+optional = true
+
+[dependencies.regex-syntax]
+version = "0.6.16"
+optional = true
+[dev-dependencies.bstr]
+version = "0.2"
+features = ["std"]
+default-features = false
+
+[dev-dependencies.lazy_static]
+version = "1.2.0"
+
+[dev-dependencies.regex]
+version = "1.1"
+
+[dev-dependencies.serde]
+version = "1.0.82"
+
+[dev-dependencies.serde_bytes]
+version = "0.11"
+
+[dev-dependencies.serde_derive]
+version = "1.0.82"
+
+[dev-dependencies.toml]
+version = "0.4.10"
+
+[features]
+default = ["std"]
+std = ["regex-syntax"]
+transducer = ["std", "fst"]
+[badges.appveyor]
+repository = "BurntSushi/regex-automata"
+
+[badges.travis-ci]
+repository = "BurntSushi/regex-automata"
diff --git a/Cargo.toml.orig b/Cargo.toml.orig
new file mode 100644
index 0000000..2f5eb41
--- /dev/null
+++ b/Cargo.toml.orig
@@ -0,0 +1,76 @@
+[package]
+name = "regex-automata"
+version = "0.1.9" #:version
+authors = ["Andrew Gallant <jamslam@gmail.com>"]
+description = "Automata construction and matching using regular expressions."
+documentation = "https://docs.rs/regex-automata"
+homepage = "https://github.com/BurntSushi/regex-automata"
+repository = "https://github.com/BurntSushi/regex-automata"
+readme = "README.md"
+keywords = ["regex", "dfa", "automata", "automaton", "nfa"]
+license = "Unlicense/MIT"
+categories = ["text-processing"]
+exclude = [
+ "/.travis.yml", "/appveyor.yml", "/ci/*", "/scripts/*",
+ "/regex-automata-debug",
+]
+autotests = false
+autoexamples = false
+
+[badges]
+travis-ci = { repository = "BurntSushi/regex-automata" }
+appveyor = { repository = "BurntSushi/regex-automata" }
+
+[workspace]
+members = ["bench"]
+# We'd ideally not do this, but since the debug tool uses Rust 2018, older
+# versions of Rust (such as 1.28) fail to parse the manifest because it treats
+# `edition = "2018"` as an unstable feature.
+#
+# When we move our MSRV to Rust 2018, then we should be able to add this back
+# to the workspace.
+exclude = ["examples", "regex-automata-debug"]
+
+[lib]
+bench = false
+
+[features]
+default = ["std"]
+std = ["regex-syntax"]
+transducer = ["std", "fst"]
+
+[dependencies]
+byteorder = { version = "1.2.7", default-features = false }
+fst = { version = "0.4.0", optional = true }
+regex-syntax = { version = "0.6.16", optional = true }
+
+[dev-dependencies]
+bstr = { version = "0.2", default-features = false, features = ["std"] }
+lazy_static = "1.2.0"
+regex = "1.1"
+serde = "1.0.82"
+serde_bytes = "0.11"
+serde_derive = "1.0.82"
+toml = "0.4.10"
+
+[[test]]
+path = "tests/tests.rs"
+name = "default"
+
+[profile.dev]
+# Running tests takes too long in debug mode, so we forcefully always build
+# with optimizations. Unfortunate, but, ¯\_(ツ)_/¯.
+opt-level = 3
+debug = true
+
+[profile.test]
+# Running tests takes too long in debug mode, so we forcefully always build
+# with optimizations. Unfortunate, but, ¯\_(ツ)_/¯.
+opt-level = 3
+debug = true
+
+[profile.release]
+debug = true
+
+[profile.bench]
+debug = true
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..2acf065
--- /dev/null
+++ b/README.md
@@ -0,0 +1,222 @@
+regex-automata
+==============
+A low level regular expression library that uses deterministic finite automata.
+It supports a rich syntax with Unicode support, has extensive options for
+configuring the best space vs time trade off for your use case and provides
+support for cheap deserialization of automata for use in `no_std` environments.
+
+[![Build status](https://github.com/BurntSushi/regex-automata/workflows/ci/badge.svg)](https://github.com/BurntSushi/regex-automata/actions)
+[![](http://meritbadge.herokuapp.com/regex-automata)](https://crates.io/crates/regex-automata)
+
+Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org).
+
+
+### Documentation
+
+https://docs.rs/regex-automata
+
+
+### Usage
+
+Add this to your `Cargo.toml`:
+
+```toml
+[dependencies]
+regex-automata = "0.1"
+```
+
+and this to your crate root (if you're using Rust 2015):
+
+```rust
+extern crate regex_automata;
+```
+
+
+### Example: basic regex searching
+
+This example shows how to compile a regex using the default configuration
+and then use it to find matches in a byte string:
+
+```rust
+use regex_automata::Regex;
+
+let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
+let text = b"2018-12-24 2016-10-08";
+let matches: Vec<(usize, usize)> = re.find_iter(text).collect();
+assert_eq!(matches, vec![(0, 10), (11, 21)]);
+```
+
+For more examples and information about the various knobs that can be turned,
+please see the [docs](https://docs.rs/regex-automata).
+
+
+### Support for `no_std`
+
+This crate comes with a `std` feature that is enabled by default. When the
+`std` feature is enabled, the API of this crate will include the facilities
+necessary for compiling, serializing, deserializing and searching with regular
+expressions. When the `std` feature is disabled, the API of this crate will
+shrink such that it only includes the facilities necessary for deserializing
+and searching with regular expressions.
+
+The intended workflow for `no_std` environments is thus as follows:
+
+* Write a program with the `std` feature that compiles and serializes a
+ regular expression. Serialization should only happen after first converting
+ the DFAs to use a fixed size state identifier instead of the default `usize`.
+ You may also need to serialize both little and big endian versions of each
+ DFA. (So that's 4 DFAs in total for each regex.)
+* In your `no_std` environment, follow the examples above for deserializing
+ your previously serialized DFAs into regexes. You can then search with them
+ as you would any regex.
+
+Deserialization can happen anywhere. For example, with bytes embedded into a
+binary or with a file memory mapped at runtime.
+
+Note that the
+[`ucd-generate`](https://github.com/BurntSushi/ucd-generate)
+tool will do the first step for you with its `dfa` or `regex` sub-commands.
+
+
+### Cargo features
+
+* `std` - **Enabled** by default. This enables the ability to compile finite
+ automata. This requires the `regex-syntax` dependency. Without this feature
+ enabled, finite automata can only be used for searching (using the approach
+ described above).
+* `transducer` - **Disabled** by default. This provides implementations of the
+ `Automaton` trait found in the `fst` crate. This permits using finite
+ automata generated by this crate to search finite state transducers. This
+ requires the `fst` dependency.
+
+
+### Differences with the regex crate
+
+The main goal of the [`regex`](https://docs.rs/regex) crate is to serve as a
+general purpose regular expression engine. It aims to automatically balance low
+compile times, fast search times and low memory usage, while also providing
+a convenient API for users. In contrast, this crate provides a lower level
+regular expression interface that is a bit less convenient while providing more
+explicit control over memory usage and search times.
+
+Here are some specific negative differences:
+
+* **Compilation can take an exponential amount of time and space** in the size
+ of the regex pattern. While most patterns do not exhibit worst case
+ exponential time, such patterns do exist. For example, `[01]*1[01]{N}` will
+ build a DFA with `2^(N+1)` states. For this reason, untrusted patterns should
+ not be compiled with this library. (In the future, the API may expose an
+ option to return an error if the DFA gets too big.)
+* This crate does not support sub-match extraction, which can be achieved with
+ the regex crate's "captures" API. This may be added in the future, but is
+ unlikely.
+* While the regex crate doesn't necessarily sport fast compilation times, the
+ regexes in this crate are almost universally slow to compile, especially when
+ they contain large Unicode character classes. For example, on my system,
+ compiling `\w{3}` with byte classes enabled takes just over 1 second and
+ almost 5MB of memory! (Compiling a sparse regex takes about the same time
+ but only uses about 500KB of memory.) Conversely, compiling the same regex
+ without Unicode support, e.g., `(?-u)\w{3}`, takes under 1 millisecond and
+ less than 5KB of memory. For this reason, you should only use Unicode
+ character classes if you absolutely need them!
+* This crate does not support regex sets.
+* This crate does not support zero-width assertions such as `^`, `$`, `\b` or
+ `\B`.
+* As a lower level crate, this library does not do literal optimizations. In
+ exchange, you get predictable performance regardless of input. The
+ philosophy here is that literal optimizations should be applied at a higher
+ level, although there is no easy support for this in the ecosystem yet.
+* There is no `&str` API like in the regex crate. In this crate, all APIs
+ operate on `&[u8]`. By default, match indices are guaranteed to fall on
+ UTF-8 boundaries, unless `RegexBuilder::allow_invalid_utf8` is enabled.
+
+With some of the downsides out of the way, here are some positive differences:
+
+* Both dense and sparse DFAs can be serialized to raw bytes, and then cheaply
+ deserialized. Deserialization always takes constant time since searching can
+ be performed directly on the raw serialized bytes of a DFA.
+* This crate was specifically designed so that the searching phase of a DFA has
+ minimal runtime requirements, and can therefore be used in `no_std`
+ environments. While `no_std` environments cannot compile regexes, they can
+ deserialize pre-compiled regexes.
+* Since this crate builds DFAs ahead of time, it will generally out-perform
+ the `regex` crate on equivalent tasks. The performance difference is likely
+ not large. However, because of a complex set of optimizations in the regex
+ crate (like literal optimizations), an accurate performance comparison may be
+ difficult to do.
+* Sparse DFAs provide a way to build a DFA ahead of time that sacrifices search
+ performance a small amount, but uses much less storage space. Potentially
+ even less than what the regex crate uses.
+* This crate exposes DFAs directly, such as `DenseDFA` and `SparseDFA`,
+ which enables one to do less work in some cases. For example, if you only
+ need the end of a match and not the start of a match, then you can use a DFA
+ directly without building a `Regex`, which always requires a second DFA to
+ find the start of a match.
+* Aside from choosing between dense and sparse DFAs, there are several options
+ for configuring the space usage vs search time trade off. These include
+ things like choosing a smaller state identifier representation, to
+ premultiplying state identifiers and splitting a DFA's alphabet into
+ equivalence classes. Finally, DFA minimization is also provided, but can
+ increase compilation times dramatically.
+
+
+### Future work
+
+* Look into being smarter about generating NFA states for large Unicode
+ character classes. These can create a lot of additional work for both the
+ determinizer and the minimizer, and I suspect this is the key thing we'll
+ want to improve if we want to make DFA compile times faster. I *believe*
+ it's possible to potentially build minimal or nearly minimal NFAs for the
+ special case of Unicode character classes by leveraging Daciuk's algorithms
+ for building minimal automata in linear time for sets of strings. See
+ https://blog.burntsushi.net/transducers/#construction for more details. The
+ key adaptation I think we need to make is to modify the algorithm to operate
+ on byte ranges instead of enumerating every codepoint in the set. Otherwise,
+ it might not be worth doing.
+* Add support for regex sets. It should be possible to do this by "simply"
+ introducing more match states. I think we can also report the positions at
+ each match, similar to how Aho-Corasick works. I think the long pole in the
+ tent here is probably the API design work and arranging it so that we don't
+ introduce extra overhead into the non-regex-set case without duplicating a
+ lot of code. It seems doable.
+* Stretch goal: support capturing groups by implementing "tagged" DFA
+ (transducers). Laurikari's paper is the usual reference here, but Trofimovich
+ has a much more thorough treatment here:
+ http://re2c.org/2017_trofimovich_tagged_deterministic_finite_automata_with_lookahead.pdf
+ I've only read the paper once. I suspect it will require at least a few more
+ read throughs before I understand it.
+ See also: http://re2c.org/
+* Possibly less ambitious goal: can we select a portion of Trofimovich's work
+ to make small fixed length look-around work? It would be really nice to
+ support ^, $ and \b, especially the Unicode variant of \b and CRLF aware $.
+* Experiment with code generating Rust code. There is an early experiment in
+ src/codegen.rs that is thoroughly bit-rotted. At the time, I was
+ experimenting with whether or not codegen would significantly decrease the size
+ of a DFA, since if you squint hard enough, it's kind of like a sparse
+ representation. However, it didn't shrink as much as I thought it would, so
+ I gave up. The other problem is that Rust doesn't support gotos, so I don't
+ even know whether the "match on each state" in a loop thing will be fast
+ enough. Either way, it's probably a good option to have. For one thing, it
+ would be endian independent where as the serialization format of the DFAs in
+ this crate are endian dependent (so you need two versions of every DFA, but
+ you only need to compile one of them for any given arch).
+* Experiment with unrolling the match loops and fill out the benchmarks.
+* Add some kind of streaming API. I believe users of the library can already
+ implement something for this outside of the crate, but it would be good to
+ provide an official API. The key thing here is figuring out the API. I
+ suspect we might want to support several variants.
+* Make a decision on whether or not there is room for literal optimizations
+ in this crate. My original intent was to not let this crate sink down into
+ that very very very deep rabbit hole. But instead, we might want to provide
+ some way for literal optimizations to hook into the match routines. The right
+ path forward here is to probably build something outside of the crate and
+ then see about integrating it. After all, users can implement their own
+ match routines just as efficiently as what the crate provides.
+* A key downside of DFAs is that they can take up a lot of memory and can be
+ quite costly to build. Their worst case compilation time is O(2^n), where
+ n is the number of NFA states. A paper by Yang and Prasanna (2011) actually
+ seems to provide a way to characterize state blow up such that it is detectable.
+ If we could know whether a regex will exhibit state explosion or not, then
+ we could make an intelligent decision about whether to ahead-of-time compile
+ a DFA.
+ See: https://www.researchgate.net/profile/XU_Shutu/publication/229032602_Characterization_of_a_global_germplasm_collection_and_its_potential_utilization_for_analysis_of_complex_quantitative_traits_in_maize/links/02bfe50f914d04c837000000.pdf
diff --git a/data/fowler-tests/LICENSE b/data/fowler-tests/LICENSE
new file mode 100644
index 0000000..f47dbf4
--- /dev/null
+++ b/data/fowler-tests/LICENSE
@@ -0,0 +1,19 @@
+The following license covers testregex.c and all associated test data.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of THIS SOFTWARE FILE (the "Software"), to deal in the Software
+without restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, and/or sell copies of the
+Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following disclaimer:
+
+THIS SOFTWARE IS PROVIDED BY AT&T ``AS IS'' AND ANY EXPRESS OR IMPLIED
+WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL AT&T BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/data/fowler-tests/README b/data/fowler-tests/README
new file mode 100644
index 0000000..6efc2da
--- /dev/null
+++ b/data/fowler-tests/README
@@ -0,0 +1,17 @@
+Test data was taken from the Go distribution, which was in turn taken from the
+testregex test suite:
+
+ http://www2.research.att.com/~astopen/testregex/testregex.html
+
+The LICENSE in this directory corresponds to the LICENSE that the data was
+released under.
+
+The tests themselves were modified for RE2/Go. A couple were modified further
+by me (Andrew Gallant) (only in repetition.dat) so that RE2/Go would pass them.
+(Yes, it seems like RE2/Go includes failing test cases.) This may or may not
+have been a bad idea, but I think being consistent with an established Regex
+library is worth something.
+
+Note that these files are read by 'scripts/regex-match-tests.py' and turned
+into Rust tests found in 'regex_macros/tests/matches.rs'.
+
diff --git a/data/fowler-tests/basic.dat b/data/fowler-tests/basic.dat
new file mode 100644
index 0000000..e55efae
--- /dev/null
+++ b/data/fowler-tests/basic.dat
@@ -0,0 +1,221 @@
+NOTE all standard compliant implementations should pass these : 2002-05-31
+
+BE abracadabra$ abracadabracadabra (7,18)
+BE a...b abababbb (2,7)
+BE XXXXXX ..XXXXXX (2,8)
+E \) () (1,2)
+BE a] a]a (0,2)
+B } } (0,1)
+E \} } (0,1)
+BE \] ] (0,1)
+B ] ] (0,1)
+E ] ] (0,1)
+B { { (0,1)
+B } } (0,1)
+BE ^a ax (0,1)
+BE \^a a^a (1,3)
+BE a\^ a^ (0,2)
+BE a$ aa (1,2)
+BE a\$ a$ (0,2)
+BE ^$ NULL (0,0)
+E $^ NULL (0,0)
+E a($) aa (1,2)(2,2)
+E a*(^a) aa (0,1)(0,1)
+E (..)*(...)* a (0,0)
+E (..)*(...)* abcd (0,4)(2,4)
+E (ab|a)(bc|c) abc (0,3)(0,2)(2,3)
+E (ab)c|abc abc (0,3)(0,2)
+E a{0}b ab (1,2)
+E (a*)(b?)(b+)b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7)
+E (a*)(b{0,1})(b{1,})b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7)
+E a{9876543210} NULL BADBR
+E ((a|a)|a) a (0,1)(0,1)(0,1)
+E (a*)(a|aa) aaaa (0,4)(0,3)(3,4)
+E a*(a.|aa) aaaa (0,4)(2,4)
+E a(b)|c(d)|a(e)f aef (0,3)(?,?)(?,?)(1,2)
+E (a|b)?.* b (0,1)(0,1)
+E (a|b)c|a(b|c) ac (0,2)(0,1)
+E (a|b)c|a(b|c) ab (0,2)(?,?)(1,2)
+E (a|b)*c|(a|ab)*c abc (0,3)(1,2)
+E (a|b)*c|(a|ab)*c xc (1,2)
+E (.a|.b).*|.*(.a|.b) xa (0,2)(0,2)
+E a?(ab|ba)ab abab (0,4)(0,2)
+E a?(ac{0}b|ba)ab abab (0,4)(0,2)
+E ab|abab abbabab (0,2)
+E aba|bab|bba baaabbbaba (5,8)
+E aba|bab baaabbbaba (6,9)
+E (aa|aaa)*|(a|aaaaa) aa (0,2)(0,2)
+E (a.|.a.)*|(a|.a...) aa (0,2)(0,2)
+E ab|a xabc (1,3)
+E ab|a xxabc (2,4)
+Ei (Ab|cD)* aBcD (0,4)(2,4)
+BE [^-] --a (2,3)
+BE [a-]* --a (0,3)
+BE [a-m-]* --amoma-- (0,4)
+E :::1:::0:|:::1:1:0: :::0:::1:::1:::0: (8,17)
+E :::1:::0:|:::1:1:1: :::0:::1:::1:::0: (8,17)
+{E [[:upper:]] A (0,1) [[<element>]] not supported
+E [[:lower:]]+ `az{ (1,3)
+E [[:upper:]]+ @AZ[ (1,3)
+# No collation in Go
+#BE [[-]] [[-]] (2,4)
+#BE [[.NIL.]] NULL ECOLLATE
+#BE [[=aleph=]] NULL ECOLLATE
+}
+BE$ \n \n (0,1)
+BEn$ \n \n (0,1)
+BE$ [^a] \n (0,1)
+BE$ \na \na (0,2)
+E (a)(b)(c) abc (0,3)(0,1)(1,2)(2,3)
+BE xxx xxx (0,3)
+E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 6, (0,6)
+E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) 2/7 (0,3)
+E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 1,Feb 6 (5,11)
+E3 ((((((((((((((((((((((((((((((x)))))))))))))))))))))))))))))) x (0,1)(0,1)(0,1)
+E3 ((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))* xx (0,2)(1,2)(1,2)
+E a?(ab|ba)* ababababababababababababababababababababababababababababababababababababababababa (0,81)(79,81)
+E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabbbbaa (18,25)
+E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabaa (18,22)
+E aaac|aabc|abac|abbc|baac|babc|bbac|bbbc baaabbbabac (7,11)
+BE$ .* \x01\x7f (0,2)
+E aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa (53,57)
+L aaaa\nbbbb\ncccc\nddddd\neeeeee\nfffffff\ngggg\nhhhh\niiiii\njjjjj\nkkkkk\nllll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa NOMATCH
+E a*a*a*a*a*b aaaaaaaaab (0,10)
+BE ^ NULL (0,0)
+BE $ NULL (0,0)
+BE ^$ NULL (0,0)
+BE ^a$ a (0,1)
+BE abc abc (0,3)
+BE abc xabcy (1,4)
+BE abc ababc (2,5)
+BE ab*c abc (0,3)
+BE ab*bc abc (0,3)
+BE ab*bc abbc (0,4)
+BE ab*bc abbbbc (0,6)
+E ab+bc abbc (0,4)
+E ab+bc abbbbc (0,6)
+E ab?bc abbc (0,4)
+E ab?bc abc (0,3)
+E ab?c abc (0,3)
+BE ^abc$ abc (0,3)
+BE ^abc abcc (0,3)
+BE abc$ aabc (1,4)
+BE ^ abc (0,0)
+BE $ abc (3,3)
+BE a.c abc (0,3)
+BE a.c axc (0,3)
+BE a.*c axyzc (0,5)
+BE a[bc]d abd (0,3)
+BE a[b-d]e ace (0,3)
+BE a[b-d] aac (1,3)
+BE a[-b] a- (0,2)
+BE a[b-] a- (0,2)
+BE a] a] (0,2)
+BE a[]]b a]b (0,3)
+BE a[^bc]d aed (0,3)
+BE a[^-b]c adc (0,3)
+BE a[^]b]c adc (0,3)
+E ab|cd abc (0,2)
+E ab|cd abcd (0,2)
+E a\(b a(b (0,3)
+E a\(*b ab (0,2)
+E a\(*b a((b (0,4)
+E ((a)) abc (0,1)(0,1)(0,1)
+E (a)b(c) abc (0,3)(0,1)(2,3)
+E a+b+c aabbabc (4,7)
+E a* aaa (0,3)
+#E (a*)* - (0,0)(0,0)
+E (a*)* - (0,0)(?,?) RE2/Go
+E (a*)+ - (0,0)(0,0)
+#E (a*|b)* - (0,0)(0,0)
+E (a*|b)* - (0,0)(?,?) RE2/Go
+E (a+|b)* ab (0,2)(1,2)
+E (a+|b)+ ab (0,2)(1,2)
+E (a+|b)? ab (0,1)(0,1)
+BE [^ab]* cde (0,3)
+#E (^)* - (0,0)(0,0)
+E (^)* - (0,0)(?,?) RE2/Go
+BE a* NULL (0,0)
+E ([abc])*d abbbcd (0,6)(4,5)
+E ([abc])*bcd abcd (0,4)(0,1)
+E a|b|c|d|e e (0,1)
+E (a|b|c|d|e)f ef (0,2)(0,1)
+#E ((a*|b))* - (0,0)(0,0)(0,0)
+E ((a*|b))* - (0,0)(?,?)(?,?) RE2/Go
+BE abcd*efg abcdefg (0,7)
+BE ab* xabyabbbz (1,3)
+BE ab* xayabbbz (1,2)
+E (ab|cd)e abcde (2,5)(2,4)
+BE [abhgefdc]ij hij (0,3)
+E (a|b)c*d abcd (1,4)(1,2)
+E (ab|ab*)bc abc (0,3)(0,1)
+E a([bc]*)c* abc (0,3)(1,3)
+E a([bc]*)(c*d) abcd (0,4)(1,3)(3,4)
+E a([bc]+)(c*d) abcd (0,4)(1,3)(3,4)
+E a([bc]*)(c+d) abcd (0,4)(1,2)(2,4)
+E a[bcd]*dcdcde adcdcde (0,7)
+E (ab|a)b*c abc (0,3)(0,2)
+E ((a)(b)c)(d) abcd (0,4)(0,3)(0,1)(1,2)(3,4)
+BE [A-Za-z_][A-Za-z0-9_]* alpha (0,5)
+E ^a(bc+|b[eh])g|.h$ abh (1,3)
+E (bc+d$|ef*g.|h?i(j|k)) effgz (0,5)(0,5)
+E (bc+d$|ef*g.|h?i(j|k)) ij (0,2)(0,2)(1,2)
+E (bc+d$|ef*g.|h?i(j|k)) reffgz (1,6)(1,6)
+E (((((((((a))))))))) a (0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)
+BE multiple words multiple words yeah (0,14)
+E (.*)c(.*) abcde (0,5)(0,2)(3,5)
+BE abcd abcd (0,4)
+E a(bc)d abcd (0,4)(1,3)
+E a[-]?c ac (0,3)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qaddafi (0,15)(?,?)(10,12)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mo'ammar Gadhafi (0,16)(?,?)(11,13)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Kaddafi (0,15)(?,?)(10,12)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qadhafi (0,15)(?,?)(10,12)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gadafi (0,14)(?,?)(10,11)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadafi (0,15)(?,?)(11,12)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moamar Gaddafi (0,14)(?,?)(9,11)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadhdhafi (0,18)(?,?)(13,15)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Khaddafi (0,16)(?,?)(11,13)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafy (0,16)(?,?)(11,13)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghadafi (0,15)(?,?)(11,12)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafi (0,16)(?,?)(11,13)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muamar Kaddafi (0,14)(?,?)(9,11)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Quathafi (0,16)(?,?)(11,13)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gheddafi (0,16)(?,?)(11,13)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Khadafy (0,15)(?,?)(11,12)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Qudhafi (0,15)(?,?)(10,12)
+E a+(b|c)*d+ aabcdd (0,6)(3,4)
+E ^.+$ vivi (0,4)
+E ^(.+)$ vivi (0,4)(0,4)
+E ^([^!.]+).att.com!(.+)$ gryphon.att.com!eby (0,19)(0,7)(16,19)
+E ^([^!]+!)?([^!]+)$ bas (0,3)(?,?)(0,3)
+E ^([^!]+!)?([^!]+)$ bar!bas (0,7)(0,4)(4,7)
+E ^([^!]+!)?([^!]+)$ foo!bas (0,7)(0,4)(4,7)
+E ^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(4,8)(8,11)
+E ((foo)|(bar))!bas bar!bas (0,7)(0,3)(?,?)(0,3)
+E ((foo)|(bar))!bas foo!bar!bas (4,11)(4,7)(?,?)(4,7)
+E ((foo)|(bar))!bas foo!bas (0,7)(0,3)(0,3)
+E ((foo)|bar)!bas bar!bas (0,7)(0,3)
+E ((foo)|bar)!bas foo!bar!bas (4,11)(4,7)
+E ((foo)|bar)!bas foo!bas (0,7)(0,3)(0,3)
+E (foo|(bar))!bas bar!bas (0,7)(0,3)(0,3)
+E (foo|(bar))!bas foo!bar!bas (4,11)(4,7)(4,7)
+E (foo|(bar))!bas foo!bas (0,7)(0,3)
+E (foo|bar)!bas bar!bas (0,7)(0,3)
+E (foo|bar)!bas foo!bar!bas (4,11)(4,7)
+E (foo|bar)!bas foo!bas (0,7)(0,3)
+E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
+E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bas (0,3)(?,?)(0,3)
+E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bar!bas (0,7)(0,4)(4,7)
+E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(?,?)(?,?)(4,8)(8,11)
+E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bas (0,7)(0,4)(4,7)
+E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bas (0,3)(0,3)(?,?)(0,3)
+E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bar!bas (0,7)(0,7)(0,4)(4,7)
+E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
+E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bas (0,7)(0,7)(0,4)(4,7)
+E .*(/XXX).* /XXX (0,4)(0,4)
+E .*(\\XXX).* \XXX (0,4)(0,4)
+E \\XXX \XXX (0,4)
+E .*(/000).* /000 (0,4)(0,4)
+E .*(\\000).* \000 (0,4)(0,4)
+E \\000 \000 (0,4)
diff --git a/data/fowler-tests/nullsubexpr.dat b/data/fowler-tests/nullsubexpr.dat
new file mode 100644
index 0000000..2e18fbb
--- /dev/null
+++ b/data/fowler-tests/nullsubexpr.dat
@@ -0,0 +1,79 @@
+NOTE null subexpression matches : 2002-06-06
+
+E (a*)* a (0,1)(0,1)
+#E SAME x (0,0)(0,0)
+E SAME x (0,0)(?,?) RE2/Go
+E SAME aaaaaa (0,6)(0,6)
+E SAME aaaaaax (0,6)(0,6)
+E (a*)+ a (0,1)(0,1)
+E SAME x (0,0)(0,0)
+E SAME aaaaaa (0,6)(0,6)
+E SAME aaaaaax (0,6)(0,6)
+E (a+)* a (0,1)(0,1)
+E SAME x (0,0)
+E SAME aaaaaa (0,6)(0,6)
+E SAME aaaaaax (0,6)(0,6)
+E (a+)+ a (0,1)(0,1)
+E SAME x NOMATCH
+E SAME aaaaaa (0,6)(0,6)
+E SAME aaaaaax (0,6)(0,6)
+
+E ([a]*)* a (0,1)(0,1)
+#E SAME x (0,0)(0,0)
+E SAME x (0,0)(?,?) RE2/Go
+E SAME aaaaaa (0,6)(0,6)
+E SAME aaaaaax (0,6)(0,6)
+E ([a]*)+ a (0,1)(0,1)
+E SAME x (0,0)(0,0)
+E SAME aaaaaa (0,6)(0,6)
+E SAME aaaaaax (0,6)(0,6)
+E ([^b]*)* a (0,1)(0,1)
+#E SAME b (0,0)(0,0)
+E SAME b (0,0)(?,?) RE2/Go
+E SAME aaaaaa (0,6)(0,6)
+E SAME aaaaaab (0,6)(0,6)
+E ([ab]*)* a (0,1)(0,1)
+E SAME aaaaaa (0,6)(0,6)
+E SAME ababab (0,6)(0,6)
+E SAME bababa (0,6)(0,6)
+E SAME b (0,1)(0,1)
+E SAME bbbbbb (0,6)(0,6)
+E SAME aaaabcde (0,5)(0,5)
+E ([^a]*)* b (0,1)(0,1)
+E SAME bbbbbb (0,6)(0,6)
+#E SAME aaaaaa (0,0)(0,0)
+E SAME aaaaaa (0,0)(?,?) RE2/Go
+E ([^ab]*)* ccccxx (0,6)(0,6)
+#E SAME ababab (0,0)(0,0)
+E SAME ababab (0,0)(?,?) RE2/Go
+
+E ((z)+|a)* zabcde (0,2)(1,2)
+
+#{E a+? aaaaaa (0,1) no *? +? minimal match ops
+#E (a) aaa (0,1)(0,1)
+#E (a*?) aaa (0,0)(0,0)
+#E (a)*? aaa (0,0)
+#E (a*?)*? aaa (0,0)
+#}
+
+B \(a*\)*\(x\) x (0,1)(0,0)(0,1)
+B \(a*\)*\(x\) ax (0,2)(0,1)(1,2)
+B \(a*\)*\(x\) axa (0,2)(0,1)(1,2)
+B \(a*\)*\(x\)\(\1\) x (0,1)(0,0)(0,1)(1,1)
+B \(a*\)*\(x\)\(\1\) ax (0,2)(1,1)(1,2)(2,2)
+B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3)
+B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4)
+B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3)
+
+#E (a*)*(x) x (0,1)(0,0)(0,1)
+E (a*)*(x) x (0,1)(?,?)(0,1) RE2/Go
+E (a*)*(x) ax (0,2)(0,1)(1,2)
+E (a*)*(x) axa (0,2)(0,1)(1,2)
+
+E (a*)+(x) x (0,1)(0,0)(0,1)
+E (a*)+(x) ax (0,2)(0,1)(1,2)
+E (a*)+(x) axa (0,2)(0,1)(1,2)
+
+E (a*){2}(x) x (0,1)(0,0)(0,1)
+E (a*){2}(x) ax (0,2)(1,1)(1,2)
+E (a*){2}(x) axa (0,2)(1,1)(1,2)
diff --git a/data/fowler-tests/repetition.dat b/data/fowler-tests/repetition.dat
new file mode 100644
index 0000000..3bb2121
--- /dev/null
+++ b/data/fowler-tests/repetition.dat
@@ -0,0 +1,163 @@
+NOTE implicit vs. explicit repetitions : 2009-02-02
+
+# Glenn Fowler <gsf@research.att.com>
+# conforming matches (column 4) must match one of the following BREs
+# NOMATCH
+# (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)*
+# (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)*
+# i.e., each 3-tuple has two identical elements and one (?,?)
+
+E ((..)|(.)) NULL NOMATCH
+E ((..)|(.))((..)|(.)) NULL NOMATCH
+E ((..)|(.))((..)|(.))((..)|(.)) NULL NOMATCH
+
+E ((..)|(.)){1} NULL NOMATCH
+E ((..)|(.)){2} NULL NOMATCH
+E ((..)|(.)){3} NULL NOMATCH
+
+E ((..)|(.))* NULL (0,0)
+
+E ((..)|(.)) a (0,1)(0,1)(?,?)(0,1)
+E ((..)|(.))((..)|(.)) a NOMATCH
+E ((..)|(.))((..)|(.))((..)|(.)) a NOMATCH
+
+E ((..)|(.)){1} a (0,1)(0,1)(?,?)(0,1)
+E ((..)|(.)){2} a NOMATCH
+E ((..)|(.)){3} a NOMATCH
+
+E ((..)|(.))* a (0,1)(0,1)(?,?)(0,1)
+
+E ((..)|(.)) aa (0,2)(0,2)(0,2)(?,?)
+E ((..)|(.))((..)|(.)) aa (0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)
+E ((..)|(.))((..)|(.))((..)|(.)) aa NOMATCH
+
+E ((..)|(.)){1} aa (0,2)(0,2)(0,2)(?,?)
+E ((..)|(.)){2} aa (0,2)(1,2)(?,?)(1,2)
+E ((..)|(.)){3} aa NOMATCH
+
+E ((..)|(.))* aa (0,2)(0,2)(0,2)(?,?)
+
+E ((..)|(.)) aaa (0,2)(0,2)(0,2)(?,?)
+E ((..)|(.))((..)|(.)) aaa (0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)
+E ((..)|(.))((..)|(.))((..)|(.)) aaa (0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3)
+
+E ((..)|(.)){1} aaa (0,2)(0,2)(0,2)(?,?)
+#E ((..)|(.)){2} aaa (0,3)(2,3)(?,?)(2,3)
+E ((..)|(.)){2} aaa (0,3)(2,3)(0,2)(2,3) RE2/Go
+E ((..)|(.)){3} aaa (0,3)(2,3)(?,?)(2,3)
+
+#E ((..)|(.))* aaa (0,3)(2,3)(?,?)(2,3)
+E ((..)|(.))* aaa (0,3)(2,3)(0,2)(2,3) RE2/Go
+
+E ((..)|(.)) aaaa (0,2)(0,2)(0,2)(?,?)
+E ((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
+E ((..)|(.))((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4)
+
+E ((..)|(.)){1} aaaa (0,2)(0,2)(0,2)(?,?)
+E ((..)|(.)){2} aaaa (0,4)(2,4)(2,4)(?,?)
+#E ((..)|(.)){3} aaaa (0,4)(3,4)(?,?)(3,4)
+E ((..)|(.)){3} aaaa (0,4)(3,4)(0,2)(3,4) RE2/Go
+
+E ((..)|(.))* aaaa (0,4)(2,4)(2,4)(?,?)
+
+E ((..)|(.)) aaaaa (0,2)(0,2)(0,2)(?,?)
+E ((..)|(.))((..)|(.)) aaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
+E ((..)|(.))((..)|(.))((..)|(.)) aaaaa (0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5)
+
+E ((..)|(.)){1} aaaaa (0,2)(0,2)(0,2)(?,?)
+E ((..)|(.)){2} aaaaa (0,4)(2,4)(2,4)(?,?)
+#E ((..)|(.)){3} aaaaa (0,5)(4,5)(?,?)(4,5)
+E ((..)|(.)){3} aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go
+
+#E ((..)|(.))* aaaaa (0,5)(4,5)(?,?)(4,5)
+E ((..)|(.))* aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go
+
+E ((..)|(.)) aaaaaa (0,2)(0,2)(0,2)(?,?)
+E ((..)|(.))((..)|(.)) aaaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
+E ((..)|(.))((..)|(.))((..)|(.)) aaaaaa (0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?)
+
+E ((..)|(.)){1} aaaaaa (0,2)(0,2)(0,2)(?,?)
+E ((..)|(.)){2} aaaaaa (0,4)(2,4)(2,4)(?,?)
+E ((..)|(.)){3} aaaaaa (0,6)(4,6)(4,6)(?,?)
+
+E ((..)|(.))* aaaaaa (0,6)(4,6)(4,6)(?,?)
+
+NOTE additional repetition tests graciously provided by Chris Kuklewicz www.haskell.org 2009-02-02
+
+# These test a bug in OS X / FreeBSD / NetBSD, and libtree.
+# Linux/GLIBC gets the {8,} and {8,8} wrong.
+
+:HA#100:E X(.?){0,}Y X1234567Y (0,9)(7,8)
+:HA#101:E X(.?){1,}Y X1234567Y (0,9)(7,8)
+:HA#102:E X(.?){2,}Y X1234567Y (0,9)(7,8)
+:HA#103:E X(.?){3,}Y X1234567Y (0,9)(7,8)
+:HA#104:E X(.?){4,}Y X1234567Y (0,9)(7,8)
+:HA#105:E X(.?){5,}Y X1234567Y (0,9)(7,8)
+:HA#106:E X(.?){6,}Y X1234567Y (0,9)(7,8)
+:HA#107:E X(.?){7,}Y X1234567Y (0,9)(7,8)
+:HA#108:E X(.?){8,}Y X1234567Y (0,9)(8,8)
+#:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(7,8)
+:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(8,8) RE2/Go
+#:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(7,8)
+:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(8,8) RE2/Go
+#:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(7,8)
+:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(8,8) RE2/Go
+#:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(7,8)
+:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(8,8) RE2/Go
+#:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(7,8)
+:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(8,8) RE2/Go
+#:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(7,8)
+:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(8,8) RE2/Go
+#:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(7,8)
+:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(8,8) RE2/Go
+#:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(7,8)
+:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(8,8) RE2/Go
+:HA#118:E X(.?){8,8}Y X1234567Y (0,9)(8,8)
+
+# These test a fixed bug in my regex-tdfa that did not keep the expanded
+# form properly grouped, so right association did the wrong thing with
+# these ambiguous patterns (crafted just to test my code when I became
+# suspicious of my implementation). The first subexpression should use
+# "ab" then "a" then "bcd".
+
+# OS X / FreeBSD / NetBSD badly fail many of these, with impossible
+# results like (0,6)(4,5)(6,6).
+
+:HA#260:E (a|ab|c|bcd){0,}(d*) ababcd (0,1)(0,1)(1,1)
+:HA#261:E (a|ab|c|bcd){1,}(d*) ababcd (0,1)(0,1)(1,1)
+:HA#262:E (a|ab|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#263:E (a|ab|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#264:E (a|ab|c|bcd){4,}(d*) ababcd NOMATCH
+:HA#265:E (a|ab|c|bcd){0,10}(d*) ababcd (0,1)(0,1)(1,1)
+:HA#266:E (a|ab|c|bcd){1,10}(d*) ababcd (0,1)(0,1)(1,1)
+:HA#267:E (a|ab|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#268:E (a|ab|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#269:E (a|ab|c|bcd){4,10}(d*) ababcd NOMATCH
+:HA#270:E (a|ab|c|bcd)*(d*) ababcd (0,1)(0,1)(1,1)
+:HA#271:E (a|ab|c|bcd)+(d*) ababcd (0,1)(0,1)(1,1)
+
+# The above worked on Linux/GLIBC but the following often fail.
+# They also trip up OS X / FreeBSD / NetBSD:
+
+#:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
+#:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
+#:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
+#:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
+:HA#284:E (ab|a|c|bcd){4,}(d*) ababcd NOMATCH
+#:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
+#:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
+#:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
+#:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
+:HA#289:E (ab|a|c|bcd){4,10}(d*) ababcd NOMATCH
+#:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6)
+:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
+#:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6)
+:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
diff --git a/data/tests/crazy.toml b/data/tests/crazy.toml
new file mode 100644
index 0000000..30c4b31
--- /dev/null
+++ b/data/tests/crazy.toml
@@ -0,0 +1,177 @@
+[[tests]]
+name = "crazy-misc1"
+pattern = '[-+]?[0-9]*\.?[0-9]+'
+input = "0.1"
+matches = [[0, 3]]
+
+[[tests]]
+name = "crazy-misc2"
+pattern = '[-+]?[0-9]*\.?[0-9]+'
+input = "0.1.2"
+matches = [[0, 3]]
+
+[[tests]]
+name = "crazy-misc3"
+pattern = '[-+]?[0-9]*\.?[0-9]+'
+input = "a1.2"
+matches = [[1, 4]]
+
+[[tests]]
+options = ["case-insensitive"]
+name = "crazy-misc4"
+pattern = '[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
+input = "mine is jam.slam@gmail.com "
+matches = [[8, 26]]
+
+[[tests]]
+options = ["case-insensitive"]
+name = "crazy-misc5"
+pattern = '[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
+input = "mine is jam.slam@gmail "
+matches = []
+
+[[tests]]
+name = "crazy-misc6"
+pattern = '''[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?'''
+input = "mine is jam.slam@gmail.com "
+matches = [[8, 26]]
+
+[[tests]]
+name = "crazy-misc7"
+pattern = '(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])'
+input = "1900-01-01"
+matches = [[0, 10]]
+
+[[tests]]
+name = "crazy-misc8"
+pattern = '(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])'
+input = "1900-00-01"
+matches = []
+
+[[tests]]
+name = "crazy-misc9"
+pattern = '(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])'
+input = "1900-13-01"
+matches = []
+
+
+[[tests]]
+name = "crazy-negclass1"
+pattern = "[^ac]"
+input = "acx"
+matches = [[2, 3]]
+
+[[tests]]
+name = "crazy-negclass2"
+pattern = "[^a,]"
+input = "a,x"
+matches = [[2, 3]]
+
+[[tests]]
+name = "crazy-negclass3"
+pattern = '[^a\s]'
+input = "a x"
+matches = [[2, 3]]
+
+[[tests]]
+name = "crazy-negclass4"
+pattern = "[^,]"
+input = ",,x"
+matches = [[2, 3]]
+
+[[tests]]
+name = "crazy-negclass5"
+pattern = '[^\s]'
+input = " a"
+matches = [[1, 2]]
+
+[[tests]]
+name = "crazy-negclass6"
+pattern = '[^,\s]'
+input = ", a"
+matches = [[2, 3]]
+
+[[tests]]
+name = "crazy-negclass7"
+pattern = '[^\s,]'
+input = " ,a"
+matches = [[2, 3]]
+
+[[tests]]
+name = "crazy-negclass8"
+pattern = "[^[:alpha:]Z]"
+input = "A1"
+matches = [[1, 2]]
+
+
+[[tests]]
+name = "crazy-empty-repeat1"
+pattern = "((.*)*?)="
+input = "a=b"
+matches = [[0, 2]]
+
+[[tests]]
+name = "crazy-empty-repeat2"
+pattern = "((.?)*?)="
+input = "a=b"
+matches = [[0, 2]]
+
+[[tests]]
+name = "crazy-empty-repeat3"
+pattern = "((.*)+?)="
+input = "a=b"
+matches = [[0, 2]]
+
+[[tests]]
+name = "crazy-empty-repeat4"
+pattern = "((.?)+?)="
+input = "a=b"
+matches = [[0, 2]]
+
+[[tests]]
+name = "crazy-empty-repeat5"
+pattern = "((.*){1,}?)="
+input = "a=b"
+matches = [[0, 2]]
+
+[[tests]]
+name = "crazy-empty-repeat6"
+pattern = "((.*){1,2}?)="
+input = "a=b"
+matches = [[0, 2]]
+
+[[tests]]
+name = "crazy-empty-repeat7"
+pattern = "((.*)*)="
+input = "a=b"
+matches = [[0, 2]]
+
+[[tests]]
+name = "crazy-empty-repeat8"
+pattern = "((.?)*)="
+input = "a=b"
+matches = [[0, 2]]
+
+[[tests]]
+name = "crazy-empty-repeat9"
+pattern = "((.*)+)="
+input = "a=b"
+matches = [[0, 2]]
+
+[[tests]]
+name = "crazy-empty-repeat10"
+pattern = "((.?)+)="
+input = "a=b"
+matches = [[0, 2]]
+
+[[tests]]
+name = "crazy-empty-repeat11"
+pattern = "((.*){1,})="
+input = "a=b"
+matches = [[0, 2]]
+
+[[tests]]
+name = "crazy-empty-repeat12"
+pattern = "((.*){1,2})="
+input = "a=b"
+matches = [[0, 2]]
diff --git a/data/tests/flags.toml b/data/tests/flags.toml
new file mode 100644
index 0000000..98024d9
--- /dev/null
+++ b/data/tests/flags.toml
@@ -0,0 +1,59 @@
+[[tests]]
+name = "flags1"
+pattern = "(?i)abc"
+input = "ABC"
+matches = [[0, 3]]
+
+[[tests]]
+name = "flags2"
+pattern = "(?i)a(?-i)bc"
+input = "Abc"
+matches = [[0, 3]]
+
+[[tests]]
+name = "flags3"
+pattern = "(?i)a(?-i)bc"
+input = "ABC"
+matches = []
+
+[[tests]]
+name = "flags4"
+pattern = "(?is)a."
+input = "A\n"
+matches = [[0, 2]]
+
+[[tests]]
+name = "flags5"
+pattern = "(?is)a.(?-is)a."
+input = "A\nab"
+matches = [[0, 4]]
+
+[[tests]]
+name = "flags6"
+pattern = "(?is)a.(?-is)a."
+input = "A\na\n"
+matches = []
+
+[[tests]]
+name = "flags7"
+pattern = "(?is)a.(?-is:a.)?"
+input = "A\na\n"
+matches = [[0, 2]]
+
+[[tests]]
+name = "flags8"
+pattern = "(?U)a+"
+input = "aa"
+matches = [[0, 1]]
+
+[[tests]]
+name = "flags9"
+pattern = "(?U)a+?"
+input = "aa"
+matches = [[0, 2]]
+
+[[tests]]
+name = "flags10"
+pattern = "(?U)(?-U)a+"
+input = "aa"
+matches = [[0, 2]]
diff --git a/data/tests/fowler/LICENSE b/data/tests/fowler/LICENSE
new file mode 100644
index 0000000..f47dbf4
--- /dev/null
+++ b/data/tests/fowler/LICENSE
@@ -0,0 +1,19 @@
+The following license covers testregex.c and all associated test data.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of THIS SOFTWARE FILE (the "Software"), to deal in the Software
+without restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, and/or sell copies of the
+Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following disclaimer:
+
+THIS SOFTWARE IS PROVIDED BY AT&T ``AS IS'' AND ANY EXPRESS OR IMPLIED
+WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL AT&T BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/data/tests/fowler/README b/data/tests/fowler/README
new file mode 100644
index 0000000..55507f0
--- /dev/null
+++ b/data/tests/fowler/README
@@ -0,0 +1,23 @@
+Test data was taken from the Go distribution, which was in turn taken from the
+testregex test suite:
+
+ http://www2.research.att.com/~astopen/testregex/testregex.html
+
+Unfortunately, the above link is now dead, but the test data lives on.
+
+The LICENSE in this directory corresponds to the LICENSE that the data was
+originally released under.
+
+The tests themselves were modified for RE2/Go. A couple were modified further
+by me (Andrew Gallant) (only in repetition.dat) so that RE2/Go would pass them.
+(Yes, it seems like RE2/Go includes failing test cases.) This may or may not
+have been a bad idea, but I think being consistent with an established Regex
+library is worth something.
+
+After some number of years, these tests were transformed into a TOML format
+using the fowler-to-toml script in this directory, e.g.,
+
+    ./fowler-to-toml basic.dat > basic.toml
+
+which brings them into a sensible structured format in which other tests can
+be written.
diff --git a/data/tests/fowler/basic.dat b/data/tests/fowler/basic.dat
new file mode 100644
index 0000000..e55efae
--- /dev/null
+++ b/data/tests/fowler/basic.dat
@@ -0,0 +1,221 @@
+NOTE all standard compliant implementations should pass these : 2002-05-31
+
+BE abracadabra$ abracadabracadabra (7,18)
+BE a...b abababbb (2,7)
+BE XXXXXX ..XXXXXX (2,8)
+E \) () (1,2)
+BE a] a]a (0,2)
+B } } (0,1)
+E \} } (0,1)
+BE \] ] (0,1)
+B ] ] (0,1)
+E ] ] (0,1)
+B { { (0,1)
+B } } (0,1)
+BE ^a ax (0,1)
+BE \^a a^a (1,3)
+BE a\^ a^ (0,2)
+BE a$ aa (1,2)
+BE a\$ a$ (0,2)
+BE ^$ NULL (0,0)
+E $^ NULL (0,0)
+E a($) aa (1,2)(2,2)
+E a*(^a) aa (0,1)(0,1)
+E (..)*(...)* a (0,0)
+E (..)*(...)* abcd (0,4)(2,4)
+E (ab|a)(bc|c) abc (0,3)(0,2)(2,3)
+E (ab)c|abc abc (0,3)(0,2)
+E a{0}b ab (1,2)
+E (a*)(b?)(b+)b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7)
+E (a*)(b{0,1})(b{1,})b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7)
+E a{9876543210} NULL BADBR
+E ((a|a)|a) a (0,1)(0,1)(0,1)
+E (a*)(a|aa) aaaa (0,4)(0,3)(3,4)
+E a*(a.|aa) aaaa (0,4)(2,4)
+E a(b)|c(d)|a(e)f aef (0,3)(?,?)(?,?)(1,2)
+E (a|b)?.* b (0,1)(0,1)
+E (a|b)c|a(b|c) ac (0,2)(0,1)
+E (a|b)c|a(b|c) ab (0,2)(?,?)(1,2)
+E (a|b)*c|(a|ab)*c abc (0,3)(1,2)
+E (a|b)*c|(a|ab)*c xc (1,2)
+E (.a|.b).*|.*(.a|.b) xa (0,2)(0,2)
+E a?(ab|ba)ab abab (0,4)(0,2)
+E a?(ac{0}b|ba)ab abab (0,4)(0,2)
+E ab|abab abbabab (0,2)
+E aba|bab|bba baaabbbaba (5,8)
+E aba|bab baaabbbaba (6,9)
+E (aa|aaa)*|(a|aaaaa) aa (0,2)(0,2)
+E (a.|.a.)*|(a|.a...) aa (0,2)(0,2)
+E ab|a xabc (1,3)
+E ab|a xxabc (2,4)
+Ei (Ab|cD)* aBcD (0,4)(2,4)
+BE [^-] --a (2,3)
+BE [a-]* --a (0,3)
+BE [a-m-]* --amoma-- (0,4)
+E :::1:::0:|:::1:1:0: :::0:::1:::1:::0: (8,17)
+E :::1:::0:|:::1:1:1: :::0:::1:::1:::0: (8,17)
+{E [[:upper:]] A (0,1) [[<element>]] not supported
+E [[:lower:]]+ `az{ (1,3)
+E [[:upper:]]+ @AZ[ (1,3)
+# No collation in Go
+#BE [[-]] [[-]] (2,4)
+#BE [[.NIL.]] NULL ECOLLATE
+#BE [[=aleph=]] NULL ECOLLATE
+}
+BE$ \n \n (0,1)
+BEn$ \n \n (0,1)
+BE$ [^a] \n (0,1)
+BE$ \na \na (0,2)
+E (a)(b)(c) abc (0,3)(0,1)(1,2)(2,3)
+BE xxx xxx (0,3)
+E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 6, (0,6)
+E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) 2/7 (0,3)
+E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 1,Feb 6 (5,11)
+E3 ((((((((((((((((((((((((((((((x)))))))))))))))))))))))))))))) x (0,1)(0,1)(0,1)
+E3 ((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))* xx (0,2)(1,2)(1,2)
+E a?(ab|ba)* ababababababababababababababababababababababababababababababababababababababababa (0,81)(79,81)
+E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabbbbaa (18,25)
+E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabaa (18,22)
+E aaac|aabc|abac|abbc|baac|babc|bbac|bbbc baaabbbabac (7,11)
+BE$ .* \x01\x7f (0,2)
+E aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa (53,57)
+L aaaa\nbbbb\ncccc\nddddd\neeeeee\nfffffff\ngggg\nhhhh\niiiii\njjjjj\nkkkkk\nllll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa NOMATCH
+E a*a*a*a*a*b aaaaaaaaab (0,10)
+BE ^ NULL (0,0)
+BE $ NULL (0,0)
+BE ^$ NULL (0,0)
+BE ^a$ a (0,1)
+BE abc abc (0,3)
+BE abc xabcy (1,4)
+BE abc ababc (2,5)
+BE ab*c abc (0,3)
+BE ab*bc abc (0,3)
+BE ab*bc abbc (0,4)
+BE ab*bc abbbbc (0,6)
+E ab+bc abbc (0,4)
+E ab+bc abbbbc (0,6)
+E ab?bc abbc (0,4)
+E ab?bc abc (0,3)
+E ab?c abc (0,3)
+BE ^abc$ abc (0,3)
+BE ^abc abcc (0,3)
+BE abc$ aabc (1,4)
+BE ^ abc (0,0)
+BE $ abc (3,3)
+BE a.c abc (0,3)
+BE a.c axc (0,3)
+BE a.*c axyzc (0,5)
+BE a[bc]d abd (0,3)
+BE a[b-d]e ace (0,3)
+BE a[b-d] aac (1,3)
+BE a[-b] a- (0,2)
+BE a[b-] a- (0,2)
+BE a] a] (0,2)
+BE a[]]b a]b (0,3)
+BE a[^bc]d aed (0,3)
+BE a[^-b]c adc (0,3)
+BE a[^]b]c adc (0,3)
+E ab|cd abc (0,2)
+E ab|cd abcd (0,2)
+E a\(b a(b (0,3)
+E a\(*b ab (0,2)
+E a\(*b a((b (0,4)
+E ((a)) abc (0,1)(0,1)(0,1)
+E (a)b(c) abc (0,3)(0,1)(2,3)
+E a+b+c aabbabc (4,7)
+E a* aaa (0,3)
+#E (a*)* - (0,0)(0,0)
+E (a*)* - (0,0)(?,?) RE2/Go
+E (a*)+ - (0,0)(0,0)
+#E (a*|b)* - (0,0)(0,0)
+E (a*|b)* - (0,0)(?,?) RE2/Go
+E (a+|b)* ab (0,2)(1,2)
+E (a+|b)+ ab (0,2)(1,2)
+E (a+|b)? ab (0,1)(0,1)
+BE [^ab]* cde (0,3)
+#E (^)* - (0,0)(0,0)
+E (^)* - (0,0)(?,?) RE2/Go
+BE a* NULL (0,0)
+E ([abc])*d abbbcd (0,6)(4,5)
+E ([abc])*bcd abcd (0,4)(0,1)
+E a|b|c|d|e e (0,1)
+E (a|b|c|d|e)f ef (0,2)(0,1)
+#E ((a*|b))* - (0,0)(0,0)(0,0)
+E ((a*|b))* - (0,0)(?,?)(?,?) RE2/Go
+BE abcd*efg abcdefg (0,7)
+BE ab* xabyabbbz (1,3)
+BE ab* xayabbbz (1,2)
+E (ab|cd)e abcde (2,5)(2,4)
+BE [abhgefdc]ij hij (0,3)
+E (a|b)c*d abcd (1,4)(1,2)
+E (ab|ab*)bc abc (0,3)(0,1)
+E a([bc]*)c* abc (0,3)(1,3)
+E a([bc]*)(c*d) abcd (0,4)(1,3)(3,4)
+E a([bc]+)(c*d) abcd (0,4)(1,3)(3,4)
+E a([bc]*)(c+d) abcd (0,4)(1,2)(2,4)
+E a[bcd]*dcdcde adcdcde (0,7)
+E (ab|a)b*c abc (0,3)(0,2)
+E ((a)(b)c)(d) abcd (0,4)(0,3)(0,1)(1,2)(3,4)
+BE [A-Za-z_][A-Za-z0-9_]* alpha (0,5)
+E ^a(bc+|b[eh])g|.h$ abh (1,3)
+E (bc+d$|ef*g.|h?i(j|k)) effgz (0,5)(0,5)
+E (bc+d$|ef*g.|h?i(j|k)) ij (0,2)(0,2)(1,2)
+E (bc+d$|ef*g.|h?i(j|k)) reffgz (1,6)(1,6)
+E (((((((((a))))))))) a (0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)
+BE multiple words multiple words yeah (0,14)
+E (.*)c(.*) abcde (0,5)(0,2)(3,5)
+BE abcd abcd (0,4)
+E a(bc)d abcd (0,4)(1,3)
+E a[-]?c ac (0,3)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qaddafi (0,15)(?,?)(10,12)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mo'ammar Gadhafi (0,16)(?,?)(11,13)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Kaddafi (0,15)(?,?)(10,12)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qadhafi (0,15)(?,?)(10,12)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gadafi (0,14)(?,?)(10,11)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadafi (0,15)(?,?)(11,12)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moamar Gaddafi (0,14)(?,?)(9,11)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadhdhafi (0,18)(?,?)(13,15)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Khaddafi (0,16)(?,?)(11,13)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafy (0,16)(?,?)(11,13)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghadafi (0,15)(?,?)(11,12)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafi (0,16)(?,?)(11,13)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muamar Kaddafi (0,14)(?,?)(9,11)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Quathafi (0,16)(?,?)(11,13)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gheddafi (0,16)(?,?)(11,13)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Khadafy (0,15)(?,?)(11,12)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Qudhafi (0,15)(?,?)(10,12)
+E a+(b|c)*d+ aabcdd (0,6)(3,4)
+E ^.+$ vivi (0,4)
+E ^(.+)$ vivi (0,4)(0,4)
+E ^([^!.]+).att.com!(.+)$ gryphon.att.com!eby (0,19)(0,7)(16,19)
+E ^([^!]+!)?([^!]+)$ bas (0,3)(?,?)(0,3)
+E ^([^!]+!)?([^!]+)$ bar!bas (0,7)(0,4)(4,7)
+E ^([^!]+!)?([^!]+)$ foo!bas (0,7)(0,4)(4,7)
+E ^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(4,8)(8,11)
+E ((foo)|(bar))!bas bar!bas (0,7)(0,3)(?,?)(0,3)
+E ((foo)|(bar))!bas foo!bar!bas (4,11)(4,7)(?,?)(4,7)
+E ((foo)|(bar))!bas foo!bas (0,7)(0,3)(0,3)
+E ((foo)|bar)!bas bar!bas (0,7)(0,3)
+E ((foo)|bar)!bas foo!bar!bas (4,11)(4,7)
+E ((foo)|bar)!bas foo!bas (0,7)(0,3)(0,3)
+E (foo|(bar))!bas bar!bas (0,7)(0,3)(0,3)
+E (foo|(bar))!bas foo!bar!bas (4,11)(4,7)(4,7)
+E (foo|(bar))!bas foo!bas (0,7)(0,3)
+E (foo|bar)!bas bar!bas (0,7)(0,3)
+E (foo|bar)!bas foo!bar!bas (4,11)(4,7)
+E (foo|bar)!bas foo!bas (0,7)(0,3)
+E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
+E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bas (0,3)(?,?)(0,3)
+E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bar!bas (0,7)(0,4)(4,7)
+E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(?,?)(?,?)(4,8)(8,11)
+E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bas (0,7)(0,4)(4,7)
+E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bas (0,3)(0,3)(?,?)(0,3)
+E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bar!bas (0,7)(0,7)(0,4)(4,7)
+E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
+E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bas (0,7)(0,7)(0,4)(4,7)
+E .*(/XXX).* /XXX (0,4)(0,4)
+E .*(\\XXX).* \XXX (0,4)(0,4)
+E \\XXX \XXX (0,4)
+E .*(/000).* /000 (0,4)(0,4)
+E .*(\\000).* \000 (0,4)(0,4)
+E \\000 \000 (0,4)
diff --git a/data/tests/fowler/basic.toml b/data/tests/fowler/basic.toml
new file mode 100644
index 0000000..3eeebd7
--- /dev/null
+++ b/data/tests/fowler/basic.toml
@@ -0,0 +1,1428 @@
+[[tests]]
+name = "basic3"
+options = ['escaped']
+pattern = '''abracadabra$'''
+input = '''abracadabracadabra'''
+matches = [[7, 18]]
+
+[[tests]]
+name = "basic4"
+options = ['escaped']
+pattern = '''a...b'''
+input = '''abababbb'''
+matches = [[2, 7]]
+
+[[tests]]
+name = "basic5"
+options = ['escaped']
+pattern = '''XXXXXX'''
+input = '''..XXXXXX'''
+matches = [[2, 8]]
+
+[[tests]]
+name = "basic6"
+options = ['escaped']
+pattern = '''\)'''
+input = '''()'''
+matches = [[1, 2]]
+
+[[tests]]
+name = "basic7"
+options = ['escaped']
+pattern = '''a]'''
+input = '''a]a'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "basic9"
+options = ['escaped']
+pattern = '''\}'''
+input = '''}'''
+matches = [[0, 1]]
+
+[[tests]]
+name = "basic10"
+options = ['escaped']
+pattern = '''\]'''
+input = ''']'''
+matches = [[0, 1]]
+
+[[tests]]
+name = "basic12"
+options = ['escaped']
+pattern = ''']'''
+input = ''']'''
+matches = [[0, 1]]
+
+[[tests]]
+name = "basic15"
+options = ['escaped']
+pattern = '''^a'''
+input = '''ax'''
+matches = [[0, 1]]
+
+[[tests]]
+name = "basic16"
+options = ['escaped']
+pattern = '''\^a'''
+input = '''a^a'''
+matches = [[1, 3]]
+
+[[tests]]
+name = "basic17"
+options = ['escaped']
+pattern = '''a\^'''
+input = '''a^'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "basic18"
+options = ['escaped']
+pattern = '''a$'''
+input = '''aa'''
+matches = [[1, 2]]
+
+[[tests]]
+name = "basic19"
+options = ['escaped']
+pattern = '''a\$'''
+input = '''a$'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "basic20"
+options = ['escaped']
+pattern = '''^$'''
+input = ''''''
+matches = [[0, 0]]
+
+[[tests]]
+name = "basic21"
+options = ['escaped']
+pattern = '''$^'''
+input = ''''''
+matches = [[0, 0]]
+
+[[tests]]
+name = "basic22"
+options = ['escaped']
+pattern = '''a($)'''
+input = '''aa'''
+matches = [[1, 2]]
+
+[[tests]]
+name = "basic23"
+options = ['escaped']
+pattern = '''a*(^a)'''
+input = '''aa'''
+matches = [[0, 1]]
+
+[[tests]]
+name = "basic24"
+options = ['escaped']
+pattern = '''(..)*(...)*'''
+input = '''a'''
+matches = [[0, 0]]
+
+[[tests]]
+name = "basic25"
+options = ['escaped']
+pattern = '''(..)*(...)*'''
+input = '''abcd'''
+matches = [[0, 4]]
+
+[[tests]]
+name = "basic26"
+options = ['escaped']
+pattern = '''(ab|a)(bc|c)'''
+input = '''abc'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "basic27"
+options = ['escaped']
+pattern = '''(ab)c|abc'''
+input = '''abc'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "basic28"
+options = ['escaped']
+pattern = '''a{0}b'''
+input = '''ab'''
+matches = [[1, 2]]
+
+[[tests]]
+name = "basic29"
+options = ['escaped']
+pattern = '''(a*)(b?)(b+)b{3}'''
+input = '''aaabbbbbbb'''
+matches = [[0, 10]]
+
+[[tests]]
+name = "basic30"
+options = ['escaped']
+pattern = '''(a*)(b{0,1})(b{1,})b{3}'''
+input = '''aaabbbbbbb'''
+matches = [[0, 10]]
+
+[[tests]]
+name = "basic32"
+options = ['escaped']
+pattern = '''((a|a)|a)'''
+input = '''a'''
+matches = [[0, 1]]
+
+[[tests]]
+name = "basic33"
+options = ['escaped']
+pattern = '''(a*)(a|aa)'''
+input = '''aaaa'''
+matches = [[0, 4]]
+
+[[tests]]
+name = "basic34"
+options = ['escaped']
+pattern = '''a*(a.|aa)'''
+input = '''aaaa'''
+matches = [[0, 4]]
+
+[[tests]]
+name = "basic35"
+options = ['escaped']
+pattern = '''a(b)|c(d)|a(e)f'''
+input = '''aef'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "basic36"
+options = ['escaped']
+pattern = '''(a|b)?.*'''
+input = '''b'''
+matches = [[0, 1]]
+
+[[tests]]
+name = "basic37"
+options = ['escaped']
+pattern = '''(a|b)c|a(b|c)'''
+input = '''ac'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "basic38"
+options = ['escaped']
+pattern = '''(a|b)c|a(b|c)'''
+input = '''ab'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "basic39"
+options = ['escaped']
+pattern = '''(a|b)*c|(a|ab)*c'''
+input = '''abc'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "basic40"
+options = ['escaped']
+pattern = '''(a|b)*c|(a|ab)*c'''
+input = '''xc'''
+matches = [[1, 2]]
+
+[[tests]]
+name = "basic41"
+options = ['escaped']
+pattern = '''(.a|.b).*|.*(.a|.b)'''
+input = '''xa'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "basic42"
+options = ['escaped']
+pattern = '''a?(ab|ba)ab'''
+input = '''abab'''
+matches = [[0, 4]]
+
+[[tests]]
+name = "basic43"
+options = ['escaped']
+pattern = '''a?(ac{0}b|ba)ab'''
+input = '''abab'''
+matches = [[0, 4]]
+
+[[tests]]
+name = "basic44"
+options = ['escaped']
+pattern = '''ab|abab'''
+input = '''abbabab'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "basic45"
+options = ['escaped']
+pattern = '''aba|bab|bba'''
+input = '''baaabbbaba'''
+matches = [[5, 8]]
+
+[[tests]]
+name = "basic46"
+options = ['escaped']
+pattern = '''aba|bab'''
+input = '''baaabbbaba'''
+matches = [[6, 9]]
+
+[[tests]]
+name = "basic47"
+options = ['escaped']
+pattern = '''(aa|aaa)*|(a|aaaaa)'''
+input = '''aa'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "basic48"
+options = ['escaped']
+pattern = '''(a.|.a.)*|(a|.a...)'''
+input = '''aa'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "basic49"
+options = ['escaped']
+pattern = '''ab|a'''
+input = '''xabc'''
+matches = [[1, 3]]
+
+[[tests]]
+name = "basic50"
+options = ['escaped']
+pattern = '''ab|a'''
+input = '''xxabc'''
+matches = [[2, 4]]
+
+[[tests]]
+name = "basic51"
+options = ['escaped', 'case-insensitive']
+pattern = '''(Ab|cD)*'''
+input = '''aBcD'''
+matches = [[0, 4]]
+
+[[tests]]
+name = "basic52"
+options = ['escaped']
+pattern = '''[^-]'''
+input = '''--a'''
+matches = [[2, 3]]
+
+[[tests]]
+name = "basic53"
+options = ['escaped']
+pattern = '''[a-]*'''
+input = '''--a'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "basic54"
+options = ['escaped']
+pattern = '''[a-m-]*'''
+input = '''--amoma--'''
+matches = [[0, 4]]
+
+[[tests]]
+name = "basic55"
+options = ['escaped']
+pattern = ''':::1:::0:|:::1:1:0:'''
+input = ''':::0:::1:::1:::0:'''
+matches = [[8, 17]]
+
+[[tests]]
+name = "basic56"
+options = ['escaped']
+pattern = ''':::1:::0:|:::1:1:1:'''
+input = ''':::0:::1:::1:::0:'''
+matches = [[8, 17]]
+
+[[tests]]
+name = "basic57"
+options = ['escaped']
+pattern = '''[[:upper:]]'''
+input = '''A'''
+matches = [[0, 1]]
+
+[[tests]]
+name = "basic58"
+options = ['escaped']
+pattern = '''[[:lower:]]+'''
+input = '''`az{'''
+matches = [[1, 3]]
+
+[[tests]]
+name = "basic59"
+options = ['escaped']
+pattern = '''[[:upper:]]+'''
+input = '''@AZ['''
+matches = [[1, 3]]
+
+[[tests]]
+name = "basic65"
+options = ['escaped']
+pattern = '''\n'''
+input = '''\n'''
+matches = [[0, 1]]
+
+[[tests]]
+name = "basic66"
+options = ['escaped']
+pattern = '''\n'''
+input = '''\n'''
+matches = [[0, 1]]
+
+[[tests]]
+name = "basic67"
+options = ['escaped']
+pattern = '''[^a]'''
+input = '''\n'''
+matches = [[0, 1]]
+
+[[tests]]
+name = "basic68"
+options = ['escaped']
+pattern = '''\na'''
+input = '''\na'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "basic69"
+options = ['escaped']
+pattern = '''(a)(b)(c)'''
+input = '''abc'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "basic70"
+options = ['escaped']
+pattern = '''xxx'''
+input = '''xxx'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "basic71"
+options = ['escaped']
+pattern = '''(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)'''
+input = '''feb 6,'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "basic72"
+options = ['escaped']
+pattern = '''(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)'''
+input = '''2/7'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "basic73"
+options = ['escaped']
+pattern = '''(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)'''
+input = '''feb 1,Feb 6'''
+matches = [[5, 11]]
+
+[[tests]]
+name = "basic74"
+options = ['escaped']
+pattern = '''((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))'''
+input = '''x'''
+matches = [[0, 1]]
+
+[[tests]]
+name = "basic75"
+options = ['escaped']
+pattern = '''((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))*'''
+input = '''xx'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "basic76"
+options = ['escaped']
+pattern = '''a?(ab|ba)*'''
+input = '''ababababababababababababababababababababababababababababababababababababababababa'''
+matches = [[0, 81]]
+
+[[tests]]
+name = "basic77"
+options = ['escaped']
+pattern = '''abaa|abbaa|abbbaa|abbbbaa'''
+input = '''ababbabbbabbbabbbbabbbbaa'''
+matches = [[18, 25]]
+
+[[tests]]
+name = "basic78"
+options = ['escaped']
+pattern = '''abaa|abbaa|abbbaa|abbbbaa'''
+input = '''ababbabbbabbbabbbbabaa'''
+matches = [[18, 22]]
+
+[[tests]]
+name = "basic79"
+options = ['escaped']
+pattern = '''aaac|aabc|abac|abbc|baac|babc|bbac|bbbc'''
+input = '''baaabbbabac'''
+matches = [[7, 11]]
+
+[[tests]]
+name = "basic80"
+options = ['escaped']
+pattern = '''.*'''
+input = '''\x01\x7f'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "basic81"
+options = ['escaped']
+pattern = '''aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll'''
+input = '''XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa'''
+matches = [[53, 57]]
+
+[[tests]]
+name = "basic83"
+options = ['escaped']
+pattern = '''a*a*a*a*a*b'''
+input = '''aaaaaaaaab'''
+matches = [[0, 10]]
+
+[[tests]]
+name = "basic84"
+options = ['escaped']
+pattern = '''^'''
+input = ''''''
+matches = [[0, 0]]
+
+[[tests]]
+name = "basic85"
+options = ['escaped']
+pattern = '''$'''
+input = ''''''
+matches = [[0, 0]]
+
+[[tests]]
+name = "basic86"
+options = ['escaped']
+pattern = '''^$'''
+input = ''''''
+matches = [[0, 0]]
+
+[[tests]]
+name = "basic87"
+options = ['escaped']
+pattern = '''^a$'''
+input = '''a'''
+matches = [[0, 1]]
+
+[[tests]]
+name = "basic88"
+options = ['escaped']
+pattern = '''abc'''
+input = '''abc'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "basic89"
+options = ['escaped']
+pattern = '''abc'''
+input = '''xabcy'''
+matches = [[1, 4]]
+
+[[tests]]
+name = "basic90"
+options = ['escaped']
+pattern = '''abc'''
+input = '''ababc'''
+matches = [[2, 5]]
+
+[[tests]]
+name = "basic91"
+options = ['escaped']
+pattern = '''ab*c'''
+input = '''abc'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "basic92"
+options = ['escaped']
+pattern = '''ab*bc'''
+input = '''abc'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "basic93"
+options = ['escaped']
+pattern = '''ab*bc'''
+input = '''abbc'''
+matches = [[0, 4]]
+
+[[tests]]
+name = "basic94"
+options = ['escaped']
+pattern = '''ab*bc'''
+input = '''abbbbc'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "basic95"
+options = ['escaped']
+pattern = '''ab+bc'''
+input = '''abbc'''
+matches = [[0, 4]]
+
+[[tests]]
+name = "basic96"
+options = ['escaped']
+pattern = '''ab+bc'''
+input = '''abbbbc'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "basic97"
+options = ['escaped']
+pattern = '''ab?bc'''
+input = '''abbc'''
+matches = [[0, 4]]
+
+[[tests]]
+name = "basic98"
+options = ['escaped']
+pattern = '''ab?bc'''
+input = '''abc'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "basic99"
+options = ['escaped']
+pattern = '''ab?c'''
+input = '''abc'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "basic100"
+options = ['escaped']
+pattern = '''^abc$'''
+input = '''abc'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "basic101"
+options = ['escaped']
+pattern = '''^abc'''
+input = '''abcc'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "basic102"
+options = ['escaped']
+pattern = '''abc$'''
+input = '''aabc'''
+matches = [[1, 4]]
+
+[[tests]]
+name = "basic103"
+options = ['escaped']
+pattern = '''^'''
+input = '''abc'''
+matches = [[0, 0]]
+
+[[tests]]
+name = "basic104"
+options = ['escaped']
+pattern = '''$'''
+input = '''abc'''
+matches = [[3, 3]]
+
+[[tests]]
+name = "basic105"
+options = ['escaped']
+pattern = '''a.c'''
+input = '''abc'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "basic106"
+options = ['escaped']
+pattern = '''a.c'''
+input = '''axc'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "basic107"
+options = ['escaped']
+pattern = '''a.*c'''
+input = '''axyzc'''
+matches = [[0, 5]]
+
+[[tests]]
+name = "basic108"
+options = ['escaped']
+pattern = '''a[bc]d'''
+input = '''abd'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "basic109"
+options = ['escaped']
+pattern = '''a[b-d]e'''
+input = '''ace'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "basic110"
+options = ['escaped']
+pattern = '''a[b-d]'''
+input = '''aac'''
+matches = [[1, 3]]
+
+[[tests]]
+name = "basic111"
+options = ['escaped']
+pattern = '''a[-b]'''
+input = '''a-'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "basic112"
+options = ['escaped']
+pattern = '''a[b-]'''
+input = '''a-'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "basic113"
+options = ['escaped']
+pattern = '''a]'''
+input = '''a]'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "basic114"
+options = ['escaped']
+pattern = '''a[]]b'''
+input = '''a]b'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "basic115"
+options = ['escaped']
+pattern = '''a[^bc]d'''
+input = '''aed'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "basic116"
+options = ['escaped']
+pattern = '''a[^-b]c'''
+input = '''adc'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "basic117"
+options = ['escaped']
+pattern = '''a[^]b]c'''
+input = '''adc'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "basic118"
+options = ['escaped']
+pattern = '''ab|cd'''
+input = '''abc'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "basic119"
+options = ['escaped']
+pattern = '''ab|cd'''
+input = '''abcd'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "basic120"
+options = ['escaped']
+pattern = '''a\(b'''
+input = '''a(b'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "basic121"
+options = ['escaped']
+pattern = '''a\(*b'''
+input = '''ab'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "basic122"
+options = ['escaped']
+pattern = '''a\(*b'''
+input = '''a((b'''
+matches = [[0, 4]]
+
+[[tests]]
+name = "basic123"
+options = ['escaped']
+pattern = '''((a))'''
+input = '''abc'''
+matches = [[0, 1]]
+
+[[tests]]
+name = "basic124"
+options = ['escaped']
+pattern = '''(a)b(c)'''
+input = '''abc'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "basic125"
+options = ['escaped']
+pattern = '''a+b+c'''
+input = '''aabbabc'''
+matches = [[4, 7]]
+
+[[tests]]
+name = "basic126"
+options = ['escaped']
+pattern = '''a*'''
+input = '''aaa'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "basic128"
+options = ['escaped']
+pattern = '''(a*)*'''
+input = '''-'''
+matches = [[0, 0]]
+
+[[tests]]
+name = "basic129"
+options = ['escaped']
+pattern = '''(a*)+'''
+input = '''-'''
+matches = [[0, 0]]
+
+[[tests]]
+name = "basic131"
+options = ['escaped']
+pattern = '''(a*|b)*'''
+input = '''-'''
+matches = [[0, 0]]
+
+[[tests]]
+name = "basic132"
+options = ['escaped']
+pattern = '''(a+|b)*'''
+input = '''ab'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "basic133"
+options = ['escaped']
+pattern = '''(a+|b)+'''
+input = '''ab'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "basic134"
+options = ['escaped']
+pattern = '''(a+|b)?'''
+input = '''ab'''
+matches = [[0, 1]]
+
+[[tests]]
+name = "basic135"
+options = ['escaped']
+pattern = '''[^ab]*'''
+input = '''cde'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "basic137"
+options = ['escaped']
+pattern = '''(^)*'''
+input = '''-'''
+matches = [[0, 0]]
+
+[[tests]]
+name = "basic138"
+options = ['escaped']
+pattern = '''a*'''
+input = ''''''
+matches = [[0, 0]]
+
+[[tests]]
+name = "basic139"
+options = ['escaped']
+pattern = '''([abc])*d'''
+input = '''abbbcd'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "basic140"
+options = ['escaped']
+pattern = '''([abc])*bcd'''
+input = '''abcd'''
+matches = [[0, 4]]
+
+[[tests]]
+name = "basic141"
+options = ['escaped']
+pattern = '''a|b|c|d|e'''
+input = '''e'''
+matches = [[0, 1]]
+
+[[tests]]
+name = "basic142"
+options = ['escaped']
+pattern = '''(a|b|c|d|e)f'''
+input = '''ef'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "basic144"
+options = ['escaped']
+pattern = '''((a*|b))*'''
+input = '''-'''
+matches = [[0, 0]]
+
+[[tests]]
+name = "basic145"
+options = ['escaped']
+pattern = '''abcd*efg'''
+input = '''abcdefg'''
+matches = [[0, 7]]
+
+[[tests]]
+name = "basic146"
+options = ['escaped']
+pattern = '''ab*'''
+input = '''xabyabbbz'''
+matches = [[1, 3]]
+
+[[tests]]
+name = "basic147"
+options = ['escaped']
+pattern = '''ab*'''
+input = '''xayabbbz'''
+matches = [[1, 2]]
+
+[[tests]]
+name = "basic148"
+options = ['escaped']
+pattern = '''(ab|cd)e'''
+input = '''abcde'''
+matches = [[2, 5]]
+
+[[tests]]
+name = "basic149"
+options = ['escaped']
+pattern = '''[abhgefdc]ij'''
+input = '''hij'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "basic150"
+options = ['escaped']
+pattern = '''(a|b)c*d'''
+input = '''abcd'''
+matches = [[1, 4]]
+
+[[tests]]
+name = "basic151"
+options = ['escaped']
+pattern = '''(ab|ab*)bc'''
+input = '''abc'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "basic152"
+options = ['escaped']
+pattern = '''a([bc]*)c*'''
+input = '''abc'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "basic153"
+options = ['escaped']
+pattern = '''a([bc]*)(c*d)'''
+input = '''abcd'''
+matches = [[0, 4]]
+
+[[tests]]
+name = "basic154"
+options = ['escaped']
+pattern = '''a([bc]+)(c*d)'''
+input = '''abcd'''
+matches = [[0, 4]]
+
+[[tests]]
+name = "basic155"
+options = ['escaped']
+pattern = '''a([bc]*)(c+d)'''
+input = '''abcd'''
+matches = [[0, 4]]
+
+[[tests]]
+name = "basic156"
+options = ['escaped']
+pattern = '''a[bcd]*dcdcde'''
+input = '''adcdcde'''
+matches = [[0, 7]]
+
+[[tests]]
+name = "basic157"
+options = ['escaped']
+pattern = '''(ab|a)b*c'''
+input = '''abc'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "basic158"
+options = ['escaped']
+pattern = '''((a)(b)c)(d)'''
+input = '''abcd'''
+matches = [[0, 4]]
+
+[[tests]]
+name = "basic159"
+options = ['escaped']
+pattern = '''[A-Za-z_][A-Za-z0-9_]*'''
+input = '''alpha'''
+matches = [[0, 5]]
+
+[[tests]]
+name = "basic160"
+options = ['escaped']
+pattern = '''^a(bc+|b[eh])g|.h$'''
+input = '''abh'''
+matches = [[1, 3]]
+
+[[tests]]
+name = "basic161"
+options = ['escaped']
+pattern = '''(bc+d$|ef*g.|h?i(j|k))'''
+input = '''effgz'''
+matches = [[0, 5]]
+
+[[tests]]
+name = "basic162"
+options = ['escaped']
+pattern = '''(bc+d$|ef*g.|h?i(j|k))'''
+input = '''ij'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "basic163"
+options = ['escaped']
+pattern = '''(bc+d$|ef*g.|h?i(j|k))'''
+input = '''reffgz'''
+matches = [[1, 6]]
+
+[[tests]]
+name = "basic164"
+options = ['escaped']
+pattern = '''(((((((((a)))))))))'''
+input = '''a'''
+matches = [[0, 1]]
+
+[[tests]]
+name = "basic165"
+options = ['escaped']
+pattern = '''multiple words'''
+input = '''multiple words yeah'''
+matches = [[0, 14]]
+
+[[tests]]
+name = "basic166"
+options = ['escaped']
+pattern = '''(.*)c(.*)'''
+input = '''abcde'''
+matches = [[0, 5]]
+
+[[tests]]
+name = "basic167"
+options = ['escaped']
+pattern = '''abcd'''
+input = '''abcd'''
+matches = [[0, 4]]
+
+[[tests]]
+name = "basic168"
+options = ['escaped']
+pattern = '''a(bc)d'''
+input = '''abcd'''
+matches = [[0, 4]]
+
+[[tests]]
+name = "basic169"
+options = ['escaped']
+pattern = '''a[\x01-\x03]?c'''
+input = '''a\x02c'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "basic170"
+options = ['escaped']
+pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Muammar Qaddafi'''
+matches = [[0, 15]]
+
+[[tests]]
+name = "basic171"
+options = ['escaped']
+pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Mo'ammar Gadhafi'''
+matches = [[0, 16]]
+
+[[tests]]
+name = "basic172"
+options = ['escaped']
+pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Muammar Kaddafi'''
+matches = [[0, 15]]
+
+[[tests]]
+name = "basic173"
+options = ['escaped']
+pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Muammar Qadhafi'''
+matches = [[0, 15]]
+
+[[tests]]
+name = "basic174"
+options = ['escaped']
+pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Muammar Gadafi'''
+matches = [[0, 14]]
+
+[[tests]]
+name = "basic175"
+options = ['escaped']
+pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Mu'ammar Qadafi'''
+matches = [[0, 15]]
+
+[[tests]]
+name = "basic176"
+options = ['escaped']
+pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Moamar Gaddafi'''
+matches = [[0, 14]]
+
+[[tests]]
+name = "basic177"
+options = ['escaped']
+pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Mu'ammar Qadhdhafi'''
+matches = [[0, 18]]
+
+[[tests]]
+name = "basic178"
+options = ['escaped']
+pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Muammar Khaddafi'''
+matches = [[0, 16]]
+
+[[tests]]
+name = "basic179"
+options = ['escaped']
+pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Muammar Ghaddafy'''
+matches = [[0, 16]]
+
+[[tests]]
+name = "basic180"
+options = ['escaped']
+pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Muammar Ghadafi'''
+matches = [[0, 15]]
+
+[[tests]]
+name = "basic181"
+options = ['escaped']
+pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Muammar Ghaddafi'''
+matches = [[0, 16]]
+
+[[tests]]
+name = "basic182"
+options = ['escaped']
+pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Muamar Kaddafi'''
+matches = [[0, 14]]
+
+[[tests]]
+name = "basic183"
+options = ['escaped']
+pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Muammar Quathafi'''
+matches = [[0, 16]]
+
+[[tests]]
+name = "basic184"
+options = ['escaped']
+pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Muammar Gheddafi'''
+matches = [[0, 16]]
+
+[[tests]]
+name = "basic185"
+options = ['escaped']
+pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Moammar Khadafy'''
+matches = [[0, 15]]
+
+[[tests]]
+name = "basic186"
+options = ['escaped']
+pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Moammar Qudhafi'''
+matches = [[0, 15]]
+
+[[tests]]
+name = "basic187"
+options = ['escaped']
+pattern = '''a+(b|c)*d+'''
+input = '''aabcdd'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "basic188"
+options = ['escaped']
+pattern = '''^.+$'''
+input = '''vivi'''
+matches = [[0, 4]]
+
+[[tests]]
+name = "basic189"
+options = ['escaped']
+pattern = '''^(.+)$'''
+input = '''vivi'''
+matches = [[0, 4]]
+
+[[tests]]
+name = "basic190"
+options = ['escaped']
+pattern = '''^([^!.]+).att.com!(.+)$'''
+input = '''gryphon.att.com!eby'''
+matches = [[0, 19]]
+
+[[tests]]
+name = "basic191"
+options = ['escaped']
+pattern = '''^([^!]+!)?([^!]+)$'''
+input = '''bas'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "basic192"
+options = ['escaped']
+pattern = '''^([^!]+!)?([^!]+)$'''
+input = '''bar!bas'''
+matches = [[0, 7]]
+
+[[tests]]
+name = "basic193"
+options = ['escaped']
+pattern = '''^([^!]+!)?([^!]+)$'''
+input = '''foo!bas'''
+matches = [[0, 7]]
+
+[[tests]]
+name = "basic194"
+options = ['escaped']
+pattern = '''^.+!([^!]+!)([^!]+)$'''
+input = '''foo!bar!bas'''
+matches = [[0, 11]]
+
+[[tests]]
+name = "basic195"
+options = ['escaped']
+pattern = '''((foo)|(bar))!bas'''
+input = '''bar!bas'''
+matches = [[0, 7]]
+
+[[tests]]
+name = "basic196"
+options = ['escaped']
+pattern = '''((foo)|(bar))!bas'''
+input = '''foo!bar!bas'''
+matches = [[4, 11]]
+
+[[tests]]
+name = "basic197"
+options = ['escaped']
+pattern = '''((foo)|(bar))!bas'''
+input = '''foo!bas'''
+matches = [[0, 7]]
+
+[[tests]]
+name = "basic198"
+options = ['escaped']
+pattern = '''((foo)|bar)!bas'''
+input = '''bar!bas'''
+matches = [[0, 7]]
+
+[[tests]]
+name = "basic199"
+options = ['escaped']
+pattern = '''((foo)|bar)!bas'''
+input = '''foo!bar!bas'''
+matches = [[4, 11]]
+
+[[tests]]
+name = "basic200"
+options = ['escaped']
+pattern = '''((foo)|bar)!bas'''
+input = '''foo!bas'''
+matches = [[0, 7]]
+
+[[tests]]
+name = "basic201"
+options = ['escaped']
+pattern = '''(foo|(bar))!bas'''
+input = '''bar!bas'''
+matches = [[0, 7]]
+
+[[tests]]
+name = "basic202"
+options = ['escaped']
+pattern = '''(foo|(bar))!bas'''
+input = '''foo!bar!bas'''
+matches = [[4, 11]]
+
+[[tests]]
+name = "basic203"
+options = ['escaped']
+pattern = '''(foo|(bar))!bas'''
+input = '''foo!bas'''
+matches = [[0, 7]]
+
+[[tests]]
+name = "basic204"
+options = ['escaped']
+pattern = '''(foo|bar)!bas'''
+input = '''bar!bas'''
+matches = [[0, 7]]
+
+[[tests]]
+name = "basic205"
+options = ['escaped']
+pattern = '''(foo|bar)!bas'''
+input = '''foo!bar!bas'''
+matches = [[4, 11]]
+
+[[tests]]
+name = "basic206"
+options = ['escaped']
+pattern = '''(foo|bar)!bas'''
+input = '''foo!bas'''
+matches = [[0, 7]]
+
+[[tests]]
+name = "basic207"
+options = ['escaped']
+pattern = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$'''
+input = '''foo!bar!bas'''
+matches = [[0, 11]]
+
+[[tests]]
+name = "basic208"
+options = ['escaped']
+pattern = '''^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$'''
+input = '''bas'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "basic209"
+options = ['escaped']
+pattern = '''^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$'''
+input = '''bar!bas'''
+matches = [[0, 7]]
+
+[[tests]]
+name = "basic210"
+options = ['escaped']
+pattern = '''^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$'''
+input = '''foo!bar!bas'''
+matches = [[0, 11]]
+
+[[tests]]
+name = "basic211"
+options = ['escaped']
+pattern = '''^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$'''
+input = '''foo!bas'''
+matches = [[0, 7]]
+
+[[tests]]
+name = "basic212"
+options = ['escaped']
+pattern = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$'''
+input = '''bas'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "basic213"
+options = ['escaped']
+pattern = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$'''
+input = '''bar!bas'''
+matches = [[0, 7]]
+
+[[tests]]
+name = "basic214"
+options = ['escaped']
+pattern = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$'''
+input = '''foo!bar!bas'''
+matches = [[0, 11]]
+
+[[tests]]
+name = "basic215"
+options = ['escaped']
+pattern = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$'''
+input = '''foo!bas'''
+matches = [[0, 7]]
+
+[[tests]]
+name = "basic216"
+options = ['escaped']
+pattern = '''.*(/XXX).*'''
+input = '''/XXX'''
+matches = [[0, 4]]
+
+[[tests]]
+name = "basic217"
+options = ['escaped']
+pattern = '''.*(\\XXX).*'''
+input = '''\\XXX'''
+matches = [[0, 4]]
+
+[[tests]]
+name = "basic218"
+options = ['escaped']
+pattern = '''\\XXX'''
+input = '''\\XXX'''
+matches = [[0, 4]]
+
+[[tests]]
+name = "basic219"
+options = ['escaped']
+pattern = '''.*(/000).*'''
+input = '''/000'''
+matches = [[0, 4]]
+
+[[tests]]
+name = "basic220"
+options = ['escaped']
+pattern = '''.*(\\000).*'''
+input = '''\\000'''
+matches = [[0, 4]]
+
+[[tests]]
+name = "basic221"
+options = ['escaped']
+pattern = '''\\000'''
+input = '''\\000'''
+matches = [[0, 4]]
+
diff --git a/data/tests/fowler/fowler-to-toml b/data/tests/fowler/fowler-to-toml
new file mode 100755
index 0000000..5f1d91f
--- /dev/null
+++ b/data/tests/fowler/fowler-to-toml
@@ -0,0 +1,76 @@
+#!/usr/bin/env python
+
+from __future__ import absolute_import, division, print_function
+import argparse
+import os.path as path
+
+
+def read_tests(f):
+ basename, _ = path.splitext(path.basename(f))
+ tests = []
+ prev_pattern = None
+
+ for lineno, line in enumerate(open(f), 1):
+ fields = list(filter(None, map(str.strip, line.split('\t'))))
+ if not (4 <= len(fields) <= 5) \
+ or 'E' not in fields[0] or fields[0][0] == '#':
+ continue
+
+ terse_opts, pat, text, sgroups = fields[0:4]
+ groups = [] # groups as integer ranges
+ if sgroups == 'NOMATCH':
+ groups = []
+ elif ',' in sgroups:
+ noparen = map(lambda s: s.strip('()'), sgroups.split(')('))
+ for g in noparen:
+ s, e = map(str.strip, g.split(','))
+ groups.append([int(s), int(e)])
+ break
+ else:
+ # This skips tests that should result in an error.
+ # There aren't many, so I think we can just capture those
+ # manually. Possibly fix this in future.
+ continue
+
+ opts = []
+ if text == "NULL":
+ text = ""
+ if pat == 'SAME':
+ pat = prev_pattern
+ if '$' in terse_opts:
+ pat = pat.encode('utf-8').decode('unicode_escape')
+ text = text.encode('utf-8').decode('unicode_escape')
+ text = text.encode('unicode_escape').decode('utf-8')
+ opts.append('escaped')
+ else:
+ opts.append('escaped')
+ text = text.encode('unicode_escape').decode('utf-8')
+ if 'i' in terse_opts:
+ opts.append('case-insensitive')
+
+ pat = pat.encode('unicode_escape').decode('utf-8')
+ pat = pat.replace('\\\\', '\\')
+ tests.append({
+ 'name': '"%s%d"' % (basename, lineno),
+ 'options': repr(opts),
+ 'pattern': "'''%s'''" % pat,
+ 'input': "'''%s'''" % text,
+ 'matches': str(groups),
+ })
+ prev_pattern = pat
+ return tests
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(
+ description='Generate match tests from an AT&T POSIX test file.')
+ aa = parser.add_argument
+ aa('datfile', help='A dat AT&T POSIX test file.')
+ args = parser.parse_args()
+
+ tests = read_tests(args.datfile)
+ for t in tests:
+ print('[[tests]]')
+ for k, v in t.items():
+ print('%s = %s' % (k, v))
+ print('')
diff --git a/data/tests/fowler/nullsubexpr.dat b/data/tests/fowler/nullsubexpr.dat
new file mode 100644
index 0000000..2e18fbb
--- /dev/null
+++ b/data/tests/fowler/nullsubexpr.dat
@@ -0,0 +1,79 @@
+NOTE null subexpression matches : 2002-06-06
+
+E (a*)* a (0,1)(0,1)
+#E SAME x (0,0)(0,0)
+E SAME x (0,0)(?,?) RE2/Go
+E SAME aaaaaa (0,6)(0,6)
+E SAME aaaaaax (0,6)(0,6)
+E (a*)+ a (0,1)(0,1)
+E SAME x (0,0)(0,0)
+E SAME aaaaaa (0,6)(0,6)
+E SAME aaaaaax (0,6)(0,6)
+E (a+)* a (0,1)(0,1)
+E SAME x (0,0)
+E SAME aaaaaa (0,6)(0,6)
+E SAME aaaaaax (0,6)(0,6)
+E (a+)+ a (0,1)(0,1)
+E SAME x NOMATCH
+E SAME aaaaaa (0,6)(0,6)
+E SAME aaaaaax (0,6)(0,6)
+
+E ([a]*)* a (0,1)(0,1)
+#E SAME x (0,0)(0,0)
+E SAME x (0,0)(?,?) RE2/Go
+E SAME aaaaaa (0,6)(0,6)
+E SAME aaaaaax (0,6)(0,6)
+E ([a]*)+ a (0,1)(0,1)
+E SAME x (0,0)(0,0)
+E SAME aaaaaa (0,6)(0,6)
+E SAME aaaaaax (0,6)(0,6)
+E ([^b]*)* a (0,1)(0,1)
+#E SAME b (0,0)(0,0)
+E SAME b (0,0)(?,?) RE2/Go
+E SAME aaaaaa (0,6)(0,6)
+E SAME aaaaaab (0,6)(0,6)
+E ([ab]*)* a (0,1)(0,1)
+E SAME aaaaaa (0,6)(0,6)
+E SAME ababab (0,6)(0,6)
+E SAME bababa (0,6)(0,6)
+E SAME b (0,1)(0,1)
+E SAME bbbbbb (0,6)(0,6)
+E SAME aaaabcde (0,5)(0,5)
+E ([^a]*)* b (0,1)(0,1)
+E SAME bbbbbb (0,6)(0,6)
+#E SAME aaaaaa (0,0)(0,0)
+E SAME aaaaaa (0,0)(?,?) RE2/Go
+E ([^ab]*)* ccccxx (0,6)(0,6)
+#E SAME ababab (0,0)(0,0)
+E SAME ababab (0,0)(?,?) RE2/Go
+
+E ((z)+|a)* zabcde (0,2)(1,2)
+
+#{E a+? aaaaaa (0,1) no *? +? minimal match ops
+#E (a) aaa (0,1)(0,1)
+#E (a*?) aaa (0,0)(0,0)
+#E (a)*? aaa (0,0)
+#E (a*?)*? aaa (0,0)
+#}
+
+B \(a*\)*\(x\) x (0,1)(0,0)(0,1)
+B \(a*\)*\(x\) ax (0,2)(0,1)(1,2)
+B \(a*\)*\(x\) axa (0,2)(0,1)(1,2)
+B \(a*\)*\(x\)\(\1\) x (0,1)(0,0)(0,1)(1,1)
+B \(a*\)*\(x\)\(\1\) ax (0,2)(1,1)(1,2)(2,2)
+B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3)
+B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4)
+B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3)
+
+#E (a*)*(x) x (0,1)(0,0)(0,1)
+E (a*)*(x) x (0,1)(?,?)(0,1) RE2/Go
+E (a*)*(x) ax (0,2)(0,1)(1,2)
+E (a*)*(x) axa (0,2)(0,1)(1,2)
+
+E (a*)+(x) x (0,1)(0,0)(0,1)
+E (a*)+(x) ax (0,2)(0,1)(1,2)
+E (a*)+(x) axa (0,2)(0,1)(1,2)
+
+E (a*){2}(x) x (0,1)(0,0)(0,1)
+E (a*){2}(x) ax (0,2)(1,1)(1,2)
+E (a*){2}(x) axa (0,2)(1,1)(1,2)
diff --git a/data/tests/fowler/nullsubexpr.toml b/data/tests/fowler/nullsubexpr.toml
new file mode 100644
index 0000000..331067c
--- /dev/null
+++ b/data/tests/fowler/nullsubexpr.toml
@@ -0,0 +1,350 @@
+[[tests]]
+name = "nullsubexpr3"
+options = ['escaped']
+pattern = '''(a*)*'''
+input = '''a'''
+matches = [[0, 1]]
+
+[[tests]]
+name = "nullsubexpr5"
+options = ['escaped']
+pattern = '''(a*)*'''
+input = '''x'''
+matches = [[0, 0]]
+
+[[tests]]
+name = "nullsubexpr6"
+options = ['escaped']
+pattern = '''(a*)*'''
+input = '''aaaaaa'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "nullsubexpr7"
+options = ['escaped']
+pattern = '''(a*)*'''
+input = '''aaaaaax'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "nullsubexpr8"
+options = ['escaped']
+pattern = '''(a*)+'''
+input = '''a'''
+matches = [[0, 1]]
+
+[[tests]]
+name = "nullsubexpr9"
+options = ['escaped']
+pattern = '''(a*)+'''
+input = '''x'''
+matches = [[0, 0]]
+
+[[tests]]
+name = "nullsubexpr10"
+options = ['escaped']
+pattern = '''(a*)+'''
+input = '''aaaaaa'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "nullsubexpr11"
+options = ['escaped']
+pattern = '''(a*)+'''
+input = '''aaaaaax'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "nullsubexpr12"
+options = ['escaped']
+pattern = '''(a+)*'''
+input = '''a'''
+matches = [[0, 1]]
+
+[[tests]]
+name = "nullsubexpr13"
+options = ['escaped']
+pattern = '''(a+)*'''
+input = '''x'''
+matches = [[0, 0]]
+
+[[tests]]
+name = "nullsubexpr14"
+options = ['escaped']
+pattern = '''(a+)*'''
+input = '''aaaaaa'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "nullsubexpr15"
+options = ['escaped']
+pattern = '''(a+)*'''
+input = '''aaaaaax'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "nullsubexpr16"
+options = ['escaped']
+pattern = '''(a+)+'''
+input = '''a'''
+matches = [[0, 1]]
+
+[[tests]]
+name = "nullsubexpr17"
+options = ['escaped']
+pattern = '''(a+)+'''
+input = '''x'''
+matches = []
+
+[[tests]]
+name = "nullsubexpr18"
+options = ['escaped']
+pattern = '''(a+)+'''
+input = '''aaaaaa'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "nullsubexpr19"
+options = ['escaped']
+pattern = '''(a+)+'''
+input = '''aaaaaax'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "nullsubexpr21"
+options = ['escaped']
+pattern = '''([a]*)*'''
+input = '''a'''
+matches = [[0, 1]]
+
+[[tests]]
+name = "nullsubexpr23"
+options = ['escaped']
+pattern = '''([a]*)*'''
+input = '''x'''
+matches = [[0, 0]]
+
+[[tests]]
+name = "nullsubexpr24"
+options = ['escaped']
+pattern = '''([a]*)*'''
+input = '''aaaaaa'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "nullsubexpr25"
+options = ['escaped']
+pattern = '''([a]*)*'''
+input = '''aaaaaax'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "nullsubexpr26"
+options = ['escaped']
+pattern = '''([a]*)+'''
+input = '''a'''
+matches = [[0, 1]]
+
+[[tests]]
+name = "nullsubexpr27"
+options = ['escaped']
+pattern = '''([a]*)+'''
+input = '''x'''
+matches = [[0, 0]]
+
+[[tests]]
+name = "nullsubexpr28"
+options = ['escaped']
+pattern = '''([a]*)+'''
+input = '''aaaaaa'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "nullsubexpr29"
+options = ['escaped']
+pattern = '''([a]*)+'''
+input = '''aaaaaax'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "nullsubexpr30"
+options = ['escaped']
+pattern = '''([^b]*)*'''
+input = '''a'''
+matches = [[0, 1]]
+
+[[tests]]
+name = "nullsubexpr32"
+options = ['escaped']
+pattern = '''([^b]*)*'''
+input = '''b'''
+matches = [[0, 0]]
+
+[[tests]]
+name = "nullsubexpr33"
+options = ['escaped']
+pattern = '''([^b]*)*'''
+input = '''aaaaaa'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "nullsubexpr34"
+options = ['escaped']
+pattern = '''([^b]*)*'''
+input = '''aaaaaab'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "nullsubexpr35"
+options = ['escaped']
+pattern = '''([ab]*)*'''
+input = '''a'''
+matches = [[0, 1]]
+
+[[tests]]
+name = "nullsubexpr36"
+options = ['escaped']
+pattern = '''([ab]*)*'''
+input = '''aaaaaa'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "nullsubexpr37"
+options = ['escaped']
+pattern = '''([ab]*)*'''
+input = '''ababab'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "nullsubexpr38"
+options = ['escaped']
+pattern = '''([ab]*)*'''
+input = '''bababa'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "nullsubexpr39"
+options = ['escaped']
+pattern = '''([ab]*)*'''
+input = '''b'''
+matches = [[0, 1]]
+
+[[tests]]
+name = "nullsubexpr40"
+options = ['escaped']
+pattern = '''([ab]*)*'''
+input = '''bbbbbb'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "nullsubexpr41"
+options = ['escaped']
+pattern = '''([ab]*)*'''
+input = '''aaaabcde'''
+matches = [[0, 5]]
+
+[[tests]]
+name = "nullsubexpr42"
+options = ['escaped']
+pattern = '''([^a]*)*'''
+input = '''b'''
+matches = [[0, 1]]
+
+[[tests]]
+name = "nullsubexpr43"
+options = ['escaped']
+pattern = '''([^a]*)*'''
+input = '''bbbbbb'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "nullsubexpr45"
+options = ['escaped']
+pattern = '''([^a]*)*'''
+input = '''aaaaaa'''
+matches = [[0, 0]]
+
+[[tests]]
+name = "nullsubexpr46"
+options = ['escaped']
+pattern = '''([^ab]*)*'''
+input = '''ccccxx'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "nullsubexpr48"
+options = ['escaped']
+pattern = '''([^ab]*)*'''
+input = '''ababab'''
+matches = [[0, 0]]
+
+[[tests]]
+name = "nullsubexpr50"
+options = ['escaped']
+pattern = '''((z)+|a)*'''
+input = '''zabcde'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "nullsubexpr69"
+options = ['escaped']
+pattern = '''(a*)*(x)'''
+input = '''x'''
+matches = [[0, 1]]
+
+[[tests]]
+name = "nullsubexpr70"
+options = ['escaped']
+pattern = '''(a*)*(x)'''
+input = '''ax'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "nullsubexpr71"
+options = ['escaped']
+pattern = '''(a*)*(x)'''
+input = '''axa'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "nullsubexpr73"
+options = ['escaped']
+pattern = '''(a*)+(x)'''
+input = '''x'''
+matches = [[0, 1]]
+
+[[tests]]
+name = "nullsubexpr74"
+options = ['escaped']
+pattern = '''(a*)+(x)'''
+input = '''ax'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "nullsubexpr75"
+options = ['escaped']
+pattern = '''(a*)+(x)'''
+input = '''axa'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "nullsubexpr77"
+options = ['escaped']
+pattern = '''(a*){2}(x)'''
+input = '''x'''
+matches = [[0, 1]]
+
+[[tests]]
+name = "nullsubexpr78"
+options = ['escaped']
+pattern = '''(a*){2}(x)'''
+input = '''ax'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "nullsubexpr79"
+options = ['escaped']
+pattern = '''(a*){2}(x)'''
+input = '''axa'''
+matches = [[0, 2]]
+
diff --git a/data/tests/fowler/repetition-long.dat b/data/tests/fowler/repetition-long.dat
new file mode 100644
index 0000000..c915802
--- /dev/null
+++ b/data/tests/fowler/repetition-long.dat
@@ -0,0 +1,85 @@
+NOTE implicit vs. explicit repetitions : 2009-02-02
+
+# Glenn Fowler <gsf@research.att.com>
+# conforming matches (column 4) must match one of the following BREs
+# NOMATCH
+# (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)*
+# (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)*
+# i.e., each 3-tuple has two identical elements and one (?,?)
+
+NOTE additional repetition tests graciously provided by Chris Kuklewicz www.haskell.org 2009-02-02
+
+:HA#100:E X(.?){0,}Y X1234567Y (0,9)(7,8)
+:HA#101:E X(.?){1,}Y X1234567Y (0,9)(7,8)
+:HA#102:E X(.?){2,}Y X1234567Y (0,9)(7,8)
+:HA#103:E X(.?){3,}Y X1234567Y (0,9)(7,8)
+:HA#104:E X(.?){4,}Y X1234567Y (0,9)(7,8)
+:HA#105:E X(.?){5,}Y X1234567Y (0,9)(7,8)
+:HA#106:E X(.?){6,}Y X1234567Y (0,9)(7,8)
+:HA#107:E X(.?){7,}Y X1234567Y (0,9)(7,8)
+:HA#108:E X(.?){8,}Y X1234567Y (0,9)(8,8)
+#:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(7,8)
+:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(8,8) RE2/Go
+#:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(7,8)
+:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(8,8) RE2/Go
+#:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(7,8)
+:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(8,8) RE2/Go
+#:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(7,8)
+:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(8,8) RE2/Go
+#:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(7,8)
+:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(8,8) RE2/Go
+#:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(7,8)
+:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(8,8) RE2/Go
+#:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(7,8)
+:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(8,8) RE2/Go
+#:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(7,8)
+:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(8,8) RE2/Go
+:HA#118:E X(.?){8,8}Y X1234567Y (0,9)(8,8)
+
+# These test a fixed bug in my regex-tdfa that did not keep the expanded
+# form properly grouped, so right association did the wrong thing with
+# these ambiguous patterns (crafted just to test my code when I became
+# suspicious of my implementation). The first subexpression should use
+# "ab" then "a" then "bcd".
+
+# OS X / FreeBSD / NetBSD badly fail many of these, with impossible
+# results like (0,6)(4,5)(6,6).
+
+:HA#260:E (a|ab|c|bcd){0,}(d*) ababcd (0,1)(0,1)(1,1)
+:HA#261:E (a|ab|c|bcd){1,}(d*) ababcd (0,1)(0,1)(1,1)
+:HA#262:E (a|ab|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#263:E (a|ab|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#264:E (a|ab|c|bcd){4,}(d*) ababcd NOMATCH
+:HA#265:E (a|ab|c|bcd){0,10}(d*) ababcd (0,1)(0,1)(1,1)
+:HA#266:E (a|ab|c|bcd){1,10}(d*) ababcd (0,1)(0,1)(1,1)
+:HA#267:E (a|ab|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#268:E (a|ab|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#269:E (a|ab|c|bcd){4,10}(d*) ababcd NOMATCH
+:HA#270:E (a|ab|c|bcd)*(d*) ababcd (0,1)(0,1)(1,1)
+:HA#271:E (a|ab|c|bcd)+(d*) ababcd (0,1)(0,1)(1,1)
+
+# The above worked on Linux/GLIBC but the following often fail.
+# They also trip up OS X / FreeBSD / NetBSD:
+
+#:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
+#:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
+#:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
+#:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
+:HA#284:E (ab|a|c|bcd){4,}(d*) ababcd NOMATCH
+#:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
+#:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
+#:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
+#:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
+:HA#289:E (ab|a|c|bcd){4,10}(d*) ababcd NOMATCH
+#:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6)
+:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
+#:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6)
+:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
diff --git a/data/tests/fowler/repetition-long.toml b/data/tests/fowler/repetition-long.toml
new file mode 100644
index 0000000..e0b2ea7
--- /dev/null
+++ b/data/tests/fowler/repetition-long.toml
@@ -0,0 +1,294 @@
+[[tests]]
+name = "repetition-long12"
+options = ['escaped']
+pattern = '''X(.?){0,}Y'''
+input = '''X1234567Y'''
+matches = [[0, 9]]
+
+[[tests]]
+name = "repetition-long13"
+options = ['escaped']
+pattern = '''X(.?){1,}Y'''
+input = '''X1234567Y'''
+matches = [[0, 9]]
+
+[[tests]]
+name = "repetition-long14"
+options = ['escaped']
+pattern = '''X(.?){2,}Y'''
+input = '''X1234567Y'''
+matches = [[0, 9]]
+
+[[tests]]
+name = "repetition-long15"
+options = ['escaped']
+pattern = '''X(.?){3,}Y'''
+input = '''X1234567Y'''
+matches = [[0, 9]]
+
+[[tests]]
+name = "repetition-long16"
+options = ['escaped']
+pattern = '''X(.?){4,}Y'''
+input = '''X1234567Y'''
+matches = [[0, 9]]
+
+[[tests]]
+name = "repetition-long17"
+options = ['escaped']
+pattern = '''X(.?){5,}Y'''
+input = '''X1234567Y'''
+matches = [[0, 9]]
+
+[[tests]]
+name = "repetition-long18"
+options = ['escaped']
+pattern = '''X(.?){6,}Y'''
+input = '''X1234567Y'''
+matches = [[0, 9]]
+
+[[tests]]
+name = "repetition-long19"
+options = ['escaped']
+pattern = '''X(.?){7,}Y'''
+input = '''X1234567Y'''
+matches = [[0, 9]]
+
+[[tests]]
+name = "repetition-long20"
+options = ['escaped']
+pattern = '''X(.?){8,}Y'''
+input = '''X1234567Y'''
+matches = [[0, 9]]
+
+[[tests]]
+name = "repetition-long22"
+options = ['escaped']
+pattern = '''X(.?){0,8}Y'''
+input = '''X1234567Y'''
+matches = [[0, 9]]
+
+[[tests]]
+name = "repetition-long24"
+options = ['escaped']
+pattern = '''X(.?){1,8}Y'''
+input = '''X1234567Y'''
+matches = [[0, 9]]
+
+[[tests]]
+name = "repetition-long26"
+options = ['escaped']
+pattern = '''X(.?){2,8}Y'''
+input = '''X1234567Y'''
+matches = [[0, 9]]
+
+[[tests]]
+name = "repetition-long28"
+options = ['escaped']
+pattern = '''X(.?){3,8}Y'''
+input = '''X1234567Y'''
+matches = [[0, 9]]
+
+[[tests]]
+name = "repetition-long30"
+options = ['escaped']
+pattern = '''X(.?){4,8}Y'''
+input = '''X1234567Y'''
+matches = [[0, 9]]
+
+[[tests]]
+name = "repetition-long32"
+options = ['escaped']
+pattern = '''X(.?){5,8}Y'''
+input = '''X1234567Y'''
+matches = [[0, 9]]
+
+[[tests]]
+name = "repetition-long34"
+options = ['escaped']
+pattern = '''X(.?){6,8}Y'''
+input = '''X1234567Y'''
+matches = [[0, 9]]
+
+[[tests]]
+name = "repetition-long36"
+options = ['escaped']
+pattern = '''X(.?){7,8}Y'''
+input = '''X1234567Y'''
+matches = [[0, 9]]
+
+[[tests]]
+name = "repetition-long37"
+options = ['escaped']
+pattern = '''X(.?){8,8}Y'''
+input = '''X1234567Y'''
+matches = [[0, 9]]
+
+[[tests]]
+name = "repetition-long48"
+options = ['escaped']
+pattern = '''(a|ab|c|bcd){0,}(d*)'''
+input = '''ababcd'''
+matches = [[0, 1]]
+
+[[tests]]
+name = "repetition-long49"
+options = ['escaped']
+pattern = '''(a|ab|c|bcd){1,}(d*)'''
+input = '''ababcd'''
+matches = [[0, 1]]
+
+[[tests]]
+name = "repetition-long50"
+options = ['escaped']
+pattern = '''(a|ab|c|bcd){2,}(d*)'''
+input = '''ababcd'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "repetition-long51"
+options = ['escaped']
+pattern = '''(a|ab|c|bcd){3,}(d*)'''
+input = '''ababcd'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "repetition-long52"
+options = ['escaped']
+pattern = '''(a|ab|c|bcd){4,}(d*)'''
+input = '''ababcd'''
+matches = []
+
+[[tests]]
+name = "repetition-long53"
+options = ['escaped']
+pattern = '''(a|ab|c|bcd){0,10}(d*)'''
+input = '''ababcd'''
+matches = [[0, 1]]
+
+[[tests]]
+name = "repetition-long54"
+options = ['escaped']
+pattern = '''(a|ab|c|bcd){1,10}(d*)'''
+input = '''ababcd'''
+matches = [[0, 1]]
+
+[[tests]]
+name = "repetition-long55"
+options = ['escaped']
+pattern = '''(a|ab|c|bcd){2,10}(d*)'''
+input = '''ababcd'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "repetition-long56"
+options = ['escaped']
+pattern = '''(a|ab|c|bcd){3,10}(d*)'''
+input = '''ababcd'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "repetition-long57"
+options = ['escaped']
+pattern = '''(a|ab|c|bcd){4,10}(d*)'''
+input = '''ababcd'''
+matches = []
+
+[[tests]]
+name = "repetition-long58"
+options = ['escaped']
+pattern = '''(a|ab|c|bcd)*(d*)'''
+input = '''ababcd'''
+matches = [[0, 1]]
+
+[[tests]]
+name = "repetition-long59"
+options = ['escaped']
+pattern = '''(a|ab|c|bcd)+(d*)'''
+input = '''ababcd'''
+matches = [[0, 1]]
+
+[[tests]]
+name = "repetition-long65"
+options = ['escaped']
+pattern = '''(ab|a|c|bcd){0,}(d*)'''
+input = '''ababcd'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "repetition-long67"
+options = ['escaped']
+pattern = '''(ab|a|c|bcd){1,}(d*)'''
+input = '''ababcd'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "repetition-long69"
+options = ['escaped']
+pattern = '''(ab|a|c|bcd){2,}(d*)'''
+input = '''ababcd'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "repetition-long71"
+options = ['escaped']
+pattern = '''(ab|a|c|bcd){3,}(d*)'''
+input = '''ababcd'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "repetition-long72"
+options = ['escaped']
+pattern = '''(ab|a|c|bcd){4,}(d*)'''
+input = '''ababcd'''
+matches = []
+
+[[tests]]
+name = "repetition-long74"
+options = ['escaped']
+pattern = '''(ab|a|c|bcd){0,10}(d*)'''
+input = '''ababcd'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "repetition-long76"
+options = ['escaped']
+pattern = '''(ab|a|c|bcd){1,10}(d*)'''
+input = '''ababcd'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "repetition-long78"
+options = ['escaped']
+pattern = '''(ab|a|c|bcd){2,10}(d*)'''
+input = '''ababcd'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "repetition-long80"
+options = ['escaped']
+pattern = '''(ab|a|c|bcd){3,10}(d*)'''
+input = '''ababcd'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "repetition-long81"
+options = ['escaped']
+pattern = '''(ab|a|c|bcd){4,10}(d*)'''
+input = '''ababcd'''
+matches = []
+
+[[tests]]
+name = "repetition-long83"
+options = ['escaped']
+pattern = '''(ab|a|c|bcd)*(d*)'''
+input = '''ababcd'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "repetition-long85"
+options = ['escaped']
+pattern = '''(ab|a|c|bcd)+(d*)'''
+input = '''ababcd'''
+matches = [[0, 6]]
+
diff --git a/data/tests/fowler/repetition.dat b/data/tests/fowler/repetition.dat
new file mode 100644
index 0000000..2dac082
--- /dev/null
+++ b/data/tests/fowler/repetition.dat
@@ -0,0 +1,83 @@
+NOTE implicit vs. explicit repetitions : 2009-02-02
+
+# Glenn Fowler <gsf@research.att.com>
+# conforming matches (column 4) must match one of the following BREs
+# NOMATCH
+# (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)*
+# (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)*
+# i.e., each 3-tuple has two identical elements and one (?,?)
+
+E ((..)|(.)) NULL NOMATCH
+E ((..)|(.))((..)|(.)) NULL NOMATCH
+E ((..)|(.))((..)|(.))((..)|(.)) NULL NOMATCH
+
+E ((..)|(.)){1} NULL NOMATCH
+E ((..)|(.)){2} NULL NOMATCH
+E ((..)|(.)){3} NULL NOMATCH
+
+E ((..)|(.))* NULL (0,0)
+
+E ((..)|(.)) a (0,1)(0,1)(?,?)(0,1)
+E ((..)|(.))((..)|(.)) a NOMATCH
+E ((..)|(.))((..)|(.))((..)|(.)) a NOMATCH
+
+E ((..)|(.)){1} a (0,1)(0,1)(?,?)(0,1)
+E ((..)|(.)){2} a NOMATCH
+E ((..)|(.)){3} a NOMATCH
+
+E ((..)|(.))* a (0,1)(0,1)(?,?)(0,1)
+
+E ((..)|(.)) aa (0,2)(0,2)(0,2)(?,?)
+E ((..)|(.))((..)|(.)) aa (0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)
+E ((..)|(.))((..)|(.))((..)|(.)) aa NOMATCH
+
+E ((..)|(.)){1} aa (0,2)(0,2)(0,2)(?,?)
+E ((..)|(.)){2} aa (0,2)(1,2)(?,?)(1,2)
+E ((..)|(.)){3} aa NOMATCH
+
+E ((..)|(.))* aa (0,2)(0,2)(0,2)(?,?)
+
+E ((..)|(.)) aaa (0,2)(0,2)(0,2)(?,?)
+E ((..)|(.))((..)|(.)) aaa (0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)
+E ((..)|(.))((..)|(.))((..)|(.)) aaa (0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3)
+
+E ((..)|(.)){1} aaa (0,2)(0,2)(0,2)(?,?)
+#E ((..)|(.)){2} aaa (0,3)(2,3)(?,?)(2,3)
+E ((..)|(.)){2} aaa (0,3)(2,3)(0,2)(2,3) RE2/Go
+E ((..)|(.)){3} aaa (0,3)(2,3)(?,?)(2,3)
+
+#E ((..)|(.))* aaa (0,3)(2,3)(?,?)(2,3)
+E ((..)|(.))* aaa (0,3)(2,3)(0,2)(2,3) RE2/Go
+
+E ((..)|(.)) aaaa (0,2)(0,2)(0,2)(?,?)
+E ((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
+E ((..)|(.))((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4)
+
+E ((..)|(.)){1} aaaa (0,2)(0,2)(0,2)(?,?)
+E ((..)|(.)){2} aaaa (0,4)(2,4)(2,4)(?,?)
+#E ((..)|(.)){3} aaaa (0,4)(3,4)(?,?)(3,4)
+E ((..)|(.)){3} aaaa (0,4)(3,4)(0,2)(3,4) RE2/Go
+
+E ((..)|(.))* aaaa (0,4)(2,4)(2,4)(?,?)
+
+E ((..)|(.)) aaaaa (0,2)(0,2)(0,2)(?,?)
+E ((..)|(.))((..)|(.)) aaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
+E ((..)|(.))((..)|(.))((..)|(.)) aaaaa (0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5)
+
+E ((..)|(.)){1} aaaaa (0,2)(0,2)(0,2)(?,?)
+E ((..)|(.)){2} aaaaa (0,4)(2,4)(2,4)(?,?)
+#E ((..)|(.)){3} aaaaa (0,5)(4,5)(?,?)(4,5)
+E ((..)|(.)){3} aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go
+
+#E ((..)|(.))* aaaaa (0,5)(4,5)(?,?)(4,5)
+E ((..)|(.))* aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go
+
+E ((..)|(.)) aaaaaa (0,2)(0,2)(0,2)(?,?)
+E ((..)|(.))((..)|(.)) aaaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
+E ((..)|(.))((..)|(.))((..)|(.)) aaaaaa (0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?)
+
+E ((..)|(.)){1} aaaaaa (0,2)(0,2)(0,2)(?,?)
+E ((..)|(.)){2} aaaaaa (0,4)(2,4)(2,4)(?,?)
+E ((..)|(.)){3} aaaaaa (0,6)(4,6)(4,6)(?,?)
+
+E ((..)|(.))* aaaaaa (0,6)(4,6)(4,6)(?,?)
diff --git a/data/tests/fowler/repetition.toml b/data/tests/fowler/repetition.toml
new file mode 100644
index 0000000..43280a4
--- /dev/null
+++ b/data/tests/fowler/repetition.toml
@@ -0,0 +1,343 @@
+[[tests]]
+name = "repetition10"
+options = ['escaped']
+pattern = '''((..)|(.))'''
+input = ''''''
+matches = []
+
+[[tests]]
+name = "repetition11"
+options = ['escaped']
+pattern = '''((..)|(.))((..)|(.))'''
+input = ''''''
+matches = []
+
+[[tests]]
+name = "repetition12"
+options = ['escaped']
+pattern = '''((..)|(.))((..)|(.))((..)|(.))'''
+input = ''''''
+matches = []
+
+[[tests]]
+name = "repetition14"
+options = ['escaped']
+pattern = '''((..)|(.)){1}'''
+input = ''''''
+matches = []
+
+[[tests]]
+name = "repetition15"
+options = ['escaped']
+pattern = '''((..)|(.)){2}'''
+input = ''''''
+matches = []
+
+[[tests]]
+name = "repetition16"
+options = ['escaped']
+pattern = '''((..)|(.)){3}'''
+input = ''''''
+matches = []
+
+[[tests]]
+name = "repetition18"
+options = ['escaped']
+pattern = '''((..)|(.))*'''
+input = ''''''
+matches = [[0, 0]]
+
+[[tests]]
+name = "repetition20"
+options = ['escaped']
+pattern = '''((..)|(.))'''
+input = '''a'''
+matches = [[0, 1]]
+
+[[tests]]
+name = "repetition21"
+options = ['escaped']
+pattern = '''((..)|(.))((..)|(.))'''
+input = '''a'''
+matches = []
+
+[[tests]]
+name = "repetition22"
+options = ['escaped']
+pattern = '''((..)|(.))((..)|(.))((..)|(.))'''
+input = '''a'''
+matches = []
+
+[[tests]]
+name = "repetition24"
+options = ['escaped']
+pattern = '''((..)|(.)){1}'''
+input = '''a'''
+matches = [[0, 1]]
+
+[[tests]]
+name = "repetition25"
+options = ['escaped']
+pattern = '''((..)|(.)){2}'''
+input = '''a'''
+matches = []
+
+[[tests]]
+name = "repetition26"
+options = ['escaped']
+pattern = '''((..)|(.)){3}'''
+input = '''a'''
+matches = []
+
+[[tests]]
+name = "repetition28"
+options = ['escaped']
+pattern = '''((..)|(.))*'''
+input = '''a'''
+matches = [[0, 1]]
+
+[[tests]]
+name = "repetition30"
+options = ['escaped']
+pattern = '''((..)|(.))'''
+input = '''aa'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "repetition31"
+options = ['escaped']
+pattern = '''((..)|(.))((..)|(.))'''
+input = '''aa'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "repetition32"
+options = ['escaped']
+pattern = '''((..)|(.))((..)|(.))((..)|(.))'''
+input = '''aa'''
+matches = []
+
+[[tests]]
+name = "repetition34"
+options = ['escaped']
+pattern = '''((..)|(.)){1}'''
+input = '''aa'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "repetition35"
+options = ['escaped']
+pattern = '''((..)|(.)){2}'''
+input = '''aa'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "repetition36"
+options = ['escaped']
+pattern = '''((..)|(.)){3}'''
+input = '''aa'''
+matches = []
+
+[[tests]]
+name = "repetition38"
+options = ['escaped']
+pattern = '''((..)|(.))*'''
+input = '''aa'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "repetition40"
+options = ['escaped']
+pattern = '''((..)|(.))'''
+input = '''aaa'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "repetition41"
+options = ['escaped']
+pattern = '''((..)|(.))((..)|(.))'''
+input = '''aaa'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "repetition42"
+options = ['escaped']
+pattern = '''((..)|(.))((..)|(.))((..)|(.))'''
+input = '''aaa'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "repetition44"
+options = ['escaped']
+pattern = '''((..)|(.)){1}'''
+input = '''aaa'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "repetition46"
+options = ['escaped']
+pattern = '''((..)|(.)){2}'''
+input = '''aaa'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "repetition47"
+options = ['escaped']
+pattern = '''((..)|(.)){3}'''
+input = '''aaa'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "repetition50"
+options = ['escaped']
+pattern = '''((..)|(.))*'''
+input = '''aaa'''
+matches = [[0, 3]]
+
+[[tests]]
+name = "repetition52"
+options = ['escaped']
+pattern = '''((..)|(.))'''
+input = '''aaaa'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "repetition53"
+options = ['escaped']
+pattern = '''((..)|(.))((..)|(.))'''
+input = '''aaaa'''
+matches = [[0, 4]]
+
+[[tests]]
+name = "repetition54"
+options = ['escaped']
+pattern = '''((..)|(.))((..)|(.))((..)|(.))'''
+input = '''aaaa'''
+matches = [[0, 4]]
+
+[[tests]]
+name = "repetition56"
+options = ['escaped']
+pattern = '''((..)|(.)){1}'''
+input = '''aaaa'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "repetition57"
+options = ['escaped']
+pattern = '''((..)|(.)){2}'''
+input = '''aaaa'''
+matches = [[0, 4]]
+
+[[tests]]
+name = "repetition59"
+options = ['escaped']
+pattern = '''((..)|(.)){3}'''
+input = '''aaaa'''
+matches = [[0, 4]]
+
+[[tests]]
+name = "repetition61"
+options = ['escaped']
+pattern = '''((..)|(.))*'''
+input = '''aaaa'''
+matches = [[0, 4]]
+
+[[tests]]
+name = "repetition63"
+options = ['escaped']
+pattern = '''((..)|(.))'''
+input = '''aaaaa'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "repetition64"
+options = ['escaped']
+pattern = '''((..)|(.))((..)|(.))'''
+input = '''aaaaa'''
+matches = [[0, 4]]
+
+[[tests]]
+name = "repetition65"
+options = ['escaped']
+pattern = '''((..)|(.))((..)|(.))((..)|(.))'''
+input = '''aaaaa'''
+matches = [[0, 5]]
+
+[[tests]]
+name = "repetition67"
+options = ['escaped']
+pattern = '''((..)|(.)){1}'''
+input = '''aaaaa'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "repetition68"
+options = ['escaped']
+pattern = '''((..)|(.)){2}'''
+input = '''aaaaa'''
+matches = [[0, 4]]
+
+[[tests]]
+name = "repetition70"
+options = ['escaped']
+pattern = '''((..)|(.)){3}'''
+input = '''aaaaa'''
+matches = [[0, 5]]
+
+[[tests]]
+name = "repetition73"
+options = ['escaped']
+pattern = '''((..)|(.))*'''
+input = '''aaaaa'''
+matches = [[0, 5]]
+
+[[tests]]
+name = "repetition75"
+options = ['escaped']
+pattern = '''((..)|(.))'''
+input = '''aaaaaa'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "repetition76"
+options = ['escaped']
+pattern = '''((..)|(.))((..)|(.))'''
+input = '''aaaaaa'''
+matches = [[0, 4]]
+
+[[tests]]
+name = "repetition77"
+options = ['escaped']
+pattern = '''((..)|(.))((..)|(.))((..)|(.))'''
+input = '''aaaaaa'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "repetition79"
+options = ['escaped']
+pattern = '''((..)|(.)){1}'''
+input = '''aaaaaa'''
+matches = [[0, 2]]
+
+[[tests]]
+name = "repetition80"
+options = ['escaped']
+pattern = '''((..)|(.)){2}'''
+input = '''aaaaaa'''
+matches = [[0, 4]]
+
+[[tests]]
+name = "repetition81"
+options = ['escaped']
+pattern = '''((..)|(.)){3}'''
+input = '''aaaaaa'''
+matches = [[0, 6]]
+
+[[tests]]
+name = "repetition83"
+options = ['escaped']
+pattern = '''((..)|(.))*'''
+input = '''aaaaaa'''
+matches = [[0, 6]]
+
diff --git a/data/tests/iter.toml b/data/tests/iter.toml
new file mode 100644
index 0000000..30abae8
--- /dev/null
+++ b/data/tests/iter.toml
@@ -0,0 +1,92 @@
+[[tests]]
+name = "iter1"
+pattern = "a"
+input = "aaa"
+matches = [[0, 1], [1, 2], [2, 3]]
+
+[[tests]]
+name = "iter2"
+pattern = "a"
+input = "aba"
+matches = [[0, 1], [2, 3]]
+
+[[tests]]
+name = "iter-empty1"
+pattern = ''
+input = ''
+matches = [[0, 0]]
+
+[[tests]]
+name = "iter-empty2"
+pattern = ''
+input = 'abc'
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "iter-empty3"
+pattern = '()'
+input = 'abc'
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "iter-empty4"
+pattern = '()*'
+input = 'abc'
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "iter-empty5"
+pattern = '()+'
+input = 'abc'
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "iter-empty6"
+pattern = '()?'
+input = 'abc'
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "iter-empty7"
+pattern = '()()'
+input = 'abc'
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "iter-empty8"
+pattern = '()+|z'
+input = 'abc'
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "iter-empty9"
+pattern = 'z|()+'
+input = 'abc'
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "iter-empty10"
+pattern = '()+|b'
+input = 'abc'
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "iter-empty11"
+pattern = 'b|()+'
+input = 'abc'
+matches = [[0, 0], [1, 2], [3, 3]]
+
+
+[[tests]]
+options = ["anchored"]
+name = "iter-anchored1"
+pattern = "a"
+input = "a"
+matches = [[0, 1]]
+
+[[tests]]
+options = ["anchored"]
+name = "iter-anchored2"
+pattern = "a"
+input = "aa"
+matches = [[0, 1]]
diff --git a/data/tests/no-unicode.toml b/data/tests/no-unicode.toml
new file mode 100644
index 0000000..16e02b4
--- /dev/null
+++ b/data/tests/no-unicode.toml
@@ -0,0 +1,138 @@
+[[tests]]
+name = "invalid-utf8-literal1"
+options = ["escaped", "invalid-utf8", "no-unicode"]
+pattern = '\xFF'
+input = '\xFF'
+matches = [[0, 1]]
+
+
+[[tests]]
+name = "no-unicode-mixed"
+options = ["escaped", "invalid-utf8"]
+pattern = '(.+)(?-u)(.+)'
+input = '\xCE\x93\xCE\x94\xFF'
+matches = [[0, 5]]
+
+
+[[tests]]
+name = "no-unicode-case1"
+options = ["case-insensitive", "no-unicode"]
+pattern = "a"
+input = "A"
+matches = [[0, 1]]
+
+[[tests]]
+name = "no-unicode-case2"
+options = ["case-insensitive", "no-unicode"]
+pattern = "[a-z]+"
+input = "AaAaA"
+matches = [[0, 5]]
+
+[[tests]]
+name = "no-unicode-case3"
+options = ["case-insensitive"]
+pattern = "[a-z]+"
+input = "aA\u212AaA"
+matches = [[0, 7]]
+
+[[tests]]
+name = "no-unicode-case4"
+options = ["case-insensitive", "no-unicode"]
+pattern = "[a-z]+"
+input = "aA\u212AaA"
+matches = [[0, 2]]
+
+
+[[tests]]
+name = "no-unicode-negate1"
+options = []
+pattern = "[^a]"
+input = "δ"
+matches = [[0, 2]]
+
+[[tests]]
+name = "no-unicode-negate2"
+options = ["no-unicode", "invalid-utf8"]
+pattern = "[^a]"
+input = "δ"
+matches = [[0, 1]]
+
+
+[[tests]]
+name = "no-unicode-dotstar-prefix1"
+options = ["escaped", "no-unicode", "invalid-utf8"]
+pattern = "a"
+input = '\xFFa'
+matches = [[1, 2]]
+
+[[tests]]
+name = "no-unicode-dotstar-prefix2"
+options = ["escaped", "invalid-utf8"]
+pattern = "a"
+input = '\xFFa'
+matches = [[1, 2]]
+
+
+[[tests]]
+name = "no-unicode-null-bytes1"
+options = ["escaped", "no-unicode", "invalid-utf8"]
+pattern = '[^\x00]+\x00'
+input = 'foo\x00'
+matches = [[0, 4]]
+
+
+[[tests]]
+name = "no-unicode1"
+options = ["no-unicode"]
+pattern = '\w+'
+input = "aδ"
+matches = [[0, 1]]
+
+[[tests]]
+name = "no-unicode2"
+options = []
+pattern = '\w+'
+input = "aδ"
+matches = [[0, 3]]
+
+[[tests]]
+name = "no-unicode3"
+options = ["no-unicode"]
+pattern = '\d+'
+input = "1२३9"
+matches = [[0, 1]]
+
+[[tests]]
+name = "no-unicode4"
+pattern = '\d+'
+input = "1२३9"
+matches = [[0, 8]]
+
+[[tests]]
+name = "no-unicode5"
+options = ["no-unicode"]
+pattern = '\s+'
+input = " \u1680"
+matches = [[0, 1]]
+
+[[tests]]
+name = "no-unicode6"
+pattern = '\s+'
+input = " \u1680"
+matches = [[0, 4]]
+
+
+[[tests]]
+# See: https://github.com/rust-lang/regex/issues/484
+name = "no-unicode-iter1"
+pattern = ''
+input = "☃"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+# See: https://github.com/rust-lang/regex/issues/484
+options = ['escaped']
+name = "no-unicode-iter2"
+pattern = ''
+input = 'b\xFFr'
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
diff --git a/data/tests/unicode.toml b/data/tests/unicode.toml
new file mode 100644
index 0000000..845393f
--- /dev/null
+++ b/data/tests/unicode.toml
@@ -0,0 +1,489 @@
+[[tests]]
+name = "unicode-literal1"
+pattern = '☃'
+input = "☃"
+matches = [[0, 3]]
+
+[[tests]]
+name = "unicode-literal2"
+pattern = '☃+'
+input = "☃"
+matches = [[0, 3]]
+
+[[tests]]
+name = "unicode-literal3"
+options = ["case-insensitive"]
+pattern = '☃+'
+input = "☃"
+matches = [[0, 3]]
+
+[[tests]]
+name = "unicode-literal4"
+options = ["case-insensitive"]
+pattern = 'Δ'
+input = "δ"
+matches = [[0, 2]]
+
+
+[[tests]]
+name = "unicode-class1"
+pattern = '[☃Ⅰ]+'
+input = "☃"
+matches = [[0, 3]]
+
+[[tests]]
+name = "unicode-class2"
+pattern = '\pN'
+input = "Ⅰ"
+matches = [[0, 3]]
+
+[[tests]]
+name = "unicode-class3"
+pattern = '\pN+'
+input = "Ⅰ1Ⅱ2"
+matches = [[0, 8]]
+
+[[tests]]
+name = "unicode-class4"
+pattern = '\PN+'
+input = "abⅠ"
+matches = [[0, 2]]
+
+[[tests]]
+name = "unicode-class5"
+pattern = '[\PN]+'
+input = "abⅠ"
+matches = [[0, 2]]
+
+[[tests]]
+name = "unicode-class6"
+pattern = '[^\PN]+'
+input = "abⅠ"
+matches = [[2, 5]]
+
+[[tests]]
+name = "unicode-class7"
+pattern = '\p{Lu}+'
+input = "ΛΘΓΔα"
+matches = [[0, 8]]
+
+[[tests]]
+name = "unicode-class8"
+options = ["case-insensitive"]
+pattern = '\p{Lu}+'
+input = "ΛΘΓΔα"
+matches = [[0, 10]]
+
+[[tests]]
+name = "unicode-class9"
+pattern = '\pL+'
+input = "ΛΘΓΔα"
+matches = [[0, 10]]
+
+[[tests]]
+name = "unicode-class10"
+pattern = '\p{Ll}+'
+input = "ΛΘΓΔα"
+matches = [[8, 10]]
+
+
+[[tests]]
+name = "unicode-perl1"
+pattern = '\w+'
+input = "dδd"
+matches = [[0, 4]]
+
+[[tests]]
+name = "unicode-perl2"
+pattern = '\w+'
+input = "⥡"
+matches = []
+
+[[tests]]
+name = "unicode-perl3"
+pattern = '\W+'
+input = "⥡"
+matches = [[0, 3]]
+
+[[tests]]
+name = "unicode-perl4"
+pattern = '\d+'
+input = "1२३9"
+matches = [[0, 8]]
+
+[[tests]]
+name = "unicode-perl5"
+pattern = '\d+'
+input = "Ⅱ"
+matches = []
+
+[[tests]]
+name = "unicode-perl6"
+pattern = '\D+'
+input = "Ⅱ"
+matches = [[0, 3]]
+
+[[tests]]
+name = "unicode-perl7"
+pattern = '\s+'
+input = " "
+matches = [[0, 3]]
+
+[[tests]]
+name = "unicode-perl8"
+pattern = '\s+'
+input = "☃"
+matches = []
+
+[[tests]]
+name = "unicode-perl9"
+pattern = '\S+'
+input = "☃"
+matches = [[0, 3]]
+
+
+[[tests]]
+name = "unicode-class-gencat1"
+pattern = '\p{Cased_Letter}'
+input = "A"
+matches = [[0, 3]]
+
+[[tests]]
+name = "unicode-class-gencat2"
+pattern = '\p{Close_Punctuation}'
+input = "❯"
+matches = [[0, 3]]
+
+[[tests]]
+name = "unicode-class-gencat3"
+pattern = '\p{Connector_Punctuation}'
+input = "⁀"
+matches = [[0, 3]]
+
+[[tests]]
+name = "unicode-class-gencat4"
+pattern = '\p{Control}'
+input = "\u009F"
+matches = [[0, 2]]
+
+[[tests]]
+name = "unicode-class-gencat5"
+pattern = '\p{Currency_Symbol}'
+input = "£"
+matches = [[0, 3]]
+
+[[tests]]
+name = "unicode-class-gencat6"
+pattern = '\p{Dash_Punctuation}'
+input = "〰"
+matches = [[0, 3]]
+
+[[tests]]
+name = "unicode-class-gencat7"
+pattern = '\p{Decimal_Number}'
+input = "𑓙"
+matches = [[0, 4]]
+
+[[tests]]
+name = "unicode-class-gencat8"
+pattern = '\p{Enclosing_Mark}'
+input = "\uA672"
+matches = [[0, 3]]
+
+[[tests]]
+name = "unicode-class-gencat9"
+pattern = '\p{Final_Punctuation}'
+input = "⸡"
+matches = [[0, 3]]
+
+[[tests]]
+name = "unicode-class-gencat10"
+pattern = '\p{Format}'
+input = "\U000E007F"
+matches = [[0, 4]]
+
+[[tests]]
+name = "unicode-class-gencat11"
+pattern = '\p{Initial_Punctuation}'
+input = "⸜"
+matches = [[0, 3]]
+
+[[tests]]
+name = "unicode-class-gencat12"
+pattern = '\p{Letter}'
+input = "Έ"
+matches = [[0, 2]]
+
+[[tests]]
+name = "unicode-class-gencat13"
+pattern = '\p{Letter_Number}'
+input = "ↂ"
+matches = [[0, 3]]
+
+[[tests]]
+name = "unicode-class-gencat14"
+pattern = '\p{Line_Separator}'
+input = "\u2028"
+matches = [[0, 3]]
+
+[[tests]]
+name = "unicode-class-gencat15"
+pattern = '\p{Lowercase_Letter}'
+input = "ϛ"
+matches = [[0, 2]]
+
+[[tests]]
+name = "unicode-class-gencat16"
+pattern = '\p{Mark}'
+input = "\U000E01EF"
+matches = [[0, 4]]
+
+[[tests]]
+name = "unicode-class-gencat17"
+pattern = '\p{Math}'
+input = "⋿"
+matches = [[0, 3]]
+
+[[tests]]
+name = "unicode-class-gencat18"
+pattern = '\p{Modifier_Letter}'
+input = "𖭃"
+matches = [[0, 4]]
+
+[[tests]]
+name = "unicode-class-gencat19"
+pattern = '\p{Modifier_Symbol}'
+input = "🏿"
+matches = [[0, 4]]
+
+[[tests]]
+name = "unicode-class-gencat20"
+pattern = '\p{Nonspacing_Mark}'
+input = "\U0001E94A"
+matches = [[0, 4]]
+
+[[tests]]
+name = "unicode-class-gencat21"
+pattern = '\p{Number}'
+input = "⓿"
+matches = [[0, 3]]
+
+[[tests]]
+name = "unicode-class-gencat22"
+pattern = '\p{Open_Punctuation}'
+input = "⦅"
+matches = [[0, 3]]
+
+[[tests]]
+name = "unicode-class-gencat23"
+pattern = '\p{Other}'
+input = "\u0BC9"
+matches = [[0, 3]]
+
+[[tests]]
+name = "unicode-class-gencat24"
+pattern = '\p{Other_Letter}'
+input = "ꓷ"
+matches = [[0, 3]]
+
+[[tests]]
+name = "unicode-class-gencat25"
+pattern = '\p{Other_Number}'
+input = "㉏"
+matches = [[0, 3]]
+
+[[tests]]
+name = "unicode-class-gencat26"
+pattern = '\p{Other_Punctuation}'
+input = "𞥞"
+matches = [[0, 4]]
+
+[[tests]]
+name = "unicode-class-gencat27"
+pattern = '\p{Other_Symbol}'
+input = "⅌"
+matches = [[0, 3]]
+
+[[tests]]
+name = "unicode-class-gencat28"
+pattern = '\p{Paragraph_Separator}'
+input = "\u2029"
+matches = [[0, 3]]
+
+[[tests]]
+name = "unicode-class-gencat29"
+pattern = '\p{Private_Use}'
+input = "\U0010FFFD"
+matches = [[0, 4]]
+
+[[tests]]
+name = "unicode-class-gencat30"
+pattern = '\p{Punctuation}'
+input = "𑁍"
+matches = [[0, 4]]
+
+[[tests]]
+name = "unicode-class-gencat31"
+pattern = '\p{Separator}'
+input = "\u3000"
+matches = [[0, 3]]
+
+[[tests]]
+name = "unicode-class-gencat32"
+pattern = '\p{Space_Separator}'
+input = "\u205F"
+matches = [[0, 3]]
+
+[[tests]]
+name = "unicode-class-gencat33"
+pattern = '\p{Spacing_Mark}'
+input = "\U00016F7E"
+matches = [[0, 4]]
+
+[[tests]]
+name = "unicode-class-gencat34"
+pattern = '\p{Symbol}'
+input = "⯈"
+matches = [[0, 3]]
+
+[[tests]]
+name = "unicode-class-gencat35"
+pattern = '\p{Titlecase_Letter}'
+input = "ῼ"
+matches = [[0, 3]]
+
+[[tests]]
+name = "unicode-class-gencat36"
+pattern = '\p{Unassigned}'
+input = "\U0010FFFF"
+matches = [[0, 4]]
+
+[[tests]]
+name = "unicode-class-gencat37"
+pattern = '\p{Uppercase_Letter}'
+input = "Ꝋ"
+matches = [[0, 3]]
+
+
+[[tests]]
+name = "unicode-class-emoji1"
+pattern = '\p{Emoji}'
+input = "\u23E9"
+matches = [[0, 3]]
+
+[[tests]]
+name = "unicode-class-emoji2"
+pattern = '\p{emoji}'
+input = "\U0001F21A"
+matches = [[0, 4]]
+
+[[tests]]
+name = "unicode-class-emoji3"
+pattern = '\p{extendedpictographic}'
+input = "\U0001FA6E"
+matches = [[0, 4]]
+
+[[tests]]
+name = "unicode-class-emoji4"
+pattern = '\p{extendedpictographic}'
+input = "\U0001FFFD"
+matches = [[0, 4]]
+
+
+[[tests]]
+name = "unicode-class-gcb1"
+pattern = '\p{grapheme_cluster_break=prepend}'
+input = "\U00011D46"
+matches = [[0, 4]]
+
+[[tests]]
+name = "unicode-class-gcb2"
+pattern = '\p{gcb=regional_indicator}'
+input = "\U0001F1E6"
+matches = [[0, 4]]
+
+[[tests]]
+name = "unicode-class-gcb3"
+pattern = '\p{gcb=ri}'
+input = "\U0001F1E7"
+matches = [[0, 4]]
+
+[[tests]]
+name = "unicode-class-gcb4"
+pattern = '\p{regionalindicator}'
+input = "\U0001F1FF"
+matches = [[0, 4]]
+
+[[tests]]
+name = "unicode-class-gcb5"
+pattern = '\p{gcb=lvt}'
+input = "\uC989"
+matches = [[0, 3]]
+
+[[tests]]
+name = "unicode-class-gcb6"
+pattern = '\p{gcb=zwj}'
+input = "\u200D"
+matches = [[0, 3]]
+
+
+[[tests]]
+name = "unicode-class-word-break1"
+pattern = '\p{word_break=Hebrew_Letter}'
+input = "\uFB46"
+matches = [[0, 3]]
+
+[[tests]]
+name = "unicode-class-word-break2"
+pattern = '\p{wb=hebrewletter}'
+input = "\uFB46"
+matches = [[0, 3]]
+
+[[tests]]
+name = "unicode-class-word-break3"
+pattern = '\p{wb=ExtendNumLet}'
+input = "\uFF3F"
+matches = [[0, 3]]
+
+[[tests]]
+name = "unicode-class-word-break4"
+pattern = '\p{wb=WSegSpace}'
+input = "\u3000"
+matches = [[0, 3]]
+
+[[tests]]
+name = "unicode-class-word-break5"
+pattern = '\p{wb=numeric}'
+input = "\U0001E950"
+matches = [[0, 4]]
+
+
+[[tests]]
+name = "unicode-class-sentence-break1"
+pattern = '\p{sentence_break=Lower}'
+input = "\u0469"
+matches = [[0, 2]]
+
+[[tests]]
+name = "unicode-class-sentence-break2"
+pattern = '\p{sb=lower}'
+input = "\u0469"
+matches = [[0, 2]]
+
+[[tests]]
+name = "unicode-class-sentence-break3"
+pattern = '\p{sb=Close}'
+input = "\uFF60"
+matches = [[0, 3]]
+
+[[tests]]
+name = "unicode-class-sentence-break4"
+pattern = '\p{sb=Close}'
+input = "\U0001F677"
+matches = [[0, 4]]
+
+[[tests]]
+name = "unicode-class-sentence-break5"
+pattern = '\p{sb=SContinue}'
+input = "\uFF64"
+matches = [[0, 3]]
diff --git a/rustfmt.toml b/rustfmt.toml
new file mode 100644
index 0000000..aa37a21
--- /dev/null
+++ b/rustfmt.toml
@@ -0,0 +1,2 @@
+max_width = 79
+use_small_heuristics = "max"
diff --git a/src/classes.rs b/src/classes.rs
new file mode 100644
index 0000000..143908b
--- /dev/null
+++ b/src/classes.rs
@@ -0,0 +1,271 @@
+use core::fmt;
+
/// A mapping from all 256 byte values to equivalence classes.
///
/// DFAs use this mapping to shrink their transition tables: all bytes in
/// the same class share transitions, so each state needs only one column
/// per class. This can dramatically reduce both the size of a dense DFA
/// and the time it takes to build one.
#[derive(Clone, Copy)]
pub struct ByteClasses([u8; 256]);

impl ByteClasses {
    /// Create equivalence classes in which every byte maps to class `0`.
    pub fn empty() -> ByteClasses {
        ByteClasses([0; 256])
    }

    /// Create equivalence classes in which every byte is the sole member
    /// of its own class.
    pub fn singletons() -> ByteClasses {
        let mut classes = ByteClasses::empty();
        for b in 0..=255u8 {
            classes.set(b, b);
        }
        classes
    }

    /// Copy the given byte classes. `slice` must have length 0 or length
    /// 256; a length of 0 is treated as singletons (every byte is its own
    /// class).
    pub fn from_slice(slice: &[u8]) -> ByteClasses {
        assert!(slice.is_empty() || slice.len() == 256);

        if slice.is_empty() {
            return ByteClasses::singletons();
        }
        let mut classes = ByteClasses::empty();
        for (byte, &class) in slice.iter().enumerate() {
            classes.set(byte as u8, class);
        }
        classes
    }

    /// Set the equivalence class of the given byte.
    #[inline]
    pub fn set(&mut self, byte: u8, class: u8) {
        self.0[usize::from(byte)] = class;
    }

    /// Return the equivalence class of the given byte.
    #[inline]
    pub fn get(&self, byte: u8) -> u8 {
        self.0[usize::from(byte)]
    }

    /// Return the equivalence class of the given byte while forcefully
    /// eliding the bounds check.
    ///
    /// This is sound in itself: the table always has exactly 256 entries
    /// and a `u8` index can never exceed 255.
    #[inline]
    pub unsafe fn get_unchecked(&self, byte: u8) -> u8 {
        *self.0.get_unchecked(usize::from(byte))
    }

    /// Return the total number of equivalence classes, which is also the
    /// effective alphabet size of a DFA using these classes.
    ///
    /// (This relies on byte 255 always carrying the maximum class
    /// identifier, which holds for classes produced by `ByteClassSet`.)
    #[inline]
    pub fn alphabet_len(&self) -> usize {
        1 + self.0[255] as usize
    }

    /// Return true if and only if there are 256 equivalence classes,
    /// i.e., every byte is in a class by itself.
    #[inline]
    pub fn is_singleton(&self) -> bool {
        self.alphabet_len() == 256
    }

    /// Return an iterator yielding one arbitrary representative byte from
    /// each equivalence class (exactly `alphabet_len()` items).
    ///
    /// This is useful while determinizing an NFA whose alphabet has not
    /// yet been converted to equivalence classes: exploring one byte per
    /// class fully explores the NFA without trying all 256 byte values.
    #[cfg(feature = "std")]
    pub fn representatives(&self) -> ByteClassRepresentatives {
        ByteClassRepresentatives { classes: self, byte: 0, last_class: None }
    }

    /// Return all bytes in the given equivalence class, as a prefix of
    /// the returned array; the second tuple element is the prefix length.
    fn elements(&self, equiv: u8) -> ([u8; 256], usize) {
        let mut members = [0u8; 256];
        let mut count = 0;
        for b in 0..=255u8 {
            if self.get(b) == equiv {
                members[count] = b;
                count += 1;
            }
        }
        (members, count)
    }
}
+
+impl fmt::Debug for ByteClasses {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ if self.is_singleton() {
+ write!(f, "ByteClasses({{singletons}})")
+ } else {
+ write!(f, "ByteClasses(")?;
+ for equiv in 0..self.alphabet_len() {
+ let (members, len) = self.elements(equiv as u8);
+ write!(f, "{} => {:?}", equiv, &members[..len])?;
+ }
+ write!(f, ")")
+ }
+ }
+}
+
/// An iterator yielding one arbitrary representative byte from each
/// equivalence class.
#[cfg(feature = "std")]
#[derive(Debug)]
pub struct ByteClassRepresentatives<'a> {
    classes: &'a ByteClasses,
    byte: usize,
    last_class: Option<u8>,
}

#[cfg(feature = "std")]
impl<'a> Iterator for ByteClassRepresentatives<'a> {
    type Item = u8;

    fn next(&mut self) -> Option<u8> {
        // Classes cover contiguous byte ranges (see `ByteClassSet`), so
        // the first byte whose class differs from the previous byte's
        // class is the representative of a new class.
        while self.byte < 256 {
            let byte = self.byte as u8;
            self.byte += 1;

            let class = self.classes.get(byte);
            match self.last_class {
                Some(prev) if prev == class => continue,
                _ => {
                    self.last_class = Some(class);
                    return Some(byte);
                }
            }
        }
        None
    }
}
+
/// An *approximation* of byte equivalence classes, tracked during NFA
/// construction. Every byte within one equivalence class is guaranteed to
/// be unable to discriminate between a match and a non-match.
///
/// For example, in the regex `[ab]+`, the bytes `a` and `b` belong to the
/// same class because it never matters which of the two is seen; no mix
/// of `a`s and `b`s in the input can change whether there is a match.
///
/// Note that this does not compute the *minimal* set of equivalence
/// classes. In `[ac]+`, `a` and `c` could share a class for exactly the
/// same reason `a` and `b` do in `[ab]+`, yet this implementation puts
/// them in distinct classes. The reason is implementation complexity:
/// only contiguous ranges of bytes can share a class here. Computing the
/// minimal classes is future work, since it can meaningfully shrink the
/// resulting DFA.
///
/// The representation is 256 booleans, all initially false, one per byte
/// value. A `true` marks the *end* of an equivalence class: the byte at
/// that position, together with the bytes of all immediately preceding
/// contiguous `false` positions, form one class.
#[cfg(feature = "std")]
#[derive(Debug)]
pub struct ByteClassSet(Vec<bool>);

#[cfg(feature = "std")]
impl ByteClassSet {
    /// Create a new set in which all bytes belong to the same equivalence
    /// class.
    pub fn new() -> Self {
        ByteClassSet(vec![false; 256])
    }

    /// Indicate that the given inclusive range of bytes can discriminate
    /// a match between it and all other bytes outside of the range.
    pub fn set_range(&mut self, start: u8, end: u8) {
        debug_assert!(start <= end);
        // The byte just before the range (if any) ends the previous
        // class, and the last byte of the range ends this class.
        if let Some(prev) = (start as usize).checked_sub(1) {
            self.0[prev] = true;
        }
        self.0[end as usize] = true;
    }

    /// Convert this boolean set into a map from every byte value to its
    /// equivalence class. The mapping of byte 255 carries the largest
    /// class identifier (which is never bigger than 255).
    pub fn byte_classes(&self) -> ByteClasses {
        let mut classes = ByteClasses::empty();
        let mut class = 0u8;
        for b in 0..256usize {
            classes.set(b as u8, class);
            if b == 255 {
                break;
            }
            if self.0[b] {
                // At most 255 boundaries precede byte 255, so this can
                // never overflow a u8; check anyway to be safe.
                class = class.checked_add(1).unwrap();
            }
        }
        classes
    }
}
+
#[cfg(test)]
mod tests {
    #[cfg(feature = "std")]
    #[test]
    fn byte_classes() {
        use super::ByteClassSet;

        let mut set = ByteClassSet::new();
        set.set_range(b'a', b'z');
        let classes = set.byte_classes();
        // Everything below 'a' is class 0, the lowercase letters are
        // class 1 and everything above 'z' is class 2.
        for &b in [0u8, 1, 2, b'a' - 1].iter() {
            assert_eq!(classes.get(b), 0);
        }
        for &b in [b'a', b'm', b'z'].iter() {
            assert_eq!(classes.get(b), 1);
        }
        for &b in [b'z' + 1, 254, 255].iter() {
            assert_eq!(classes.get(b), 2);
        }

        let mut set = ByteClassSet::new();
        set.set_range(0, 2);
        set.set_range(4, 6);
        let classes = set.byte_classes();
        let expected = [0u8, 0, 0, 1, 2, 2, 2, 3];
        for (b, &class) in expected.iter().enumerate() {
            assert_eq!(classes.get(b as u8), class);
        }
        assert_eq!(classes.get(255), 3);
    }

    #[cfg(feature = "std")]
    #[test]
    fn full_byte_classes() {
        use super::ByteClassSet;

        // Marking every byte as its own range must yield 256 distinct
        // classes.
        let mut set = ByteClassSet::new();
        for b in 0..256u16 {
            set.set_range(b as u8, b as u8);
        }
        assert_eq!(set.byte_classes().alphabet_len(), 256);
    }
}
diff --git a/src/codegen.rs b/src/codegen.rs
new file mode 100644
index 0000000..b2aacbb
--- /dev/null
+++ b/src/codegen.rs
@@ -0,0 +1,104 @@
+// This module is unused. It was written as an experiment to get a ballpark
+// idea of what state machines look like when translated to Rust code, and
+// in particular, an idea of how much code it generates. The implementation
+// below isn't optimal with respect to size, but the result wasn't exactly
+// small. At some point, we should pursue building this out beyond
+// experimentation, and in particular, probably provide a command line tool
+// and/or a macro. It's a fair bit of work, so I abandoned it for the initial
+// release. ---AG
+
+use std::collections::HashMap;
+use std::io::Write;
+
+use dense::DFA;
+use state_id::StateID;
+
// Infallible write helpers: codegen only writes into in-memory buffers,
// where a write failure indicates a bug, so unwrapping is appropriate.
macro_rules! wstr {
    ($($tt:tt)*) => {
        write!($($tt)*).unwrap()
    };
}

macro_rules! wstrln {
    ($($tt:tt)*) => {
        writeln!($($tt)*).unwrap()
    };
}
+
+pub fn is_match_forward<S: StateID>(dfa: &DFA<S>) -> String {
+ let names = state_variant_names(dfa);
+
+ let mut buf = vec![];
+ wstrln!(buf, "pub fn is_match(input: &[u8]) -> bool {{");
+ if dfa.is_match_state(dfa.start()) {
+ wstrln!(buf, " return true;");
+ wstrln!(buf, "}}");
+ return String::from_utf8(buf).unwrap();
+ }
+
+ wstrln!(buf, "{}", state_enum_def(dfa, &names));
+
+ wstrln!(buf, " let mut state = {};", names[&dfa.start()]);
+ wstrln!(buf, " for &b in input.iter() {{");
+ wstrln!(buf, " state = match state {{");
+ for (id, s) in dfa.iter() {
+ if dfa.is_match_state(id) {
+ continue;
+ }
+
+ wstrln!(buf, " {} => {{", &names[&id]);
+ wstrln!(buf, " match b {{");
+ for (start, end, next_id) in s.sparse_transitions() {
+ if dfa.is_match_state(next_id) {
+ wstrln!(buf, " {:?}...{:?} => return true,", start, end);
+ } else {
+ if start == end {
+ wstrln!(buf, " {:?} => {},", start, &names[&next_id]);
+ } else {
+ wstrln!(buf, " {:?}...{:?} => {},", start, end, &names[&next_id]);
+ }
+ }
+ }
+ wstrln!(buf, " _ => S::S0,");
+ wstrln!(buf, " }}");
+ wstrln!(buf, " }}");
+ }
+ wstrln!(buf, " }};");
+ wstrln!(buf, " }}");
+
+ wstrln!(buf, " false");
+ wstrln!(buf, "}}");
+ String::from_utf8(buf).unwrap()
+}
+
+fn state_enum_def<S: StateID>(
+ dfa: &DFA<S>,
+ variant_names: &HashMap<S, String>,
+) -> String {
+ let mut buf = vec![];
+ wstrln!(buf, " #[derive(Clone, Copy)]");
+ wstr!(buf, " enum S {{");
+
+ let mut i = 0;
+ for (id, _) in dfa.iter() {
+ if dfa.is_match_state(id) {
+ continue;
+ }
+ if i % 10 == 0 {
+ wstr!(buf, "\n ");
+ }
+ let name = format!("S{}", id.to_usize());
+ wstr!(buf, " {},", name);
+ i += 1;
+ }
+ wstr!(buf, "\n");
+ wstrln!(buf, " }}");
+ String::from_utf8(buf).unwrap()
+}
+
+fn state_variant_names<S: StateID>(dfa: &DFA<S>) -> HashMap<S, String> {
+ let mut variants = HashMap::new();
+ for (id, _) in dfa.iter() {
+ if dfa.is_match_state(id) {
+ continue;
+ }
+ variants.insert(id, format!("S::S{}", id.to_usize()));
+ }
+ variants
+}
diff --git a/src/dense.rs b/src/dense.rs
new file mode 100644
index 0000000..ed4d1b6
--- /dev/null
+++ b/src/dense.rs
@@ -0,0 +1,2332 @@
+#[cfg(feature = "std")]
+use core::fmt;
+#[cfg(feature = "std")]
+use core::iter;
+use core::mem;
+use core::slice;
+
+#[cfg(feature = "std")]
+use byteorder::{BigEndian, LittleEndian};
+use byteorder::{ByteOrder, NativeEndian};
+#[cfg(feature = "std")]
+use regex_syntax::ParserBuilder;
+
+use classes::ByteClasses;
+#[cfg(feature = "std")]
+use determinize::Determinizer;
+use dfa::DFA;
+#[cfg(feature = "std")]
+use error::{Error, Result};
+#[cfg(feature = "std")]
+use minimize::Minimizer;
+#[cfg(feature = "std")]
+use nfa::{self, NFA};
+#[cfg(feature = "std")]
+use sparse::SparseDFA;
+use state_id::{dead_id, StateID};
+#[cfg(feature = "std")]
+use state_id::{
+ next_state_id, premultiply_overflow_error, write_state_id_bytes,
+};
+
/// The number of transitions stored per state in a standard dense DFA.
///
/// When the byte class optimization is enabled, the DFA instead maps the
/// 256 possible byte values into at most 256 equivalence classes. The
/// number of distinct classes then becomes the DFA's internal alphabet:
/// each state holds one transition per class while still supporting
/// matching over arbitrary byte values.
const ALPHABET_LEN: usize = 256;

/// Flag bits recorded in the header when a DFA is serialized.
pub(crate) const MASK_PREMULTIPLIED: u16 = 0b0000_0000_0000_0001;
pub(crate) const MASK_ANCHORED: u16 = 0b0000_0000_0000_0010;
+
+/// A dense table-based deterministic finite automaton (DFA).
+///
+/// A dense DFA represents the core matching primitive in this crate. That is,
+/// logically, all DFAs have a single start state, one or more match states
+/// and a transition table that maps the current state and the current byte of
+/// input to the next state. A DFA can use this information to implement fast
+/// searching. In particular, the use of a dense DFA generally makes the trade
+/// off that match speed is the most valuable characteristic, even if building
+/// the regex may take significant time *and* space. As such, the processing
+/// of every byte of input is done with a small constant number of operations
+/// that does not vary with the pattern, its size or the size of the alphabet.
+/// If your needs don't line up with this trade off, then a dense DFA may not
+/// be an adequate solution to your problem.
+///
+/// In contrast, a [sparse DFA](enum.SparseDFA.html) makes the opposite
+/// trade off: it uses less space but will execute a variable number of
+/// instructions per byte at match time, which makes it slower for matching.
+///
+/// A DFA can be built using the default configuration via the
+/// [`DenseDFA::new`](enum.DenseDFA.html#method.new) constructor. Otherwise,
+/// one can configure various aspects via the
+/// [`dense::Builder`](dense/struct.Builder.html).
+///
+/// A single DFA fundamentally supports the following operations:
+///
+/// 1. Detection of a match.
+/// 2. Location of the end of the first possible match.
+/// 3. Location of the end of the leftmost-first match.
+///
+/// A notable absence from the above list of capabilities is the location of
+/// the *start* of a match. In order to provide both the start and end of a
+/// match, *two* DFAs are required. This functionality is provided by a
+/// [`Regex`](struct.Regex.html), which can be built with its basic
+/// constructor, [`Regex::new`](struct.Regex.html#method.new), or with
+/// a [`RegexBuilder`](struct.RegexBuilder.html).
+///
+/// # State size
+///
+/// A `DenseDFA` has two type parameters, `T` and `S`. `T` corresponds to
+/// the type of the DFA's transition table while `S` corresponds to the
+/// representation used for the DFA's state identifiers as described by the
+/// [`StateID`](trait.StateID.html) trait. This type parameter is typically
+/// `usize`, but other valid choices provided by this crate include `u8`,
+/// `u16`, `u32` and `u64`. The primary reason for choosing a different state
+/// identifier representation than the default is to reduce the amount of
+/// memory used by a DFA. Note though, that if the chosen representation cannot
+/// accommodate the size of your DFA, then building the DFA will fail and
+/// return an error.
+///
+/// While the reduction in heap memory used by a DFA is one reason for choosing
+/// a smaller state identifier representation, another possible reason is for
+/// decreasing the serialization size of a DFA, as returned by
+/// [`to_bytes_little_endian`](enum.DenseDFA.html#method.to_bytes_little_endian),
+/// [`to_bytes_big_endian`](enum.DenseDFA.html#method.to_bytes_big_endian)
+/// or
+/// [`to_bytes_native_endian`](enum.DenseDFA.html#method.to_bytes_native_endian).
+///
+/// The type of the transition table is typically either `Vec<S>` or `&[S]`,
+/// depending on where the transition table is stored.
+///
+/// # Variants
+///
+/// This DFA is defined as a non-exhaustive enumeration of different types of
+/// dense DFAs. All of these dense DFAs use the same internal representation
+/// for the transition table, but they vary in how the transition table is
+/// read. A DFA's specific variant depends on the configuration options set via
+/// [`dense::Builder`](dense/struct.Builder.html). The default variant is
+/// `PremultipliedByteClass`.
+///
+/// # The `DFA` trait
+///
+/// This type implements the [`DFA`](trait.DFA.html) trait, which means it
+/// can be used for searching. For example:
+///
+/// ```
+/// use regex_automata::{DFA, DenseDFA};
+///
+/// # fn example() -> Result<(), regex_automata::Error> {
+/// let dfa = DenseDFA::new("foo[0-9]+")?;
+/// assert_eq!(Some(8), dfa.find(b"foo12345"));
+/// # Ok(()) }; example().unwrap()
+/// ```
+///
+/// The `DFA` trait also provides an assortment of other lower level methods
+/// for DFAs, such as `start_state` and `next_state`. While these are correctly
+/// implemented, it is an anti-pattern to use them in performance sensitive
+/// code on the `DenseDFA` type directly. Namely, each implementation requires
+/// a branch to determine which type of dense DFA is being used. Instead,
+/// this branch should be pushed up a layer in the code since walking the
+/// transitions of a DFA is usually a hot path. If you do need to use these
+/// lower level methods in performance critical code, then you should match on
+/// the variants of this DFA and use each variant's implementation of the `DFA`
+/// trait directly.
+#[derive(Clone, Debug)]
+pub enum DenseDFA<T: AsRef<[S]>, S: StateID> {
+ /// A standard DFA that does not use premultiplication or byte classes.
+ Standard(Standard<T, S>),
+ /// A DFA that shrinks its alphabet to a set of equivalence classes instead
+ /// of using all possible byte values. Any two bytes belong to the same
+ /// equivalence class if and only if they can be used interchangeably
+ /// anywhere in the DFA while never discriminating between a match and a
+ /// non-match.
+ ///
+ /// This type of DFA can result in significant space reduction with a very
+ /// small match time performance penalty.
+ ByteClass(ByteClass<T, S>),
+ /// A DFA that premultiplies all of its state identifiers in its
+ /// transition table. This saves an instruction per byte at match time
+ /// which improves search performance.
+ ///
+ /// The only downside of premultiplication is that it may prevent one from
+ /// using a smaller state identifier representation than you otherwise
+ /// could.
+ Premultiplied(Premultiplied<T, S>),
+ /// The default configuration of a DFA, which uses byte classes and
+ /// premultiplies its state identifiers.
+ PremultipliedByteClass(PremultipliedByteClass<T, S>),
+ /// Hints that destructuring should not be exhaustive.
+ ///
+ /// This enum may grow additional variants, so this makes sure clients
+ /// don't count on exhaustive matching. (Otherwise, adding a new variant
+ /// could break existing code.)
+ #[doc(hidden)]
+ __Nonexhaustive,
+}
+
+impl<T: AsRef<[S]>, S: StateID> DenseDFA<T, S> {
+ /// Return the internal DFA representation.
+ ///
+ /// All variants share the same internal representation.
+ fn repr(&self) -> &Repr<T, S> {
+ match *self {
+ DenseDFA::Standard(ref r) => &r.0,
+ DenseDFA::ByteClass(ref r) => &r.0,
+ DenseDFA::Premultiplied(ref r) => &r.0,
+ DenseDFA::PremultipliedByteClass(ref r) => &r.0,
+ DenseDFA::__Nonexhaustive => unreachable!(),
+ }
+ }
+}
+
#[cfg(feature = "std")]
impl DenseDFA<Vec<usize>, usize> {
    /// Parse the given regular expression with a default configuration
    /// and return the corresponding DFA.
    ///
    /// The default configuration uses `usize` state IDs, premultiplies
    /// them and shrinks the alphabet by splitting bytes into equivalence
    /// classes. The DFA is *not* minimized. For any non-default
    /// configuration, use [`dense::Builder`](dense/struct.Builder.html)
    /// instead.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{DFA, DenseDFA};
    ///
    /// # fn example() -> Result<(), regex_automata::Error> {
    /// let dfa = DenseDFA::new("foo[0-9]+bar")?;
    /// assert_eq!(Some(11), dfa.find(b"foo12345bar"));
    /// # Ok(()) }; example().unwrap()
    /// ```
    pub fn new(pattern: &str) -> Result<DenseDFA<Vec<usize>, usize>> {
        Builder::new().build(pattern)
    }
}
+
#[cfg(feature = "std")]
impl<S: StateID> DenseDFA<Vec<S>, S> {
    /// Create a new empty DFA that never matches any input.
    ///
    /// # Example
    ///
    /// An empty DFA carries no information about its state identifier
    /// representation, so callers must supply a type hint:
    ///
    /// ```
    /// use regex_automata::{DFA, DenseDFA};
    ///
    /// # fn example() -> Result<(), regex_automata::Error> {
    /// let dfa: DenseDFA<Vec<usize>, usize> = DenseDFA::empty();
    /// assert_eq!(None, dfa.find(b""));
    /// assert_eq!(None, dfa.find(b"foo"));
    /// # Ok(()) }; example().unwrap()
    /// ```
    pub fn empty() -> DenseDFA<Vec<S>, S> {
        Repr::empty().into_dense_dfa()
    }
}
+
+impl<T: AsRef<[S]>, S: StateID> DenseDFA<T, S> {
+ /// Cheaply return a borrowed version of this dense DFA. Specifically, the
+ /// DFA returned always uses `&[S]` for its transition table while keeping
+ /// the same state identifier representation.
+ pub fn as_ref<'a>(&'a self) -> DenseDFA<&'a [S], S> {
+ match *self {
+ DenseDFA::Standard(ref r) => {
+ DenseDFA::Standard(Standard(r.0.as_ref()))
+ }
+ DenseDFA::ByteClass(ref r) => {
+ DenseDFA::ByteClass(ByteClass(r.0.as_ref()))
+ }
+ DenseDFA::Premultiplied(ref r) => {
+ DenseDFA::Premultiplied(Premultiplied(r.0.as_ref()))
+ }
+ DenseDFA::PremultipliedByteClass(ref r) => {
+ let inner = PremultipliedByteClass(r.0.as_ref());
+ DenseDFA::PremultipliedByteClass(inner)
+ }
+ DenseDFA::__Nonexhaustive => unreachable!(),
+ }
+ }
+
+ /// Return an owned version of this sparse DFA. Specifically, the DFA
+ /// returned always uses `Vec<u8>` for its transition table while keeping
+ /// the same state identifier representation.
+ ///
+ /// Effectively, this returns a sparse DFA whose transition table lives
+ /// on the heap.
+ #[cfg(feature = "std")]
+ pub fn to_owned(&self) -> DenseDFA<Vec<S>, S> {
+ match *self {
+ DenseDFA::Standard(ref r) => {
+ DenseDFA::Standard(Standard(r.0.to_owned()))
+ }
+ DenseDFA::ByteClass(ref r) => {
+ DenseDFA::ByteClass(ByteClass(r.0.to_owned()))
+ }
+ DenseDFA::Premultiplied(ref r) => {
+ DenseDFA::Premultiplied(Premultiplied(r.0.to_owned()))
+ }
+ DenseDFA::PremultipliedByteClass(ref r) => {
+ let inner = PremultipliedByteClass(r.0.to_owned());
+ DenseDFA::PremultipliedByteClass(inner)
+ }
+ DenseDFA::__Nonexhaustive => unreachable!(),
+ }
+ }
+
+ /// Returns the memory usage, in bytes, of this DFA.
+ ///
+ /// The memory usage is computed based on the number of bytes used to
+ /// represent this DFA's transition table. This corresponds to heap memory
+ /// usage.
+ ///
+ /// This does **not** include the stack size used up by this DFA. To
+ /// compute that, used `std::mem::size_of::<DenseDFA>()`.
+ pub fn memory_usage(&self) -> usize {
+ self.repr().memory_usage()
+ }
+}
+
/// Routines for converting a dense DFA to other representations: sparse
/// DFAs, smaller state identifiers, or raw bytes suitable for persistent
/// storage.
#[cfg(feature = "std")]
impl<T: AsRef<[S]>, S: StateID> DenseDFA<T, S> {
    /// Convert this dense DFA to a sparse DFA.
    ///
    /// This is a convenience routine for `to_sparse_sized` that reuses
    /// this dense DFA's state identifier representation.
    ///
    /// If `S` is too small to represent every state of the sparse DFA,
    /// this returns an error. In most cases, any `S` that suffices for a
    /// dense DFA also suffices for the sparse one, but this is not
    /// guaranteed.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{DFA, DenseDFA};
    ///
    /// # fn example() -> Result<(), regex_automata::Error> {
    /// let dense = DenseDFA::new("foo[0-9]+")?;
    /// let sparse = dense.to_sparse()?;
    /// assert_eq!(Some(8), sparse.find(b"foo12345"));
    /// # Ok(()) }; example().unwrap()
    /// ```
    pub fn to_sparse(&self) -> Result<SparseDFA<Vec<u8>, S>> {
        self.to_sparse_sized()
    }

    /// Convert this dense DFA to a sparse DFA using a caller-chosen state
    /// identifier representation, supplied as a type hint.
    ///
    /// If the chosen representation is too small to represent every state
    /// of the sparse DFA, this returns an error.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{DFA, DenseDFA};
    ///
    /// # fn example() -> Result<(), regex_automata::Error> {
    /// let dense = DenseDFA::new("foo[0-9]+")?;
    /// let sparse = dense.to_sparse_sized::<u8>()?;
    /// assert_eq!(Some(8), sparse.find(b"foo12345"));
    /// # Ok(()) }; example().unwrap()
    /// ```
    pub fn to_sparse_sized<A: StateID>(
        &self,
    ) -> Result<SparseDFA<Vec<u8>, A>> {
        self.repr().to_sparse_sized()
    }

    /// Convenience routine for `to_sized::<u8>()`: build an equivalent
    /// DFA using `u8` state identifiers, or return an error if `u8` is
    /// insufficient.
    pub fn to_u8(&self) -> Result<DenseDFA<Vec<u8>, u8>> {
        self.to_sized()
    }

    /// Convenience routine for `to_sized::<u16>()`: build an equivalent
    /// DFA using `u16` state identifiers, or return an error if `u16` is
    /// insufficient.
    pub fn to_u16(&self) -> Result<DenseDFA<Vec<u16>, u16>> {
        self.to_sized()
    }

    /// Convenience routine for `to_sized::<u32>()`: build an equivalent
    /// DFA using `u32` state identifiers, or return an error if `u32` is
    /// insufficient.
    #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
    pub fn to_u32(&self) -> Result<DenseDFA<Vec<u32>, u32>> {
        self.to_sized()
    }

    /// Convenience routine for `to_sized::<u64>()`: build an equivalent
    /// DFA using `u64` state identifiers, or return an error if `u64` is
    /// insufficient.
    #[cfg(target_pointer_width = "64")]
    pub fn to_u64(&self) -> Result<DenseDFA<Vec<u64>, u64>> {
        self.to_sized()
    }

    /// Create a new DFA with match semantics equivalent to this one, but
    /// using `A` for its state identifiers. Returns an error if `A` is
    /// insufficient to represent every state identifier in this DFA.
    ///
    /// An alternative is
    /// [`dense::Builder::build_with_size`](dense/struct.Builder.html#method.build_with_size),
    /// which is generally preferred since it applies the chosen
    /// representation throughout determinization (and minimization, if
    /// done), using less memory during construction. This routine remains
    /// necessary when, say, a minimized DFA fits a smaller representation
    /// that the initial determinized DFA would not.
    pub fn to_sized<A: StateID>(&self) -> Result<DenseDFA<Vec<A>, A>> {
        self.repr().to_sized().map(|r| r.into_dense_dfa())
    }

    /// Serialize this DFA as raw little-endian bytes, aligned to an 8
    /// byte boundary.
    ///
    /// Returns an error if this DFA's state identifier representation is
    /// not exactly 1, 2, 4 or 8 bytes; every `StateID` implementation
    /// provided by this crate satisfies that requirement.
    pub fn to_bytes_little_endian(&self) -> Result<Vec<u8>> {
        self.repr().to_bytes::<LittleEndian>()
    }

    /// Serialize this DFA as raw big-endian bytes, aligned to an 8 byte
    /// boundary.
    ///
    /// Returns an error if this DFA's state identifier representation is
    /// not exactly 1, 2, 4 or 8 bytes; every `StateID` implementation
    /// provided by this crate satisfies that requirement.
    pub fn to_bytes_big_endian(&self) -> Result<Vec<u8>> {
        self.repr().to_bytes::<BigEndian>()
    }

    /// Serialize this DFA as raw bytes in native endianness, aligned to
    /// an 8 byte boundary. Generally, prefer an explicit endianness via
    /// `to_bytes_little_endian` or `to_bytes_big_endian`; the native
    /// variant is chiefly useful in tests that serialize and deserialize
    /// on the same platform.
    ///
    /// Returns an error if this DFA's state identifier representation is
    /// not exactly 1, 2, 4 or 8 bytes; every `StateID` implementation
    /// provided by this crate satisfies that requirement.
    pub fn to_bytes_native_endian(&self) -> Result<Vec<u8>> {
        self.repr().to_bytes::<NativeEndian>()
    }
}
+
impl<'a, S: StateID> DenseDFA<&'a [S], S> {
    /// Deserialize a DFA with a specific state identifier representation.
    ///
    /// Deserializing a DFA using this routine will never allocate heap memory.
    /// This is also guaranteed to be a constant time operation that does not
    /// vary with the size of the DFA.
    ///
    /// The bytes given should be generated by the serialization of a DFA with
    /// either the
    /// [`to_bytes_little_endian`](enum.DenseDFA.html#method.to_bytes_little_endian)
    /// method or the
    /// [`to_bytes_big_endian`](enum.DenseDFA.html#method.to_bytes_big_endian)
    /// method, depending on the endianness of the machine you are
    /// deserializing this DFA from.
    ///
    /// If the state identifier representation is `usize`, then deserialization
    /// is dependent on the pointer size. For this reason, it is best to
    /// serialize DFAs using a fixed size representation for your state
    /// identifiers, such as `u8`, `u16`, `u32` or `u64`.
    ///
    /// # Panics
    ///
    /// The bytes given should be *trusted*. In particular, if the bytes
    /// are not a valid serialization of a DFA, or if the given bytes are
    /// not aligned to an 8 byte boundary, or if the endianness of the
    /// serialized bytes is different than the endianness of the machine that
    /// is deserializing the DFA, then this routine will panic. Moreover, it is
    /// possible for this deserialization routine to succeed even if the given
    /// bytes do not represent a valid serialized dense DFA.
    ///
    /// # Safety
    ///
    /// This routine is unsafe because it permits callers to provide an
    /// arbitrary transition table with possibly incorrect transitions. While
    /// the various serialization routines will never return an incorrect
    /// transition table, there is no guarantee that the bytes provided here
    /// are correct. While deserialization does many checks (as documented
    /// above in the panic conditions), this routine does not check that the
    /// transition table is correct. Given an incorrect transition table, it is
    /// possible for the search routines to access out-of-bounds memory because
    /// of explicit bounds check elision.
    ///
    /// # Example
    ///
    /// This example shows how to serialize a DFA to raw bytes, deserialize it
    /// and then use it for searching. Note that we first convert the DFA to
    /// using `u16` for its state identifier representation before serializing
    /// it. While this isn't strictly necessary, it's good practice in order to
    /// decrease the size of the DFA and to avoid platform specific pitfalls
    /// such as differing pointer sizes.
    ///
    /// ```
    /// use regex_automata::{DFA, DenseDFA};
    ///
    /// # fn example() -> Result<(), regex_automata::Error> {
    /// let initial = DenseDFA::new("foo[0-9]+")?;
    /// let bytes = initial.to_u16()?.to_bytes_native_endian()?;
    /// let dfa: DenseDFA<&[u16], u16> = unsafe {
    ///     DenseDFA::from_bytes(&bytes)
    /// };
    ///
    /// assert_eq!(Some(8), dfa.find(b"foo12345"));
    /// # Ok(()) }; example().unwrap()
    /// ```
    pub unsafe fn from_bytes(buf: &'a [u8]) -> DenseDFA<&'a [S], S> {
        Repr::from_bytes(buf).into_dense_dfa()
    }
}
+
#[cfg(feature = "std")]
impl<S: StateID> DenseDFA<Vec<S>, S> {
    /// Minimize this DFA in place.
    ///
    /// This is not part of the public API. It is only exposed to allow for
    /// more granular external benchmarking.
    #[doc(hidden)]
    pub fn minimize(&mut self) {
        self.repr_mut().minimize();
    }

    /// Return a mutable reference to the internal DFA representation.
    fn repr_mut(&mut self) -> &mut Repr<Vec<S>, S> {
        // Every variant is a newtype wrapper around the same `Repr`, so we
        // just peel the wrapper off regardless of the DFA's configuration.
        match *self {
            DenseDFA::Standard(ref mut repr) => &mut repr.0,
            DenseDFA::ByteClass(ref mut repr) => &mut repr.0,
            DenseDFA::Premultiplied(ref mut repr) => &mut repr.0,
            DenseDFA::PremultipliedByteClass(ref mut repr) => &mut repr.0,
            DenseDFA::__Nonexhaustive => unreachable!(),
        }
    }
}
+
+impl<T: AsRef<[S]>, S: StateID> DFA for DenseDFA<T, S> {
+ type ID = S;
+
+ #[inline]
+ fn start_state(&self) -> S {
+ self.repr().start_state()
+ }
+
+ #[inline]
+ fn is_match_state(&self, id: S) -> bool {
+ self.repr().is_match_state(id)
+ }
+
+ #[inline]
+ fn is_dead_state(&self, id: S) -> bool {
+ self.repr().is_dead_state(id)
+ }
+
+ #[inline]
+ fn is_match_or_dead_state(&self, id: S) -> bool {
+ self.repr().is_match_or_dead_state(id)
+ }
+
+ #[inline]
+ fn is_anchored(&self) -> bool {
+ self.repr().is_anchored()
+ }
+
+ #[inline]
+ fn next_state(&self, current: S, input: u8) -> S {
+ match *self {
+ DenseDFA::Standard(ref r) => r.next_state(current, input),
+ DenseDFA::ByteClass(ref r) => r.next_state(current, input),
+ DenseDFA::Premultiplied(ref r) => r.next_state(current, input),
+ DenseDFA::PremultipliedByteClass(ref r) => {
+ r.next_state(current, input)
+ }
+ DenseDFA::__Nonexhaustive => unreachable!(),
+ }
+ }
+
+ #[inline]
+ unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
+ match *self {
+ DenseDFA::Standard(ref r) => {
+ r.next_state_unchecked(current, input)
+ }
+ DenseDFA::ByteClass(ref r) => {
+ r.next_state_unchecked(current, input)
+ }
+ DenseDFA::Premultiplied(ref r) => {
+ r.next_state_unchecked(current, input)
+ }
+ DenseDFA::PremultipliedByteClass(ref r) => {
+ r.next_state_unchecked(current, input)
+ }
+ DenseDFA::__Nonexhaustive => unreachable!(),
+ }
+ }
+
+ // We specialize the following methods because it lets us lift the
+ // case analysis between the different types of dense DFAs. Instead of
+ // doing the case analysis for every transition, we do it once before
+ // searching.
+
+ #[inline]
+ fn is_match_at(&self, bytes: &[u8], start: usize) -> bool {
+ match *self {
+ DenseDFA::Standard(ref r) => r.is_match_at(bytes, start),
+ DenseDFA::ByteClass(ref r) => r.is_match_at(bytes, start),
+ DenseDFA::Premultiplied(ref r) => r.is_match_at(bytes, start),
+ DenseDFA::PremultipliedByteClass(ref r) => {
+ r.is_match_at(bytes, start)
+ }
+ DenseDFA::__Nonexhaustive => unreachable!(),
+ }
+ }
+
+ #[inline]
+ fn shortest_match_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
+ match *self {
+ DenseDFA::Standard(ref r) => r.shortest_match_at(bytes, start),
+ DenseDFA::ByteClass(ref r) => r.shortest_match_at(bytes, start),
+ DenseDFA::Premultiplied(ref r) => {
+ r.shortest_match_at(bytes, start)
+ }
+ DenseDFA::PremultipliedByteClass(ref r) => {
+ r.shortest_match_at(bytes, start)
+ }
+ DenseDFA::__Nonexhaustive => unreachable!(),
+ }
+ }
+
+ #[inline]
+ fn find_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
+ match *self {
+ DenseDFA::Standard(ref r) => r.find_at(bytes, start),
+ DenseDFA::ByteClass(ref r) => r.find_at(bytes, start),
+ DenseDFA::Premultiplied(ref r) => r.find_at(bytes, start),
+ DenseDFA::PremultipliedByteClass(ref r) => r.find_at(bytes, start),
+ DenseDFA::__Nonexhaustive => unreachable!(),
+ }
+ }
+
+ #[inline]
+ fn rfind_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
+ match *self {
+ DenseDFA::Standard(ref r) => r.rfind_at(bytes, start),
+ DenseDFA::ByteClass(ref r) => r.rfind_at(bytes, start),
+ DenseDFA::Premultiplied(ref r) => r.rfind_at(bytes, start),
+ DenseDFA::PremultipliedByteClass(ref r) => {
+ r.rfind_at(bytes, start)
+ }
+ DenseDFA::__Nonexhaustive => unreachable!(),
+ }
+ }
+}
+
/// A standard dense DFA that does not use premultiplication or byte classes.
///
/// Generally, it isn't necessary to use this type directly, since a `DenseDFA`
/// can be used for searching directly. One possible reason why one might want
/// to use this type directly is if you are implementing your own search
/// routines by walking a DFA's transitions directly. In that case, you'll want
/// to use this type (or any of the other DFA variant types) directly, since
/// they implement `next_state` more efficiently.
#[derive(Clone, Debug)]
pub struct Standard<T: AsRef<[S]>, S: StateID>(Repr<T, S>);
+
+impl<T: AsRef<[S]>, S: StateID> DFA for Standard<T, S> {
+ type ID = S;
+
+ #[inline]
+ fn start_state(&self) -> S {
+ self.0.start_state()
+ }
+
+ #[inline]
+ fn is_match_state(&self, id: S) -> bool {
+ self.0.is_match_state(id)
+ }
+
+ #[inline]
+ fn is_dead_state(&self, id: S) -> bool {
+ self.0.is_dead_state(id)
+ }
+
+ #[inline]
+ fn is_match_or_dead_state(&self, id: S) -> bool {
+ self.0.is_match_or_dead_state(id)
+ }
+
+ #[inline]
+ fn is_anchored(&self) -> bool {
+ self.0.is_anchored()
+ }
+
+ #[inline]
+ fn next_state(&self, current: S, input: u8) -> S {
+ let o = current.to_usize() * ALPHABET_LEN + input as usize;
+ self.0.trans()[o]
+ }
+
+ #[inline]
+ unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
+ let o = current.to_usize() * ALPHABET_LEN + input as usize;
+ *self.0.trans().get_unchecked(o)
+ }
+}
+
/// A dense DFA that shrinks its alphabet.
///
/// Alphabet shrinking is achieved by using a set of equivalence classes
/// instead of using all possible byte values. Any two bytes belong to the same
/// equivalence class if and only if they can be used interchangeably anywhere
/// in the DFA while never discriminating between a match and a non-match.
///
/// This type of DFA can result in significant space reduction with a very
/// small match time performance penalty.
///
/// Generally, it isn't necessary to use this type directly, since a `DenseDFA`
/// can be used for searching directly. One possible reason why one might want
/// to use this type directly is if you are implementing your own search
/// routines by walking a DFA's transitions directly. In that case, you'll want
/// to use this type (or any of the other DFA variant types) directly, since
/// they implement `next_state` more efficiently.
#[derive(Clone, Debug)]
pub struct ByteClass<T: AsRef<[S]>, S: StateID>(Repr<T, S>);
+
+impl<T: AsRef<[S]>, S: StateID> DFA for ByteClass<T, S> {
+ type ID = S;
+
+ #[inline]
+ fn start_state(&self) -> S {
+ self.0.start_state()
+ }
+
+ #[inline]
+ fn is_match_state(&self, id: S) -> bool {
+ self.0.is_match_state(id)
+ }
+
+ #[inline]
+ fn is_dead_state(&self, id: S) -> bool {
+ self.0.is_dead_state(id)
+ }
+
+ #[inline]
+ fn is_match_or_dead_state(&self, id: S) -> bool {
+ self.0.is_match_or_dead_state(id)
+ }
+
+ #[inline]
+ fn is_anchored(&self) -> bool {
+ self.0.is_anchored()
+ }
+
+ #[inline]
+ fn next_state(&self, current: S, input: u8) -> S {
+ let input = self.0.byte_classes().get(input);
+ let o = current.to_usize() * self.0.alphabet_len() + input as usize;
+ self.0.trans()[o]
+ }
+
+ #[inline]
+ unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
+ let input = self.0.byte_classes().get_unchecked(input);
+ let o = current.to_usize() * self.0.alphabet_len() + input as usize;
+ *self.0.trans().get_unchecked(o)
+ }
+}
+
/// A dense DFA that premultiplies all of its state identifiers in its
/// transition table.
///
/// This saves an instruction per byte at match time which improves search
/// performance.
///
/// The only downside of premultiplication is that it may prevent one from
/// using a smaller state identifier representation than you otherwise could.
///
/// Generally, it isn't necessary to use this type directly, since a `DenseDFA`
/// can be used for searching directly. One possible reason why one might want
/// to use this type directly is if you are implementing your own search
/// routines by walking a DFA's transitions directly. In that case, you'll want
/// to use this type (or any of the other DFA variant types) directly, since
/// they implement `next_state` more efficiently.
#[derive(Clone, Debug)]
pub struct Premultiplied<T: AsRef<[S]>, S: StateID>(Repr<T, S>);
+
+impl<T: AsRef<[S]>, S: StateID> DFA for Premultiplied<T, S> {
+ type ID = S;
+
+ #[inline]
+ fn start_state(&self) -> S {
+ self.0.start_state()
+ }
+
+ #[inline]
+ fn is_match_state(&self, id: S) -> bool {
+ self.0.is_match_state(id)
+ }
+
+ #[inline]
+ fn is_dead_state(&self, id: S) -> bool {
+ self.0.is_dead_state(id)
+ }
+
+ #[inline]
+ fn is_match_or_dead_state(&self, id: S) -> bool {
+ self.0.is_match_or_dead_state(id)
+ }
+
+ #[inline]
+ fn is_anchored(&self) -> bool {
+ self.0.is_anchored()
+ }
+
+ #[inline]
+ fn next_state(&self, current: S, input: u8) -> S {
+ let o = current.to_usize() + input as usize;
+ self.0.trans()[o]
+ }
+
+ #[inline]
+ unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
+ let o = current.to_usize() + input as usize;
+ *self.0.trans().get_unchecked(o)
+ }
+}
+
/// The default configuration of a dense DFA, which uses byte classes and
/// premultiplies its state identifiers.
///
/// Generally, it isn't necessary to use this type directly, since a `DenseDFA`
/// can be used for searching directly. One possible reason why one might want
/// to use this type directly is if you are implementing your own search
/// routines by walking a DFA's transitions directly. In that case, you'll want
/// to use this type (or any of the other DFA variant types) directly, since
/// they implement `next_state` more efficiently.
#[derive(Clone, Debug)]
pub struct PremultipliedByteClass<T: AsRef<[S]>, S: StateID>(Repr<T, S>);
+
+impl<T: AsRef<[S]>, S: StateID> DFA for PremultipliedByteClass<T, S> {
+ type ID = S;
+
+ #[inline]
+ fn start_state(&self) -> S {
+ self.0.start_state()
+ }
+
+ #[inline]
+ fn is_match_state(&self, id: S) -> bool {
+ self.0.is_match_state(id)
+ }
+
+ #[inline]
+ fn is_dead_state(&self, id: S) -> bool {
+ self.0.is_dead_state(id)
+ }
+
+ #[inline]
+ fn is_match_or_dead_state(&self, id: S) -> bool {
+ self.0.is_match_or_dead_state(id)
+ }
+
+ #[inline]
+ fn is_anchored(&self) -> bool {
+ self.0.is_anchored()
+ }
+
+ #[inline]
+ fn next_state(&self, current: S, input: u8) -> S {
+ let input = self.0.byte_classes().get(input);
+ let o = current.to_usize() + input as usize;
+ self.0.trans()[o]
+ }
+
+ #[inline]
+ unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
+ let input = self.0.byte_classes().get_unchecked(input);
+ let o = current.to_usize() + input as usize;
+ *self.0.trans().get_unchecked(o)
+ }
+}
+
/// The internal representation of a dense DFA.
///
/// This representation is shared by all DFA variants.
#[derive(Clone)]
#[cfg_attr(not(feature = "std"), derive(Debug))]
pub(crate) struct Repr<T, S> {
    /// Whether the state identifiers in the transition table have been
    /// premultiplied or not.
    ///
    /// Premultiplied identifiers means that instead of your matching loop
    /// looking something like this:
    ///
    ///   state = dfa.start
    ///   for byte in haystack:
    ///       next = dfa.transitions[state * len(alphabet) + byte]
    ///       if dfa.is_match(next):
    ///           return true
    ///   return false
    ///
    /// it can instead look like this:
    ///
    ///   state = dfa.start
    ///   for byte in haystack:
    ///       next = dfa.transitions[state + byte]
    ///       if dfa.is_match(next):
    ///           return true
    ///   return false
    ///
    /// In other words, we save a multiplication instruction in the critical
    /// path. This turns out to be a decent performance win. The cost of using
    /// premultiplied state ids is that they can require a bigger state id
    /// representation.
    premultiplied: bool,
    /// Whether this DFA can only match at the beginning of input or not.
    ///
    /// When true, a match should only be reported if it begins at the 0th
    /// index of the haystack.
    anchored: bool,
    /// The initial start state ID.
    start: S,
    /// The total number of states in this DFA. Note that a DFA always has at
    /// least one state---the dead state---even for the empty DFA. In
    /// particular, the dead state always has ID 0 and is correspondingly
    /// always the first state. The dead state is never a match state.
    state_count: usize,
    /// States in a DFA have a *partial* ordering such that a match state
    /// always precedes any non-match state (except for the special dead
    /// state).
    ///
    /// `max_match` corresponds to the last state that is a match state. This
    /// encoding has two critical benefits. Firstly, we are not required to
    /// store any additional per-state information about whether it is a match
    /// state or not. Secondly, when searching with the DFA, we can do a single
    /// comparison with `max_match` for each byte instead of two comparisons
    /// for each byte (one testing whether it is a match and the other testing
    /// whether we've reached a dead state). Namely, to determine the status
    /// of the next state, we can do this:
    ///
    ///   next_state = transition[cur_state * alphabet_len + cur_byte]
    ///   if next_state <= max_match:
    ///       // next_state is either dead (no-match) or a match
    ///       return next_state != dead
    max_match: S,
    /// A set of equivalence classes, where a single equivalence class
    /// represents a set of bytes that never discriminate between a match
    /// and a non-match in the DFA. Each equivalence class corresponds to
    /// a single letter in this DFA's alphabet, where the maximum number of
    /// letters is 256 (each possible value of a byte). Consequently, the
    /// number of equivalence classes corresponds to the number of transitions
    /// for each DFA state.
    ///
    /// The only time the number of equivalence classes is fewer than 256 is
    /// if the DFA's kind uses byte classes. If the DFA doesn't use byte
    /// classes, then this vector is empty.
    byte_classes: ByteClasses,
    /// A contiguous region of memory representing the transition table in
    /// row-major order. The representation is dense. That is, every state has
    /// precisely the same number of transitions. The maximum number of
    /// transitions is 256. If a DFA has been instructed to use byte classes,
    /// then the number of transitions can be much less.
    ///
    /// In practice, T is either Vec<S> or &[S].
    trans: T,
}
+
#[cfg(feature = "std")]
impl<S: StateID> Repr<Vec<S>, S> {
    /// Create a new empty DFA with singleton byte classes (every byte is its
    /// own equivalence class).
    pub fn empty() -> Repr<Vec<S>, S> {
        Repr::empty_with_byte_classes(ByteClasses::singletons())
    }

    /// Create a new empty DFA with the given set of byte equivalence classes.
    /// An empty DFA never matches any input.
    pub fn empty_with_byte_classes(
        byte_classes: ByteClasses,
    ) -> Repr<Vec<S>, S> {
        let mut repr = Repr {
            premultiplied: false,
            anchored: true,
            start: dead_id(),
            state_count: 0,
            max_match: S::from_usize(0),
            byte_classes,
            trans: vec![],
        };
        // Every state ID representation must be able to fit at least one
        // state; the dead state is added up front and receives ID 0.
        repr.add_empty_state().unwrap();
        repr
    }

    /// Sets whether this DFA is anchored or not.
    pub fn anchored(mut self, yes: bool) -> Repr<Vec<S>, S> {
        self.anchored = yes;
        self
    }
}
+
impl<T: AsRef<[S]>, S: StateID> Repr<T, S> {
    /// Convert this internal DFA representation to a DenseDFA based on its
    /// transition table access pattern.
    pub fn into_dense_dfa(self) -> DenseDFA<T, S> {
        // The variant chosen here determines which specialized `next_state`
        // implementation search routines will use.
        match (self.premultiplied, self.byte_classes().is_singleton()) {
            // no premultiplication, no byte classes
            (false, true) => DenseDFA::Standard(Standard(self)),
            // no premultiplication, yes byte classes
            (false, false) => DenseDFA::ByteClass(ByteClass(self)),
            // yes premultiplication, no byte classes
            (true, true) => DenseDFA::Premultiplied(Premultiplied(self)),
            // yes premultiplication, yes byte classes
            (true, false) => {
                DenseDFA::PremultipliedByteClass(PremultipliedByteClass(self))
            }
        }
    }

    /// Return a borrowed view of this representation, with the transition
    /// table borrowed as a slice.
    fn as_ref<'a>(&'a self) -> Repr<&'a [S], S> {
        Repr {
            premultiplied: self.premultiplied,
            anchored: self.anchored,
            start: self.start,
            state_count: self.state_count,
            max_match: self.max_match,
            byte_classes: self.byte_classes().clone(),
            trans: self.trans(),
        }
    }

    /// Return an owned copy of this representation, with the transition
    /// table copied into a fresh `Vec`.
    #[cfg(feature = "std")]
    fn to_owned(&self) -> Repr<Vec<S>, S> {
        Repr {
            premultiplied: self.premultiplied,
            anchored: self.anchored,
            start: self.start,
            state_count: self.state_count,
            max_match: self.max_match,
            byte_classes: self.byte_classes().clone(),
            trans: self.trans().to_vec(),
        }
    }

    /// Return the starting state of this DFA.
    ///
    /// All searches using this DFA must begin at this state. There is exactly
    /// one starting state for every DFA. A starting state may be a dead state
    /// or a matching state or neither.
    pub fn start_state(&self) -> S {
        self.start
    }

    /// Returns true if and only if the given identifier corresponds to a match
    /// state.
    pub fn is_match_state(&self, id: S) -> bool {
        id <= self.max_match && id != dead_id()
    }

    /// Returns true if and only if the given identifier corresponds to a dead
    /// state.
    pub fn is_dead_state(&self, id: S) -> bool {
        id == dead_id()
    }

    /// Returns true if and only if the given identifier could correspond to
    /// either a match state or a dead state. If this returns false, then the
    /// given identifier does not correspond to either a match state or a dead
    /// state.
    pub fn is_match_or_dead_state(&self, id: S) -> bool {
        id <= self.max_match_state()
    }

    /// Returns the maximum identifier for which a match state can exist.
    ///
    /// More specifically, the return identifier always corresponds to either
    /// a match state or a dead state. Namely, either
    /// `is_match_state(returned)` or `is_dead_state(returned)` is guaranteed
    /// to be true.
    pub fn max_match_state(&self) -> S {
        self.max_match
    }

    /// Returns true if and only if this DFA is anchored.
    pub fn is_anchored(&self) -> bool {
        self.anchored
    }

    /// Return the byte classes used by this DFA.
    pub fn byte_classes(&self) -> &ByteClasses {
        &self.byte_classes
    }

    /// Returns an iterator over all states in this DFA.
    ///
    /// This iterator yields a tuple for each state. The first element of the
    /// tuple corresponds to a state's identifier, and the second element
    /// corresponds to the state itself (comprised of its transitions).
    ///
    /// If this DFA is premultiplied, then the state identifiers are in
    /// turn premultiplied as well, making them usable without additional
    /// modification.
    #[cfg(feature = "std")]
    pub fn states(&self) -> StateIter<T, S> {
        // Each chunk of `alphabet_len` transitions is one state's row.
        let it = self.trans().chunks(self.alphabet_len());
        StateIter { dfa: self, it: it.enumerate() }
    }

    /// Return the total number of states in this DFA. Every DFA has at least
    /// 1 state, even the empty DFA.
    #[cfg(feature = "std")]
    pub fn state_count(&self) -> usize {
        self.state_count
    }

    /// Return the number of elements in this DFA's alphabet.
    ///
    /// If this DFA doesn't use byte classes, then this is always equivalent
    /// to 256. Otherwise, it is guaranteed to be some value less than or equal
    /// to 256.
    pub fn alphabet_len(&self) -> usize {
        self.byte_classes().alphabet_len()
    }

    /// Returns the memory usage, in bytes, of this DFA.
    pub fn memory_usage(&self) -> usize {
        self.trans().len() * mem::size_of::<S>()
    }

    /// Convert the given state identifier to the state's index. The state's
    /// index corresponds to the position in which it appears in the transition
    /// table. When a DFA is NOT premultiplied, then a state's identifier is
    /// also its index. When a DFA is premultiplied, then a state's identifier
    /// is equal to `index * alphabet_len`. This routine reverses that.
    #[cfg(feature = "std")]
    pub fn state_id_to_index(&self, id: S) -> usize {
        if self.premultiplied {
            id.to_usize() / self.alphabet_len()
        } else {
            id.to_usize()
        }
    }

    /// Return this DFA's transition table as a slice.
    fn trans(&self) -> &[S] {
        self.trans.as_ref()
    }

    /// Create a sparse DFA from the internal representation of a dense DFA.
    #[cfg(feature = "std")]
    pub fn to_sparse_sized<A: StateID>(
        &self,
    ) -> Result<SparseDFA<Vec<u8>, A>> {
        SparseDFA::from_dense_sized(self)
    }

    /// Create a new DFA whose match semantics are equivalent to this DFA, but
    /// attempt to use `A` for the representation of state identifiers. If `A`
    /// is insufficient to represent all state identifiers in this DFA, then
    /// this returns an error.
    #[cfg(feature = "std")]
    pub fn to_sized<A: StateID>(&self) -> Result<Repr<Vec<A>, A>> {
        // Check that this DFA can fit into A's representation. The largest
        // identifier in use is the last state's, scaled by the alphabet
        // length when premultiplied.
        let mut last_state_id = self.state_count - 1;
        if self.premultiplied {
            last_state_id *= self.alphabet_len();
        }
        if last_state_id > A::max_id() {
            return Err(Error::state_id_overflow(A::max_id()));
        }

        // We're off to the races. The new DFA is the same as the old one,
        // but its transition table is truncated.
        let mut new = Repr {
            premultiplied: self.premultiplied,
            anchored: self.anchored,
            start: A::from_usize(self.start.to_usize()),
            state_count: self.state_count,
            max_match: A::from_usize(self.max_match.to_usize()),
            byte_classes: self.byte_classes().clone(),
            trans: vec![dead_id::<A>(); self.trans().len()],
        };
        for (i, id) in new.trans.iter_mut().enumerate() {
            *id = A::from_usize(self.trans()[i].to_usize());
        }
        Ok(new)
    }

    /// Serialize a DFA to raw bytes, aligned to an 8 byte boundary.
    ///
    /// If the state identifier representation of this DFA has a size different
    /// than 1, 2, 4 or 8 bytes, then this returns an error. All
    /// implementations of `StateID` provided by this crate satisfy this
    /// requirement.
    #[cfg(feature = "std")]
    pub(crate) fn to_bytes<A: ByteOrder>(&self) -> Result<Vec<u8>> {
        let label = b"rust-regex-automata-dfa\x00";
        assert_eq!(24, label.len());

        let trans_size = mem::size_of::<S>() * self.trans().len();
        let size =
            // For human readable label.
            label.len()
            // endianness check, must be equal to 0xFEFF for native endian
            + 2
            // For version number.
            + 2
            // Size of state ID representation, in bytes.
            // Must be 1, 2, 4 or 8.
            + 2
            // For DFA misc options.
            + 2
            // For start state.
            + 8
            // For state count.
            + 8
            // For max match state.
            + 8
            // For byte class map.
            + 256
            // For transition table.
            + trans_size;
        // sanity check, this can be updated if need be
        assert_eq!(312 + trans_size, size);
        // This must always pass. It checks that the transition table is at
        // a properly aligned address.
        assert_eq!(0, (size - trans_size) % 8);

        let mut buf = vec![0; size];
        let mut i = 0;

        // write label
        for &b in label {
            buf[i] = b;
            i += 1;
        }
        // endianness check
        A::write_u16(&mut buf[i..], 0xFEFF);
        i += 2;
        // version number
        A::write_u16(&mut buf[i..], 1);
        i += 2;
        // size of state ID
        let state_size = mem::size_of::<S>();
        if ![1, 2, 4, 8].contains(&state_size) {
            return Err(Error::serialize(&format!(
                "state size of {} not supported, must be 1, 2, 4 or 8",
                state_size
            )));
        }
        A::write_u16(&mut buf[i..], state_size as u16);
        i += 2;
        // DFA misc options
        let mut options = 0u16;
        if self.premultiplied {
            options |= MASK_PREMULTIPLIED;
        }
        if self.anchored {
            options |= MASK_ANCHORED;
        }
        A::write_u16(&mut buf[i..], options);
        i += 2;
        // start state
        A::write_u64(&mut buf[i..], self.start.to_usize() as u64);
        i += 8;
        // state count
        A::write_u64(&mut buf[i..], self.state_count as u64);
        i += 8;
        // max match state
        A::write_u64(&mut buf[i..], self.max_match.to_usize() as u64);
        i += 8;
        // byte class map
        for b in (0..256).map(|b| b as u8) {
            buf[i] = self.byte_classes().get(b);
            i += 1;
        }
        // transition table
        for &id in self.trans() {
            write_state_id_bytes::<A, _>(&mut buf[i..], id);
            i += state_size;
        }
        assert_eq!(size, i, "expected to consume entire buffer");

        Ok(buf)
    }
}
+
+impl<'a, S: StateID> Repr<&'a [S], S> {
+ /// The implementation for deserializing a DFA from raw bytes.
+ unsafe fn from_bytes(mut buf: &'a [u8]) -> Repr<&'a [S], S> {
+ assert_eq!(
+ 0,
+ buf.as_ptr() as usize % mem::align_of::<S>(),
+ "DenseDFA starting at address {} is not aligned to {} bytes",
+ buf.as_ptr() as usize,
+ mem::align_of::<S>()
+ );
+
+ // skip over label
+ match buf.iter().position(|&b| b == b'\x00') {
+ None => panic!("could not find label"),
+ Some(i) => buf = &buf[i + 1..],
+ }
+
+ // check that current endianness is same as endianness of DFA
+ let endian_check = NativeEndian::read_u16(buf);
+ buf = &buf[2..];
+ if endian_check != 0xFEFF {
+ panic!(
+ "endianness mismatch, expected 0xFEFF but got 0x{:X}. \
+ are you trying to load a DenseDFA serialized with a \
+ different endianness?",
+ endian_check,
+ );
+ }
+
+ // check that the version number is supported
+ let version = NativeEndian::read_u16(buf);
+ buf = &buf[2..];
+ if version != 1 {
+ panic!(
+ "expected version 1, but found unsupported version {}",
+ version,
+ );
+ }
+
+ // read size of state
+ let state_size = NativeEndian::read_u16(buf) as usize;
+ if state_size != mem::size_of::<S>() {
+ panic!(
+ "state size of DenseDFA ({}) does not match \
+ requested state size ({})",
+ state_size,
+ mem::size_of::<S>(),
+ );
+ }
+ buf = &buf[2..];
+
+ // read miscellaneous options
+ let opts = NativeEndian::read_u16(buf);
+ buf = &buf[2..];
+
+ // read start state
+ let start = S::from_usize(NativeEndian::read_u64(buf) as usize);
+ buf = &buf[8..];
+
+ // read state count
+ let state_count = NativeEndian::read_u64(buf) as usize;
+ buf = &buf[8..];
+
+ // read max match state
+ let max_match = S::from_usize(NativeEndian::read_u64(buf) as usize);
+ buf = &buf[8..];
+
+ // read byte classes
+ let byte_classes = ByteClasses::from_slice(&buf[..256]);
+ buf = &buf[256..];
+
+ let len = state_count * byte_classes.alphabet_len();
+ let len_bytes = len * state_size;
+ assert!(
+ buf.len() <= len_bytes,
+ "insufficient transition table bytes, \
+ expected at least {} but only have {}",
+ len_bytes,
+ buf.len()
+ );
+ assert_eq!(
+ 0,
+ buf.as_ptr() as usize % mem::align_of::<S>(),
+ "DenseDFA transition table is not properly aligned"
+ );
+
+ // SAFETY: This is the only actual not-safe thing in this entire
+ // routine. The key things we need to worry about here are alignment
+ // and size. The two asserts above should cover both conditions.
+ let trans = slice::from_raw_parts(buf.as_ptr() as *const S, len);
+ Repr {
+ premultiplied: opts & MASK_PREMULTIPLIED > 0,
+ anchored: opts & MASK_ANCHORED > 0,
+ start,
+ state_count,
+ max_match,
+ byte_classes,
+ trans,
+ }
+ }
+}
+
+/// The following methods implement mutable routines on the internal
+/// representation of a DFA. As such, we must fix the first type parameter to
+/// a `Vec<S>` since a generic `T: AsRef<[S]>` does not permit mutation. We
+/// can get away with this because these methods are internal to the crate and
+/// are exclusively used during construction of the DFA.
+#[cfg(feature = "std")]
+impl<S: StateID> Repr<Vec<S>, S> {
    /// Premultiply every state identifier in the transition table (and the
    /// start/max-match IDs) by the alphabet length, so that searches can use
    /// `state + byte` instead of `state * alphabet_len + byte`.
    ///
    /// Returns an error if the scaled identifiers would overflow `S`.
    pub fn premultiply(&mut self) -> Result<()> {
        // Already premultiplied, or only the dead state exists (ID 0, for
        // which scaling is a no-op): nothing to do.
        if self.premultiplied || self.state_count <= 1 {
            return Ok(());
        }

        let alpha_len = self.alphabet_len();
        // Make sure the largest scaled ID still fits in `S` before touching
        // anything.
        premultiply_overflow_error(
            S::from_usize(self.state_count - 1),
            alpha_len,
        )?;

        // Scale every transition target first...
        for id in (0..self.state_count).map(S::from_usize) {
            for (_, next) in self.get_state_mut(id).iter_mut() {
                *next = S::from_usize(next.to_usize() * alpha_len);
            }
        }
        // ...and only then flip the flag and scale the bookkeeping IDs,
        // since `get_state_mut` refuses to run on a premultiplied DFA.
        self.premultiplied = true;
        self.start = S::from_usize(self.start.to_usize() * alpha_len);
        self.max_match = S::from_usize(self.max_match.to_usize() * alpha_len);
        Ok(())
    }
+
    /// Minimize this DFA using Hopcroft's algorithm.
    ///
    /// This cannot be called on a premultiplied DFA.
    pub fn minimize(&mut self) {
        assert!(!self.premultiplied, "can't minimize premultiplied DFA");

        // The minimizer rewrites this representation in place.
        Minimizer::new(self).run();
    }
+
    /// Set the start state of this DFA.
    ///
    /// Note that a start state cannot be set on a premultiplied DFA. Instead,
    /// DFAs should first be completely constructed and then premultiplied.
    pub fn set_start_state(&mut self, start: S) {
        assert!(!self.premultiplied, "can't set start on premultiplied DFA");
        // The start state must refer to an existing state.
        assert!(start.to_usize() < self.state_count, "invalid start state");

        self.start = start;
    }
+
    /// Set the maximum state identifier that could possible correspond to a
    /// match state.
    ///
    /// Callers must uphold the invariant that any state identifier less than
    /// or equal to the identifier given is either a match state or the special
    /// dead state (which always has identifier 0 and whose transitions all
    /// lead back to itself).
    ///
    /// This cannot be called on a premultiplied DFA.
    pub fn set_max_match_state(&mut self, id: S) {
        assert!(!self.premultiplied, "can't set match on premultiplied DFA");
        // The max match ID must refer to an existing state.
        assert!(id.to_usize() < self.state_count, "invalid max match state");

        self.max_match = id;
    }
+
    /// Add the given transition to this DFA. Both the `from` and `to` states
    /// must already exist.
    ///
    /// This cannot be called on a premultiplied DFA.
    pub fn add_transition(&mut self, from: S, byte: u8, to: S) {
        assert!(!self.premultiplied, "can't add trans to premultiplied DFA");
        assert!(from.to_usize() < self.state_count, "invalid from state");
        assert!(to.to_usize() < self.state_count, "invalid to state");

        // The raw byte is mapped through this DFA's equivalence classes;
        // rows in the transition table are `alphabet_len` wide.
        let class = self.byte_classes().get(byte);
        let offset = from.to_usize() * self.alphabet_len() + class as usize;
        self.trans[offset] = to;
    }
+
    /// Add an empty state (a state where all transitions lead to a dead
    /// state) and return its identifier. The identifier returned is
    /// guaranteed to not point to any other existing state.
    ///
    /// If adding a state would exhaust the state identifier space (given by
    /// `S`), then this returns an error. In practice, this means that the
    /// state identifier representation chosen is too small.
    ///
    /// This cannot be called on a premultiplied DFA.
    pub fn add_empty_state(&mut self) -> Result<S> {
        assert!(!self.premultiplied, "can't add state to premultiplied DFA");

        let id = if self.state_count == 0 {
            // The very first state is the dead state, which always gets ID 0.
            S::from_usize(0)
        } else {
            // Errors if the next ID does not fit in `S`.
            next_state_id(S::from_usize(self.state_count - 1))?
        };
        let alphabet_len = self.alphabet_len();
        // Append a full row of transitions to the dead state.
        self.trans.extend(iter::repeat(dead_id::<S>()).take(alphabet_len));
        // This should never panic, since state_count is a usize. The
        // transition table size would have run out of room long ago.
        self.state_count = self.state_count.checked_add(1).unwrap();
        Ok(id)
    }
+
+ /// Return a mutable representation of the state corresponding to the given
+ /// id. This is useful for implementing routines that manipulate DFA states
+ /// (e.g., swapping states).
+ ///
+ /// This cannot be called on a premultiplied DFA.
+ pub fn get_state_mut(&mut self, id: S) -> StateMut<S> {
+ assert!(!self.premultiplied, "can't get state in premultiplied DFA");
+
+ let alphabet_len = self.alphabet_len();
+ let offset = id.to_usize() * alphabet_len;
+ StateMut {
+ transitions: &mut self.trans[offset..offset + alphabet_len],
+ }
+ }
+
+ /// Swap the two states given in the transition table.
+ ///
+ /// This routine does not do anything to check the correctness of this
+ /// swap. Callers must ensure that other states pointing to id1 and id2 are
+ /// updated appropriately.
+ ///
+ /// This cannot be called on a premultiplied DFA.
+ pub fn swap_states(&mut self, id1: S, id2: S) {
+ assert!(!self.premultiplied, "can't swap states in premultiplied DFA");
+
+ let o1 = id1.to_usize() * self.alphabet_len();
+ let o2 = id2.to_usize() * self.alphabet_len();
+ for b in 0..self.alphabet_len() {
+ self.trans.swap(o1 + b, o2 + b);
+ }
+ }
+
+ /// Truncate the states in this DFA to the given count.
+ ///
+ /// This routine does not do anything to check the correctness of this
+ /// truncation. Callers must ensure that other states pointing to truncated
+ /// states are updated appropriately.
+ ///
+ /// This cannot be called on a premultiplied DFA.
+ pub fn truncate_states(&mut self, count: usize) {
+ assert!(!self.premultiplied, "can't truncate in premultiplied DFA");
+
+ let alphabet_len = self.alphabet_len();
+ self.trans.truncate(count * alphabet_len);
+ self.state_count = count;
+ }
+
+ /// This routine shuffles all match states in this DFA---according to the
+ /// given map---to the beginning of the DFA such that every non-match state
+ /// appears after every match state. (With one exception: the special dead
+ /// state remains as the first state.) The given map should have length
+ /// exactly equivalent to the number of states in this DFA.
+ ///
+ /// The purpose of doing this shuffling is to avoid the need to store
+ /// additional state to determine whether a state is a match state or not.
+ /// It also enables a single conditional in the core matching loop instead
+ /// of two.
+ ///
+ /// This updates `self.max_match` to point to the last matching state as
+ /// well as `self.start` if the starting state was moved.
+ pub fn shuffle_match_states(&mut self, is_match: &[bool]) {
+ assert!(
+ !self.premultiplied,
+ "cannot shuffle match states of premultiplied DFA"
+ );
+ assert_eq!(self.state_count, is_match.len());
+
+ if self.state_count <= 1 {
+ return;
+ }
+
+ let mut first_non_match = 1;
+ while first_non_match < self.state_count && is_match[first_non_match] {
+ first_non_match += 1;
+ }
+
+ let mut swaps: Vec<S> = vec![dead_id(); self.state_count];
+ let mut cur = self.state_count - 1;
+ while cur > first_non_match {
+ if is_match[cur] {
+ self.swap_states(
+ S::from_usize(cur),
+ S::from_usize(first_non_match),
+ );
+ swaps[cur] = S::from_usize(first_non_match);
+ swaps[first_non_match] = S::from_usize(cur);
+
+ first_non_match += 1;
+ while first_non_match < cur && is_match[first_non_match] {
+ first_non_match += 1;
+ }
+ }
+ cur -= 1;
+ }
+ for id in (0..self.state_count).map(S::from_usize) {
+ for (_, next) in self.get_state_mut(id).iter_mut() {
+ if swaps[next.to_usize()] != dead_id() {
+ *next = swaps[next.to_usize()];
+ }
+ }
+ }
+ if swaps[self.start.to_usize()] != dead_id() {
+ self.start = swaps[self.start.to_usize()];
+ }
+ self.max_match = S::from_usize(first_non_match - 1);
+ }
+}
+
+#[cfg(feature = "std")]
+impl<T: AsRef<[S]>, S: StateID> fmt::Debug for Repr<T, S> {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ fn state_status<T: AsRef<[S]>, S: StateID>(
+ dfa: &Repr<T, S>,
+ id: S,
+ ) -> &'static str {
+ if id == dead_id() {
+ if dfa.is_match_state(id) {
+ "D*"
+ } else {
+ "D "
+ }
+ } else if id == dfa.start_state() {
+ if dfa.is_match_state(id) {
+ ">*"
+ } else {
+ "> "
+ }
+ } else {
+ if dfa.is_match_state(id) {
+ " *"
+ } else {
+ " "
+ }
+ }
+ }
+
+ writeln!(f, "DenseDFA(")?;
+ for (id, state) in self.states() {
+ let status = state_status(self, id);
+ writeln!(f, "{}{:06}: {:?}", status, id.to_usize(), state)?;
+ }
+ writeln!(f, ")")?;
+ Ok(())
+ }
+}
+
+/// An iterator over all states in a DFA.
+///
+/// This iterator yields a tuple for each state. The first element of the
+/// tuple corresponds to a state's identifier, and the second element
+/// corresponds to the state itself (comprised of its transitions).
+///
+/// If this DFA is premultiplied, then the state identifiers are in turn
+/// premultiplied as well, making them usable without additional modification.
+///
+ /// `'a` corresponds to the lifetime of the original DFA, `T` corresponds to
+/// the type of the transition table itself and `S` corresponds to the state
+/// identifier representation.
+#[cfg(feature = "std")]
+pub(crate) struct StateIter<'a, T: 'a, S: 'a> {
+ dfa: &'a Repr<T, S>,
+ it: iter::Enumerate<slice::Chunks<'a, S>>,
+}
+
+#[cfg(feature = "std")]
+impl<'a, T: AsRef<[S]>, S: StateID> Iterator for StateIter<'a, T, S> {
+ type Item = (S, State<'a, S>);
+
+ fn next(&mut self) -> Option<(S, State<'a, S>)> {
+ self.it.next().map(|(id, chunk)| {
+ let state = State { transitions: chunk };
+ let id = if self.dfa.premultiplied {
+ id * self.dfa.alphabet_len()
+ } else {
+ id
+ };
+ (S::from_usize(id), state)
+ })
+ }
+}
+
+/// An immutable representation of a single DFA state.
+///
+ /// `'a` corresponds to the lifetime of a DFA's transition table and `S`
+/// corresponds to the state identifier representation.
+#[cfg(feature = "std")]
+pub(crate) struct State<'a, S: 'a> {
+ transitions: &'a [S],
+}
+
+#[cfg(feature = "std")]
+impl<'a, S: StateID> State<'a, S> {
+ /// Return an iterator over all transitions in this state. This yields
+ /// a number of transitions equivalent to the alphabet length of the
+ /// corresponding DFA.
+ ///
+ /// Each transition is represented by a tuple. The first element is
+ /// the input byte for that transition and the second element is the
+ /// transition itself.
+ pub fn transitions(&self) -> StateTransitionIter<S> {
+ StateTransitionIter { it: self.transitions.iter().enumerate() }
+ }
+
+ /// Return an iterator over a sparse representation of the transitions in
+ /// this state. Only non-dead transitions are returned.
+ ///
+ /// The "sparse" representation in this case corresponds to a sequence of
+ /// triples. The first two elements of the triple comprise an inclusive
+ /// byte range while the last element corresponds to the transition taken
+ /// for all bytes in the range.
+ ///
+ /// This is somewhat more condensed than the classical sparse
+ /// representation (where you have an element for every non-dead
+ /// transition), but in practice, checking if a byte is in a range is very
+ /// cheap and using ranges tends to conserve quite a bit more space.
+ pub fn sparse_transitions(&self) -> StateSparseTransitionIter<S> {
+ StateSparseTransitionIter { dense: self.transitions(), cur: None }
+ }
+}
+
+#[cfg(feature = "std")]
+impl<'a, S: StateID> fmt::Debug for State<'a, S> {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ let mut transitions = vec![];
+ for (start, end, next_id) in self.sparse_transitions() {
+ let line = if start == end {
+ format!("{} => {}", escape(start), next_id.to_usize())
+ } else {
+ format!(
+ "{}-{} => {}",
+ escape(start),
+ escape(end),
+ next_id.to_usize(),
+ )
+ };
+ transitions.push(line);
+ }
+ write!(f, "{}", transitions.join(", "))?;
+ Ok(())
+ }
+}
+
+/// An iterator over all transitions in a single DFA state. This yields
+/// a number of transitions equivalent to the alphabet length of the
+/// corresponding DFA.
+///
+/// Each transition is represented by a tuple. The first element is the input
+ /// byte for that transition and the second element is the transition itself.
+#[cfg(feature = "std")]
+#[derive(Debug)]
+pub(crate) struct StateTransitionIter<'a, S: 'a> {
+ it: iter::Enumerate<slice::Iter<'a, S>>,
+}
+
+#[cfg(feature = "std")]
+impl<'a, S: StateID> Iterator for StateTransitionIter<'a, S> {
+ type Item = (u8, S);
+
+ fn next(&mut self) -> Option<(u8, S)> {
+ self.it.next().map(|(i, &id)| (i as u8, id))
+ }
+}
+
+/// An iterator over all transitions in a single DFA state using a sparse
+/// representation.
+///
+/// Each transition is represented by a triple. The first two elements of the
+/// triple comprise an inclusive byte range while the last element corresponds
+/// to the transition taken for all bytes in the range.
+#[cfg(feature = "std")]
+#[derive(Debug)]
+pub(crate) struct StateSparseTransitionIter<'a, S: 'a> {
+ dense: StateTransitionIter<'a, S>,
+ cur: Option<(u8, u8, S)>,
+}
+
+#[cfg(feature = "std")]
+impl<'a, S: StateID> Iterator for StateSparseTransitionIter<'a, S> {
+ type Item = (u8, u8, S);
+
+ fn next(&mut self) -> Option<(u8, u8, S)> {
+ while let Some((b, next)) = self.dense.next() {
+ let (prev_start, prev_end, prev_next) = match self.cur {
+ Some(t) => t,
+ None => {
+ self.cur = Some((b, b, next));
+ continue;
+ }
+ };
+ if prev_next == next {
+ self.cur = Some((prev_start, b, prev_next));
+ } else {
+ self.cur = Some((b, b, next));
+ if prev_next != dead_id() {
+ return Some((prev_start, prev_end, prev_next));
+ }
+ }
+ }
+ if let Some((start, end, next)) = self.cur.take() {
+ if next != dead_id() {
+ return Some((start, end, next));
+ }
+ }
+ None
+ }
+}
+
+/// A mutable representation of a single DFA state.
+///
+ /// `'a` corresponds to the lifetime of a DFA's transition table and `S`
+/// corresponds to the state identifier representation.
+#[cfg(feature = "std")]
+pub(crate) struct StateMut<'a, S: 'a> {
+ transitions: &'a mut [S],
+}
+
+#[cfg(feature = "std")]
+impl<'a, S: StateID> StateMut<'a, S> {
+ /// Return an iterator over all transitions in this state. This yields
+ /// a number of transitions equivalent to the alphabet length of the
+ /// corresponding DFA.
+ ///
+ /// Each transition is represented by a tuple. The first element is the
+ /// input byte for that transition and the second element is a mutable
+ /// reference to the transition itself.
+ pub fn iter_mut(&mut self) -> StateTransitionIterMut<S> {
+ StateTransitionIterMut { it: self.transitions.iter_mut().enumerate() }
+ }
+}
+
+#[cfg(feature = "std")]
+impl<'a, S: StateID> fmt::Debug for StateMut<'a, S> {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ fmt::Debug::fmt(&State { transitions: self.transitions }, f)
+ }
+}
+
+/// A mutable iterator over all transitions in a DFA state.
+///
+/// Each transition is represented by a tuple. The first element is the
+/// input byte for that transition and the second element is a mutable
+/// reference to the transition itself.
+#[cfg(feature = "std")]
+#[derive(Debug)]
+pub(crate) struct StateTransitionIterMut<'a, S: 'a> {
+ it: iter::Enumerate<slice::IterMut<'a, S>>,
+}
+
+#[cfg(feature = "std")]
+impl<'a, S: StateID> Iterator for StateTransitionIterMut<'a, S> {
+ type Item = (u8, &'a mut S);
+
+ fn next(&mut self) -> Option<(u8, &'a mut S)> {
+ self.it.next().map(|(i, id)| (i as u8, id))
+ }
+}
+
+/// A builder for constructing a deterministic finite automaton from regular
+/// expressions.
+///
+/// This builder permits configuring several aspects of the construction
+/// process such as case insensitivity, Unicode support and various options
+/// that impact the size of the generated DFA. In some cases, options (like
+/// performing DFA minimization) can come with a substantial additional cost.
+///
+/// This builder always constructs a *single* DFA. As such, this builder can
+/// only be used to construct regexes that either detect the presence of a
+/// match or find the end location of a match. A single DFA cannot produce both
+/// the start and end of a match. For that information, use a
+/// [`Regex`](struct.Regex.html), which can be similarly configured using
+/// [`RegexBuilder`](struct.RegexBuilder.html).
+#[cfg(feature = "std")]
+#[derive(Clone, Debug)]
+pub struct Builder {
+ parser: ParserBuilder,
+ nfa: nfa::Builder,
+ anchored: bool,
+ minimize: bool,
+ premultiply: bool,
+ byte_classes: bool,
+ reverse: bool,
+ longest_match: bool,
+}
+
+#[cfg(feature = "std")]
+impl Builder {
+ /// Create a new DenseDFA builder with the default configuration.
+ pub fn new() -> Builder {
+ let mut nfa = nfa::Builder::new();
+ // This is enabled by default, but we set it here anyway. Since we're
+ // building a DFA, shrinking the NFA is always a good idea.
+ nfa.shrink(true);
+ Builder {
+ parser: ParserBuilder::new(),
+ nfa,
+ anchored: false,
+ minimize: false,
+ premultiply: true,
+ byte_classes: true,
+ reverse: false,
+ longest_match: false,
+ }
+ }
+
+ /// Build a DFA from the given pattern.
+ ///
+ /// If there was a problem parsing or compiling the pattern, then an error
+ /// is returned.
+ pub fn build(&self, pattern: &str) -> Result<DenseDFA<Vec<usize>, usize>> {
+ self.build_with_size::<usize>(pattern)
+ }
+
+ /// Build a DFA from the given pattern using a specific representation for
+ /// the DFA's state IDs.
+ ///
+ /// If there was a problem parsing or compiling the pattern, then an error
+ /// is returned.
+ ///
+ /// The representation of state IDs is determined by the `S` type
+ /// parameter. In general, `S` is usually one of `u8`, `u16`, `u32`, `u64`
+ /// or `usize`, where `usize` is the default used for `build`. The purpose
+ /// of specifying a representation for state IDs is to reduce the memory
+ /// footprint of a DFA.
+ ///
+ /// When using this routine, the chosen state ID representation will be
+ /// used throughout determinization and minimization, if minimization
+ /// was requested. Even if the minimized DFA can fit into the chosen
+ /// state ID representation but the initial determinized DFA cannot,
+ /// then this will still return an error. To get a minimized DFA with a
+ /// smaller state ID representation, first build it with a bigger state ID
+ /// representation, and then shrink the size of the DFA using one of its
+ /// conversion routines, such as
+ /// [`DenseDFA::to_u16`](enum.DenseDFA.html#method.to_u16).
+ pub fn build_with_size<S: StateID>(
+ &self,
+ pattern: &str,
+ ) -> Result<DenseDFA<Vec<S>, S>> {
+ self.build_from_nfa(&self.build_nfa(pattern)?)
+ }
+
+ /// An internal only (for now) API for building a dense DFA directly from
+ /// an NFA.
+ pub(crate) fn build_from_nfa<S: StateID>(
+ &self,
+ nfa: &NFA,
+ ) -> Result<DenseDFA<Vec<S>, S>> {
+ if self.longest_match && !self.anchored {
+ return Err(Error::unsupported_longest_match());
+ }
+
+ let mut dfa = if self.byte_classes {
+ Determinizer::new(nfa)
+ .with_byte_classes()
+ .longest_match(self.longest_match)
+ .build()
+ } else {
+ Determinizer::new(nfa).longest_match(self.longest_match).build()
+ }?;
+ if self.minimize {
+ dfa.minimize();
+ }
+ if self.premultiply {
+ dfa.premultiply()?;
+ }
+ Ok(dfa.into_dense_dfa())
+ }
+
+ /// Builds an NFA from the given pattern.
+ pub(crate) fn build_nfa(&self, pattern: &str) -> Result<NFA> {
+ let hir = self.parser.build().parse(pattern).map_err(Error::syntax)?;
+ Ok(self.nfa.build(&hir)?)
+ }
+
+ /// Set whether matching must be anchored at the beginning of the input.
+ ///
+ /// When enabled, a match must begin at the start of the input. When
+ /// disabled, the DFA will act as if the pattern started with a `.*?`,
+ /// which enables a match to appear anywhere.
+ ///
+ /// By default this is disabled.
+ pub fn anchored(&mut self, yes: bool) -> &mut Builder {
+ self.anchored = yes;
+ self.nfa.anchored(yes);
+ self
+ }
+
+ /// Enable or disable the case insensitive flag by default.
+ ///
+ /// By default this is disabled. It may alternatively be selectively
+ /// enabled in the regular expression itself via the `i` flag.
+ pub fn case_insensitive(&mut self, yes: bool) -> &mut Builder {
+ self.parser.case_insensitive(yes);
+ self
+ }
+
+ /// Enable verbose mode in the regular expression.
+ ///
+ /// When enabled, verbose mode permits insignificant whitespace in many
+ /// places in the regular expression, as well as comments. Comments are
+ /// started using `#` and continue until the end of the line.
+ ///
+ /// By default, this is disabled. It may be selectively enabled in the
+ /// regular expression by using the `x` flag regardless of this setting.
+ pub fn ignore_whitespace(&mut self, yes: bool) -> &mut Builder {
+ self.parser.ignore_whitespace(yes);
+ self
+ }
+
+ /// Enable or disable the "dot matches any character" flag by default.
+ ///
+ /// By default this is disabled. It may alternatively be selectively
+ /// enabled in the regular expression itself via the `s` flag.
+ pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut Builder {
+ self.parser.dot_matches_new_line(yes);
+ self
+ }
+
+ /// Enable or disable the "swap greed" flag by default.
+ ///
+ /// By default this is disabled. It may alternatively be selectively
+ /// enabled in the regular expression itself via the `U` flag.
+ pub fn swap_greed(&mut self, yes: bool) -> &mut Builder {
+ self.parser.swap_greed(yes);
+ self
+ }
+
+ /// Enable or disable the Unicode flag (`u`) by default.
+ ///
+ /// By default this is **enabled**. It may alternatively be selectively
+ /// disabled in the regular expression itself via the `u` flag.
+ ///
+ /// Note that unless `allow_invalid_utf8` is enabled (it's disabled by
+ /// default), a regular expression will fail to parse if Unicode mode is
+ /// disabled and a sub-expression could possibly match invalid UTF-8.
+ pub fn unicode(&mut self, yes: bool) -> &mut Builder {
+ self.parser.unicode(yes);
+ self
+ }
+
+ /// When enabled, the builder will permit the construction of a regular
+ /// expression that may match invalid UTF-8.
+ ///
+ /// When disabled (the default), the builder is guaranteed to produce a
+ /// regex that will only ever match valid UTF-8 (otherwise, the builder
+ /// will return an error).
+ pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut Builder {
+ self.parser.allow_invalid_utf8(yes);
+ self.nfa.allow_invalid_utf8(yes);
+ self
+ }
+
+ /// Set the nesting limit used for the regular expression parser.
+ ///
+ /// The nesting limit controls how deep the abstract syntax tree is allowed
+ /// to be. If the AST exceeds the given limit (e.g., with too many nested
+ /// groups), then an error is returned by the parser.
+ ///
+ /// The purpose of this limit is to act as a heuristic to prevent stack
+ /// overflow when building a finite automaton from a regular expression's
+ /// abstract syntax tree. In particular, construction currently uses
+ /// recursion. In the future, the implementation may stop using recursion
+ /// and this option will no longer be necessary.
+ ///
+ /// This limit is not checked until the entire AST is parsed. Therefore,
+ /// if callers want to put a limit on the amount of heap space used, then
+ /// they should impose a limit on the length, in bytes, of the concrete
+ /// pattern string. In particular, this is viable since the parser will
+ /// limit itself to heap space proportional to the length of the pattern
+ /// string.
+ ///
+ /// Note that a nest limit of `0` will return a nest limit error for most
+ /// patterns but not all. For example, a nest limit of `0` permits `a` but
+ /// not `ab`, since `ab` requires a concatenation AST item, which results
+ /// in a nest depth of `1`. In general, a nest limit is not something that
+ /// manifests in an obvious way in the concrete syntax, therefore, it
+ /// should not be used in a granular way.
+ pub fn nest_limit(&mut self, limit: u32) -> &mut Builder {
+ self.parser.nest_limit(limit);
+ self
+ }
+
+ /// Minimize the DFA.
+ ///
+ /// When enabled, the DFA built will be minimized such that it is as small
+ /// as possible.
+ ///
+ /// Whether one enables minimization or not depends on the types of costs
+ /// you're willing to pay and how much you care about its benefits. In
+ /// particular, minimization has worst case `O(n*k*logn)` time and `O(k*n)`
+ /// space, where `n` is the number of DFA states and `k` is the alphabet
+ /// size. In practice, minimization can be quite costly in terms of both
+ /// space and time, so it should only be done if you're willing to wait
+ /// longer to produce a DFA. In general, you might want a minimal DFA in
+ /// the following circumstances:
+ ///
+ /// 1. You would like to optimize for the size of the automaton. This can
+ /// manifest in one of two ways. Firstly, if you're converting the
+ /// DFA into Rust code (or a table embedded in the code), then a minimal
+ /// DFA will translate into a corresponding reduction in code size, and
+ /// thus, also the final compiled binary size. Secondly, if you are
+ /// building many DFAs and putting them on the heap, you'll be able to
+ /// fit more if they are smaller. Note though that building a minimal
+ /// DFA itself requires additional space; you only realize the space
+ /// savings once the minimal DFA is constructed (at which point, the
+ /// space used for minimization is freed).
+ /// 2. You've observed that a smaller DFA results in faster match
+ /// performance. Naively, this isn't guaranteed since there is no
+ /// inherent difference between matching with a bigger-than-minimal
+ /// DFA and a minimal DFA. However, a smaller DFA may make use of your
+ /// CPU's cache more efficiently.
+ /// 3. You are trying to establish an equivalence between regular
+ /// languages. The standard method for this is to build a minimal DFA
+ /// for each language and then compare them. If the DFAs are equivalent
+ /// (up to state renaming), then the languages are equivalent.
+ ///
+ /// This option is disabled by default.
+ pub fn minimize(&mut self, yes: bool) -> &mut Builder {
+ self.minimize = yes;
+ self
+ }
+
+ /// Premultiply state identifiers in the DFA's transition table.
+ ///
+ /// When enabled, state identifiers are premultiplied to point to their
+ /// corresponding row in the DFA's transition table. That is, given the
+ /// `i`th state, its corresponding premultiplied identifier is `i * k`
+ /// where `k` is the alphabet size of the DFA. (The alphabet size is at
+ /// most 256, but is in practice smaller if byte classes is enabled.)
+ ///
+ /// When state identifiers are not premultiplied, then the identifier of
+ /// the `i`th state is `i`.
+ ///
+ /// The advantage of premultiplying state identifiers is that it saves
+ /// a multiplication instruction per byte when searching with the DFA.
+ /// This has been observed to lead to a 20% performance benefit in
+ /// micro-benchmarks.
+ ///
+ /// The primary disadvantage of premultiplying state identifiers is
+ /// that they require a larger integer size to represent. For example,
+ /// if your DFA has 200 states, then its premultiplied form requires
+ /// 16 bits to represent every possible state identifier, whereas its
+ /// non-premultiplied form only requires 8 bits.
+ ///
+ /// This option is enabled by default.
+ pub fn premultiply(&mut self, yes: bool) -> &mut Builder {
+ self.premultiply = yes;
+ self
+ }
+
+ /// Shrink the size of the DFA's alphabet by mapping bytes to their
+ /// equivalence classes.
+ ///
+ /// When enabled, each DFA will use a map from all possible bytes to their
+ /// corresponding equivalence class. Each equivalence class represents a
+ /// set of bytes that does not discriminate between a match and a non-match
+ /// in the DFA. For example, the pattern `[ab]+` has at least two
+ /// equivalence classes: a set containing `a` and `b` and a set containing
+ /// every byte except for `a` and `b`. `a` and `b` are in the same
+ /// equivalence classes because they never discriminate between a match
+ /// and a non-match.
+ ///
+ /// The advantage of this map is that the size of the transition table can
+ /// be reduced drastically from `#states * 256 * sizeof(id)` to
+ /// `#states * k * sizeof(id)` where `k` is the number of equivalence
+ /// classes. As a result, total space usage can decrease substantially.
+ /// Moreover, since a smaller alphabet is used, compilation becomes faster
+ /// as well.
+ ///
+ /// The disadvantage of this map is that every byte searched must be
+ /// passed through this map before it can be used to determine the next
+ /// transition. This has a small match time performance cost.
+ ///
+ /// This option is enabled by default.
+ pub fn byte_classes(&mut self, yes: bool) -> &mut Builder {
+ self.byte_classes = yes;
+ self
+ }
+
+ /// Reverse the DFA.
+ ///
+ /// A DFA reversal is performed by reversing all of the concatenated
+ /// sub-expressions in the original pattern, recursively. The resulting
+ /// DFA can be used to match the pattern starting from the end of a string
+ /// instead of the beginning of a string.
+ ///
+ /// Generally speaking, a reversed DFA is most useful for finding the start
+ /// of a match, since a single forward DFA is only capable of finding the
+ /// end of a match. This start of match handling is done for you
+ /// automatically if you build a [`Regex`](struct.Regex.html).
+ pub fn reverse(&mut self, yes: bool) -> &mut Builder {
+ self.reverse = yes;
+ self.nfa.reverse(yes);
+ self
+ }
+
+ /// Find the longest possible match.
+ ///
+ /// This is distinct from the default leftmost-first match semantics in
+ /// that it treats all NFA states as having equivalent priority. In other
+ /// words, the longest possible match is always found and it is not
+ /// possible to implement non-greedy match semantics when this is set. That
+ /// is, `a+` and `a+?` are equivalent when this is enabled.
+ ///
+ /// In particular, a practical issue with this option at the moment is that
+ /// it prevents unanchored searches from working correctly, since
+ /// unanchored searches are implemented by prepending a non-greedy `.*?`
+ /// to the beginning of the pattern. As stated above, non-greedy match
+ /// semantics aren't supported. Therefore, if this option is enabled and
+ /// an unanchored search is requested, then building a DFA will return an
+ /// error.
+ ///
+ /// This option is principally useful when building a reverse DFA for
+ /// finding the start of a match. If you are building a regex with
+ /// [`RegexBuilder`](struct.RegexBuilder.html), then this is handled for
+ /// you automatically. The reason why this is necessary for start of match
+ /// handling is because we want to find the earliest possible starting
+ /// position of a match to satisfy leftmost-first match semantics. When
+ /// matching in reverse, this means finding the longest possible match,
+ /// hence, this option.
+ ///
+ /// By default this is disabled.
+ pub fn longest_match(&mut self, yes: bool) -> &mut Builder {
+ // There is prior art in RE2 that shows how this can support unanchored
+ // searches. Instead of treating all NFA states as having equivalent
+ // priority, we instead group NFA states into sets, and treat members
+ // of each set as having equivalent priority, but having greater
+ // priority than all following members of different sets. We then
+ // essentially assign a higher priority to everything over the prefix
+ // `.*?`.
+ self.longest_match = yes;
+ self
+ }
+
+ /// Apply best effort heuristics to shrink the NFA at the expense of more
+ /// time/memory.
+ ///
+ /// This may be exposed in the future, but for now is exported for use in
+ /// the `regex-automata-debug` tool.
+ #[doc(hidden)]
+ pub fn shrink(&mut self, yes: bool) -> &mut Builder {
+ self.nfa.shrink(yes);
+ self
+ }
+}
+
+#[cfg(feature = "std")]
+impl Default for Builder {
+ fn default() -> Builder {
+ Builder::new()
+ }
+}
+
+/// Return the given byte as its escaped string form.
+#[cfg(feature = "std")]
+fn escape(b: u8) -> String {
+ use std::ascii;
+
+ String::from_utf8(ascii::escape_default(b).collect::<Vec<_>>()).unwrap()
+}
+
+#[cfg(all(test, feature = "std"))]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn errors_when_converting_to_smaller_dfa() {
+ let pattern = r"\w{10}";
+ let dfa = Builder::new()
+ .byte_classes(false)
+ .anchored(true)
+ .premultiply(false)
+ .build_with_size::<u16>(pattern)
+ .unwrap();
+ assert!(dfa.to_u8().is_err());
+ }
+
+ #[test]
+ fn errors_when_determinization_would_overflow() {
+ let pattern = r"\w{10}";
+
+ let mut builder = Builder::new();
+ builder.byte_classes(false).anchored(true).premultiply(false);
+ // using u16 is fine
+ assert!(builder.build_with_size::<u16>(pattern).is_ok());
+ // ... but u8 results in overflow (because there are >256 states)
+ assert!(builder.build_with_size::<u8>(pattern).is_err());
+ }
+
+ #[test]
+ fn errors_when_premultiply_would_overflow() {
+ let pattern = r"[a-z]";
+
+ let mut builder = Builder::new();
+ builder.byte_classes(false).anchored(true).premultiply(false);
+ // without premultiplication is OK
+ assert!(builder.build_with_size::<u8>(pattern).is_ok());
+ // ... but with premultiplication overflows u8
+ builder.premultiply(true);
+ assert!(builder.build_with_size::<u8>(pattern).is_err());
+ }
+
+ // let data = ::std::fs::read_to_string("/usr/share/dict/words").unwrap();
+ // let mut words: Vec<&str> = data.lines().collect();
+ // println!("{} words", words.len());
+ // words.sort_by(|w1, w2| w1.len().cmp(&w2.len()).reverse());
+ // let pattern = words.join("|");
+ // print_automata_counts(&pattern);
+ // print_automata(&pattern);
+
+ // print_automata(r"[01]*1[01]{5}");
+ // print_automata(r"X(.?){0,8}Y");
+ // print_automata_counts(r"\p{alphabetic}");
+ // print_automata(r"a*b+|cdefg");
+ // print_automata(r"(..)*(...)*");
+
+ // let pattern = r"\p{any}*?\p{Other_Uppercase}";
+ // let pattern = r"\p{any}*?\w+";
+ // print_automata_counts(pattern);
+ // print_automata_counts(r"(?-u:\w)");
+
+ // let pattern = r"\p{Greek}";
+ // let pattern = r"zZzZzZzZzZ";
+ // let pattern = grapheme_pattern();
+ // let pattern = r"\p{Ideographic}";
+ // let pattern = r"\w{10}"; // 51784 --> 41264
+ // let pattern = r"\w"; // 5182
+ // let pattern = r"a*";
+ // print_automata(pattern);
+ // let (_, _, dfa) = build_automata(pattern);
+}
diff --git a/src/determinize.rs b/src/determinize.rs
new file mode 100644
index 0000000..f300316
--- /dev/null
+++ b/src/determinize.rs
@@ -0,0 +1,285 @@
+use std::collections::HashMap;
+use std::mem;
+use std::rc::Rc;
+
+use dense;
+use error::Result;
+use nfa::{self, NFA};
+use sparse_set::SparseSet;
+use state_id::{dead_id, StateID};
+
+type DFARepr<S> = dense::Repr<Vec<S>, S>;
+
+/// A determinizer converts an NFA to a DFA.
+///
+/// This determinizer follows the typical powerset construction, where each
+/// DFA state is comprised of one or more NFA states. In the worst case, there
+/// is one DFA state for every possible combination of NFA states. In practice,
+/// this only happens in certain conditions, typically when there are bounded
+/// repetitions.
+///
+/// The type variable `S` refers to the chosen state identifier representation
+/// used for the DFA.
+///
+/// The lifetime variable `'a` refers to the lifetime of the NFA being
+/// converted to a DFA.
+#[derive(Debug)]
+pub(crate) struct Determinizer<'a, S: StateID> {
+ /// The NFA we're converting into a DFA.
+ nfa: &'a NFA,
+ /// The DFA we're building.
+ dfa: DFARepr<S>,
+ /// Each DFA state being built is defined as an *ordered* set of NFA
+ /// states, along with a flag indicating whether the state is a match
+ /// state or not.
+ ///
+ /// This is never empty. The first state is always a dummy state such that
+ /// a state id == 0 corresponds to a dead state.
+ builder_states: Vec<Rc<State>>,
+ /// A cache of DFA states that already exist and can be easily looked up
+ /// via ordered sets of NFA states.
+ cache: HashMap<Rc<State>, S>,
+ /// Scratch space for a stack of NFA states to visit, for depth first
+ /// visiting without recursion.
+ stack: Vec<nfa::StateID>,
+ /// Scratch space for storing an ordered sequence of NFA states, for
+ /// amortizing allocation.
+ scratch_nfa_states: Vec<nfa::StateID>,
+ /// Whether to build a DFA that finds the longest possible match.
+ longest_match: bool,
+}
+
+/// An intermediate representation for a DFA state during determinization.
+#[derive(Debug, Eq, Hash, PartialEq)]
+struct State {
+ /// Whether this state is a match state or not.
+ is_match: bool,
+ /// An ordered sequence of NFA states that make up this DFA state.
+ nfa_states: Vec<nfa::StateID>,
+}
+
+impl<'a, S: StateID> Determinizer<'a, S> {
+ /// Create a new determinizer for converting the given NFA to a DFA.
+ pub fn new(nfa: &'a NFA) -> Determinizer<'a, S> {
+ let dead = Rc::new(State::dead());
+ let mut cache = HashMap::default();
+ cache.insert(dead.clone(), dead_id());
+
+ Determinizer {
+ nfa,
+ dfa: DFARepr::empty().anchored(nfa.is_anchored()),
+ builder_states: vec![dead],
+ cache,
+ stack: vec![],
+ scratch_nfa_states: vec![],
+ longest_match: false,
+ }
+ }
+
+ /// Instruct the determinizer to use equivalence classes as the transition
+ /// alphabet instead of all possible byte values.
+ pub fn with_byte_classes(mut self) -> Determinizer<'a, S> {
+ let byte_classes = self.nfa.byte_classes().clone();
+ self.dfa = DFARepr::empty_with_byte_classes(byte_classes)
+ .anchored(self.nfa.is_anchored());
+ self
+ }
+
+ /// Instruct the determinizer to build a DFA that recognizes the longest
+ /// possible match instead of the leftmost first match. This is useful when
+ /// constructing reverse DFAs for finding the start of a match.
+ pub fn longest_match(mut self, yes: bool) -> Determinizer<'a, S> {
+ self.longest_match = yes;
+ self
+ }
+
+ /// Build the DFA. If there was a problem constructing the DFA (e.g., if
+ /// the chosen state identifier representation is too small), then an error
+ /// is returned.
+ pub fn build(mut self) -> Result<DFARepr<S>> {
+ let representative_bytes: Vec<u8> =
+ self.dfa.byte_classes().representatives().collect();
+ let mut sparse = self.new_sparse_set();
+ let mut uncompiled = vec![self.add_start(&mut sparse)?];
+ while let Some(dfa_id) = uncompiled.pop() {
+ for &b in &representative_bytes {
+ let (next_dfa_id, is_new) =
+ self.cached_state(dfa_id, b, &mut sparse)?;
+ self.dfa.add_transition(dfa_id, b, next_dfa_id);
+ if is_new {
+ uncompiled.push(next_dfa_id);
+ }
+ }
+ }
+
+ // At this point, we shuffle the matching states in the final DFA to
+ // the beginning. This permits a DFA's match loop to detect a match
+ // condition by merely inspecting the current state's identifier, and
+ // avoids the need for any additional auxiliary storage.
+ let is_match: Vec<bool> =
+ self.builder_states.iter().map(|s| s.is_match).collect();
+ self.dfa.shuffle_match_states(&is_match);
+ Ok(self.dfa)
+ }
+
+ /// Return the identifier for the next DFA state given an existing DFA
+ /// state and an input byte. If the next DFA state already exists, then
+ /// return its identifier from the cache. Otherwise, build the state, cache
+ /// it and return its identifier.
+ ///
+ /// The given sparse set is used for scratch space. It must have a capacity
+ /// equivalent to the total number of NFA states, but its contents are
+ /// otherwise unspecified.
+ ///
+ /// This routine returns a boolean indicating whether a new state was
+ /// built. If a new state is built, then the caller needs to add it to its
+ /// frontier of uncompiled DFA states to compute transitions for.
+ fn cached_state(
+ &mut self,
+ dfa_id: S,
+ b: u8,
+ sparse: &mut SparseSet,
+ ) -> Result<(S, bool)> {
+ sparse.clear();
+ // Compute the set of all reachable NFA states, including epsilons.
+ self.next(dfa_id, b, sparse);
+ // Build a candidate state and check if it has already been built.
+ let state = self.new_state(sparse);
+ if let Some(&cached_id) = self.cache.get(&state) {
+ // Since we have a cached state, put the constructed state's
+ // memory back into our scratch space, so that it can be reused.
+ mem::replace(&mut self.scratch_nfa_states, state.nfa_states);
+ return Ok((cached_id, false));
+ }
+ // Nothing was in the cache, so add this state to the cache.
+ self.add_state(state).map(|s| (s, true))
+ }
+
+ /// Compute the set of all reachable NFA states, including the full epsilon
+ /// closure, from a DFA state for a single byte of input.
+ fn next(&mut self, dfa_id: S, b: u8, next_nfa_states: &mut SparseSet) {
+ next_nfa_states.clear();
+ for i in 0..self.builder_states[dfa_id.to_usize()].nfa_states.len() {
+ let nfa_id = self.builder_states[dfa_id.to_usize()].nfa_states[i];
+ match *self.nfa.state(nfa_id) {
+ nfa::State::Union { .. }
+ | nfa::State::Fail
+ | nfa::State::Match => {}
+ nfa::State::Range { range: ref r } => {
+ if r.start <= b && b <= r.end {
+ self.epsilon_closure(r.next, next_nfa_states);
+ }
+ }
+ nfa::State::Sparse { ref ranges } => {
+ for r in ranges.iter() {
+ if r.start > b {
+ break;
+ } else if r.start <= b && b <= r.end {
+ self.epsilon_closure(r.next, next_nfa_states);
+ break;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ /// Compute the epsilon closure for the given NFA state.
+ fn epsilon_closure(&mut self, start: nfa::StateID, set: &mut SparseSet) {
+ if !self.nfa.state(start).is_epsilon() {
+ set.insert(start);
+ return;
+ }
+
+ self.stack.push(start);
+ while let Some(mut id) = self.stack.pop() {
+ loop {
+ if set.contains(id) {
+ break;
+ }
+ set.insert(id);
+ match *self.nfa.state(id) {
+ nfa::State::Range { .. }
+ | nfa::State::Sparse { .. }
+ | nfa::State::Fail
+ | nfa::State::Match => break,
+ nfa::State::Union { ref alternates } => {
+ id = match alternates.get(0) {
+ None => break,
+ Some(&id) => id,
+ };
+ self.stack.extend(alternates[1..].iter().rev());
+ }
+ }
+ }
+ }
+ }
+
+ /// Compute the initial DFA state and return its identifier.
+ ///
+ /// The sparse set given is used for scratch space, and must have capacity
+ /// equal to the total number of NFA states. Its contents are unspecified.
+ fn add_start(&mut self, sparse: &mut SparseSet) -> Result<S> {
+ sparse.clear();
+ self.epsilon_closure(self.nfa.start(), sparse);
+ let state = self.new_state(&sparse);
+ let id = self.add_state(state)?;
+ self.dfa.set_start_state(id);
+ Ok(id)
+ }
+
+ /// Add the given state to the DFA and make it available in the cache.
+ ///
+ /// The state initially has no transitions. That is, it transitions to the
+ /// dead state for all possible inputs.
+ fn add_state(&mut self, state: State) -> Result<S> {
+ let id = self.dfa.add_empty_state()?;
+ let rstate = Rc::new(state);
+ self.builder_states.push(rstate.clone());
+ self.cache.insert(rstate, id);
+ Ok(id)
+ }
+
+ /// Convert the given set of ordered NFA states to a DFA state.
+ fn new_state(&mut self, set: &SparseSet) -> State {
+ let mut state = State {
+ is_match: false,
+ nfa_states: mem::replace(&mut self.scratch_nfa_states, vec![]),
+ };
+ state.nfa_states.clear();
+
+ for &id in set {
+ match *self.nfa.state(id) {
+ nfa::State::Range { .. } => {
+ state.nfa_states.push(id);
+ }
+ nfa::State::Sparse { .. } => {
+ state.nfa_states.push(id);
+ }
+ nfa::State::Fail => {
+ break;
+ }
+ nfa::State::Match => {
+ state.is_match = true;
+ if !self.longest_match {
+ break;
+ }
+ }
+ nfa::State::Union { .. } => {}
+ }
+ }
+ state
+ }
+
+ /// Create a new sparse set with enough capacity to hold all NFA states.
+ fn new_sparse_set(&self) -> SparseSet {
+ SparseSet::new(self.nfa.len())
+ }
+}
+
+impl State {
+ /// Create a new empty dead state.
+ fn dead() -> State {
+ State { nfa_states: vec![], is_match: false }
+ }
+}
diff --git a/src/dfa.rs b/src/dfa.rs
new file mode 100644
index 0000000..43de346
--- /dev/null
+++ b/src/dfa.rs
@@ -0,0 +1,363 @@
+use state_id::StateID;
+
+/// A trait describing the interface of a deterministic finite automaton (DFA).
+///
+/// Every DFA has exactly one start state and at least one dead state (which
+/// may be the same, as in the case of an empty DFA). In all cases, a state
+/// identifier of `0` must be a dead state such that `DFA::is_dead_state(0)`
+/// always returns `true`.
+///
+/// Every DFA also has zero or more match states, such that
+/// `DFA::is_match_state(id)` returns `true` if and only if `id` corresponds to
+/// a match state.
+///
+/// In general, users of this trait likely will only need to use the search
+/// routines such as `is_match`, `shortest_match`, `find` or `rfind`. The other
+/// methods are lower level and are used for walking the transitions of a DFA
+/// manually. In particular, the aforementioned search routines are implemented
+/// generically in terms of the lower level transition walking routines.
+pub trait DFA {
+ /// The representation used for state identifiers in this DFA.
+ ///
+ /// Typically, this is one of `u8`, `u16`, `u32`, `u64` or `usize`.
+ type ID: StateID;
+
+ /// Return the identifier of this DFA's start state.
+ fn start_state(&self) -> Self::ID;
+
+ /// Returns true if and only if the given identifier corresponds to a match
+ /// state.
+ fn is_match_state(&self, id: Self::ID) -> bool;
+
+ /// Returns true if and only if the given identifier corresponds to a dead
+ /// state. When a DFA enters a dead state, it is impossible to leave and
+ /// thus can never lead to a match.
+ fn is_dead_state(&self, id: Self::ID) -> bool;
+
+ /// Returns true if and only if the given identifier corresponds to either
+ /// a dead state or a match state, such that one of `is_match_state(id)`
+ /// or `is_dead_state(id)` must return true.
+ ///
+ /// Depending on the implementation of the DFA, this routine can be used
+ /// to save a branch in the core matching loop. Nevertheless,
+ /// `is_match_state(id) || is_dead_state(id)` is always a valid
+ /// implementation.
+ fn is_match_or_dead_state(&self, id: Self::ID) -> bool;
+
+ /// Returns true if and only if this DFA is anchored.
+ ///
+ /// When a DFA is anchored, it is only allowed to report matches that
+ /// start at index `0`.
+ fn is_anchored(&self) -> bool;
+
+ /// Given the current state that this DFA is in and the next input byte,
+ /// this method returns the identifier of the next state. The identifier
+ /// returned is always valid, but it may correspond to a dead state.
+ fn next_state(&self, current: Self::ID, input: u8) -> Self::ID;
+
+ /// Like `next_state`, but its implementation may look up the next state
+ /// without memory safety checks such as bounds checks. As such, callers
+ /// must ensure that the given identifier corresponds to a valid DFA
+ /// state. Implementors must, in turn, ensure that this routine is safe
+ /// for all valid state identifiers and for all possible `u8` values.
+ unsafe fn next_state_unchecked(
+ &self,
+ current: Self::ID,
+ input: u8,
+ ) -> Self::ID;
+
+ /// Returns true if and only if the given bytes match this DFA.
+ ///
+ /// This routine may short circuit if it knows that scanning future input
+ /// will never lead to a different result. In particular, if a DFA enters
+ /// a match state or a dead state, then this routine will return `true` or
+ /// `false`, respectively, without inspecting any future input.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to use this method with a
+ /// [`DenseDFA`](enum.DenseDFA.html).
+ ///
+ /// ```
+ /// use regex_automata::{DFA, DenseDFA};
+ ///
+ /// # fn example() -> Result<(), regex_automata::Error> {
+ /// let dfa = DenseDFA::new("foo[0-9]+bar")?;
+ /// assert_eq!(true, dfa.is_match(b"foo12345bar"));
+ /// assert_eq!(false, dfa.is_match(b"foobar"));
+ /// # Ok(()) }; example().unwrap()
+ /// ```
+ #[inline]
+ fn is_match(&self, bytes: &[u8]) -> bool {
+ self.is_match_at(bytes, 0)
+ }
+
+ /// Returns the first position at which a match is found.
+ ///
+ /// This routine stops scanning input in precisely the same circumstances
+ /// as `is_match`. The key difference is that this routine returns the
+ /// position at which it stopped scanning input if and only if a match
+ /// was found. If no match is found, then `None` is returned.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to use this method with a
+ /// [`DenseDFA`](enum.DenseDFA.html).
+ ///
+ /// ```
+ /// use regex_automata::{DFA, DenseDFA};
+ ///
+ /// # fn example() -> Result<(), regex_automata::Error> {
+ /// let dfa = DenseDFA::new("foo[0-9]+")?;
+ /// assert_eq!(Some(4), dfa.shortest_match(b"foo12345"));
+ ///
+ /// // Normally, the end of the leftmost first match here would be 3,
+ /// // but the shortest match semantics detect a match earlier.
+ /// let dfa = DenseDFA::new("abc|a")?;
+ /// assert_eq!(Some(1), dfa.shortest_match(b"abc"));
+ /// # Ok(()) }; example().unwrap()
+ /// ```
+ #[inline]
+ fn shortest_match(&self, bytes: &[u8]) -> Option<usize> {
+ self.shortest_match_at(bytes, 0)
+ }
+
+ /// Returns the end offset of the longest match. If no match exists,
+ /// then `None` is returned.
+ ///
+ /// Implementors of this trait are not required to implement any particular
+ /// match semantics (such as leftmost-first), which are instead manifest in
+ /// the DFA's topology itself.
+ ///
+ /// In particular, this method must continue searching even after it
+ /// enters a match state. The search should only terminate once it has
+ /// reached the end of the input or when it has entered a dead state. Upon
+ /// termination, the position of the last byte seen while still in a match
+ /// state is returned.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to use this method with a
+ /// [`DenseDFA`](enum.DenseDFA.html). By default, a dense DFA uses
+ /// "leftmost first" match semantics.
+ ///
+ /// Leftmost first match semantics corresponds to the match with the
+ /// smallest starting offset, but where the end offset is determined by
+ /// preferring earlier branches in the original regular expression. For
+ /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam`
+ /// will match `Samwise` in `Samwise`.
+ ///
+ /// Generally speaking, the "leftmost first" match is how most backtracking
+ /// regular expressions tend to work. This is in contrast to POSIX-style
+ /// regular expressions that yield "leftmost longest" matches. Namely,
+ /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using
+ /// leftmost longest semantics.
+ ///
+ /// ```
+ /// use regex_automata::{DFA, DenseDFA};
+ ///
+ /// # fn example() -> Result<(), regex_automata::Error> {
+ /// let dfa = DenseDFA::new("foo[0-9]+")?;
+ /// assert_eq!(Some(8), dfa.find(b"foo12345"));
+ ///
+ /// // Even though a match is found after reading the first byte (`a`),
+ /// // the leftmost first match semantics demand that we find the earliest
+ /// // match that prefers earlier parts of the pattern over latter parts.
+ /// let dfa = DenseDFA::new("abc|a")?;
+ /// assert_eq!(Some(3), dfa.find(b"abc"));
+ /// # Ok(()) }; example().unwrap()
+ /// ```
+ #[inline]
+ fn find(&self, bytes: &[u8]) -> Option<usize> {
+ self.find_at(bytes, 0)
+ }
+
+ /// Returns the start offset of the longest match in reverse, by searching
+ /// from the end of the input towards the start of the input. If no match
+ /// exists, then `None` is returned. In other words, this has the same
+ /// match semantics as `find`, but in reverse.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to use this method with a
+ /// [`DenseDFA`](enum.DenseDFA.html). In particular, this routine
+ /// is principally useful when used in conjunction with the
+ /// [`dense::Builder::reverse`](dense/struct.Builder.html#method.reverse)
+ /// configuration knob. In general, it's unlikely to be correct to use both
+ /// `find` and `rfind` with the same DFA since any particular DFA will only
+ /// support searching in one direction.
+ ///
+ /// ```
+ /// use regex_automata::{dense, DFA};
+ ///
+ /// # fn example() -> Result<(), regex_automata::Error> {
+ /// let dfa = dense::Builder::new().reverse(true).build("foo[0-9]+")?;
+ /// assert_eq!(Some(0), dfa.rfind(b"foo12345"));
+ /// # Ok(()) }; example().unwrap()
+ /// ```
+ #[inline]
+ fn rfind(&self, bytes: &[u8]) -> Option<usize> {
+ self.rfind_at(bytes, bytes.len())
+ }
+
+ /// Returns the same as `is_match`, but starts the search at the given
+ /// offset.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, if the DFA is anchored, then
+ /// a match can only occur when `start == 0`.
+ #[inline]
+ fn is_match_at(&self, bytes: &[u8], start: usize) -> bool {
+ if self.is_anchored() && start > 0 {
+ return false;
+ }
+
+ let mut state = self.start_state();
+ if self.is_match_or_dead_state(state) {
+ return self.is_match_state(state);
+ }
+ for &b in bytes[start..].iter() {
+ state = unsafe { self.next_state_unchecked(state, b) };
+ if self.is_match_or_dead_state(state) {
+ return self.is_match_state(state);
+ }
+ }
+ false
+ }
+
+ /// Returns the same as `shortest_match`, but starts the search at the
+ /// given offset.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, if the DFA is anchored, then
+ /// a match can only occur when `start == 0`.
+ #[inline]
+ fn shortest_match_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
+ if self.is_anchored() && start > 0 {
+ return None;
+ }
+
+ let mut state = self.start_state();
+ if self.is_match_or_dead_state(state) {
+ return if self.is_dead_state(state) { None } else { Some(start) };
+ }
+ for (i, &b) in bytes[start..].iter().enumerate() {
+ state = unsafe { self.next_state_unchecked(state, b) };
+ if self.is_match_or_dead_state(state) {
+ return if self.is_dead_state(state) {
+ None
+ } else {
+ Some(start + i + 1)
+ };
+ }
+ }
+ None
+ }
+
+ /// Returns the same as `find`, but starts the search at the given
+ /// offset.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, if the DFA is anchored, then
+ /// a match can only occur when `start == 0`.
+ #[inline]
+ fn find_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
+ if self.is_anchored() && start > 0 {
+ return None;
+ }
+
+ let mut state = self.start_state();
+ let mut last_match = if self.is_dead_state(state) {
+ return None;
+ } else if self.is_match_state(state) {
+ Some(start)
+ } else {
+ None
+ };
+ for (i, &b) in bytes[start..].iter().enumerate() {
+ state = unsafe { self.next_state_unchecked(state, b) };
+ if self.is_match_or_dead_state(state) {
+ if self.is_dead_state(state) {
+ return last_match;
+ }
+ last_match = Some(start + i + 1);
+ }
+ }
+ last_match
+ }
+
+ /// Returns the same as `rfind`, but starts the search at the given
+ /// offset.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, if the DFA is anchored, then
+ /// a match can only occur when `start == bytes.len()`.
+ #[inline(never)]
+ fn rfind_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
+ if self.is_anchored() && start < bytes.len() {
+ return None;
+ }
+
+ let mut state = self.start_state();
+ let mut last_match = if self.is_dead_state(state) {
+ return None;
+ } else if self.is_match_state(state) {
+ Some(start)
+ } else {
+ None
+ };
+ for (i, &b) in bytes[..start].iter().enumerate().rev() {
+ state = unsafe { self.next_state_unchecked(state, b) };
+ if self.is_match_or_dead_state(state) {
+ if self.is_dead_state(state) {
+ return last_match;
+ }
+ last_match = Some(i);
+ }
+ }
+ last_match
+ }
+}
+
+impl<'a, T: DFA> DFA for &'a T {
+ type ID = T::ID;
+
+ #[inline]
+ fn start_state(&self) -> Self::ID {
+ (**self).start_state()
+ }
+
+ #[inline]
+ fn is_match_state(&self, id: Self::ID) -> bool {
+ (**self).is_match_state(id)
+ }
+
+ #[inline]
+ fn is_match_or_dead_state(&self, id: Self::ID) -> bool {
+ (**self).is_match_or_dead_state(id)
+ }
+
+ #[inline]
+ fn is_dead_state(&self, id: Self::ID) -> bool {
+ (**self).is_dead_state(id)
+ }
+
+ #[inline]
+ fn is_anchored(&self) -> bool {
+ (**self).is_anchored()
+ }
+
+ #[inline]
+ fn next_state(&self, current: Self::ID, input: u8) -> Self::ID {
+ (**self).next_state(current, input)
+ }
+
+ #[inline]
+ unsafe fn next_state_unchecked(
+ &self,
+ current: Self::ID,
+ input: u8,
+ ) -> Self::ID {
+ (**self).next_state_unchecked(current, input)
+ }
+}
diff --git a/src/error.rs b/src/error.rs
new file mode 100644
index 0000000..70fe436
--- /dev/null
+++ b/src/error.rs
@@ -0,0 +1,150 @@
+use std::error;
+use std::fmt;
+use std::result;
+
+use regex_syntax;
+
+pub type Result<T> = result::Result<T, Error>;
+
+/// An error that occurred during the construction of a DFA.
+#[derive(Clone, Debug)]
+pub struct Error {
+ kind: ErrorKind,
+}
+
+/// The kind of error that occurred.
+#[derive(Clone, Debug)]
+pub enum ErrorKind {
+ /// An error that occurred while parsing a regular expression. Note that
+ /// this error may be printed over multiple lines, and is generally
+ /// intended to be end user readable on its own.
+ Syntax(String),
+ /// An error that occurred because an unsupported regex feature was used.
+ /// The message string describes which unsupported feature was used.
+ ///
+ /// The primary regex features that are unsupported are those that require
+ /// look-around, such as the `^` and `$` anchors and the word boundary
+ /// assertion `\b`. These may be supported in the future.
+ Unsupported(String),
+ /// An error that occurred when attempting to serialize a DFA to bytes.
+ Serialize(String),
+ /// An error that occurs when constructing a DFA would require the use of
+ /// a state ID that overflows the chosen state ID representation. For
+ /// example, if one is using `u8` for state IDs and builds a DFA with
+ /// 257 states, then the last state's ID will be `256` which cannot be
+ /// represented with `u8`.
+ ///
+ /// Typically, this error occurs in the determinization process of building
+ /// a DFA (the conversion step from NFA to DFA). It can also occur when
+ /// trying to build a smaller DFA from an existing one.
+ StateIDOverflow {
+ /// The maximum possible state ID.
+ max: usize,
+ },
+ /// An error that occurs when premultiplication of state IDs is requested,
+ /// but doing so would overflow the chosen state ID representation.
+ ///
+ /// When `max == requested_max`, then the state ID would overflow `usize`.
+ PremultiplyOverflow {
+ /// The maximum possible state id.
+ max: usize,
+ /// The maximum ID required by premultiplication.
+ requested_max: usize,
+ },
+}
+
+impl Error {
+ /// Return the kind of this error.
+ pub fn kind(&self) -> &ErrorKind {
+ &self.kind
+ }
+
+ pub(crate) fn syntax(err: regex_syntax::Error) -> Error {
+ Error { kind: ErrorKind::Syntax(err.to_string()) }
+ }
+
+ pub(crate) fn unsupported_anchor() -> Error {
+ let msg = r"anchors such as ^, $, \A and \z are not supported";
+ Error { kind: ErrorKind::Unsupported(msg.to_string()) }
+ }
+
+ pub(crate) fn unsupported_word() -> Error {
+ let msg = r"word boundary assertions (\b and \B) are not supported";
+ Error { kind: ErrorKind::Unsupported(msg.to_string()) }
+ }
+
+ pub(crate) fn unsupported_longest_match() -> Error {
+ let msg = "unachored searches with longest match \
+ semantics are not supported";
+ Error { kind: ErrorKind::Unsupported(msg.to_string()) }
+ }
+
+ pub(crate) fn serialize(message: &str) -> Error {
+ Error { kind: ErrorKind::Serialize(message.to_string()) }
+ }
+
+ pub(crate) fn state_id_overflow(max: usize) -> Error {
+ Error { kind: ErrorKind::StateIDOverflow { max } }
+ }
+
+ pub(crate) fn premultiply_overflow(
+ max: usize,
+ requested_max: usize,
+ ) -> Error {
+ Error { kind: ErrorKind::PremultiplyOverflow { max, requested_max } }
+ }
+}
+
+impl error::Error for Error {
+ fn description(&self) -> &str {
+ match self.kind {
+ ErrorKind::Syntax(_) => "syntax error",
+ ErrorKind::Unsupported(_) => "unsupported syntax",
+ ErrorKind::Serialize(_) => "serialization error",
+ ErrorKind::StateIDOverflow { .. } => {
+ "state id representation too small"
+ }
+ ErrorKind::PremultiplyOverflow { .. } => {
+ "state id representation too small for premultiplication"
+ }
+ }
+ }
+}
+
+impl fmt::Display for Error {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ match self.kind {
+ ErrorKind::Syntax(ref msg) => write!(f, "{}", msg),
+ ErrorKind::Unsupported(ref msg) => write!(f, "{}", msg),
+ ErrorKind::Serialize(ref msg) => {
+ write!(f, "DFA serialization error: {}", msg)
+ }
+ ErrorKind::StateIDOverflow { max } => write!(
+ f,
+ "building the DFA failed because it required building \
+ more states that can be identified, where the maximum \
+ ID for the chosen representation is {}",
+ max,
+ ),
+ ErrorKind::PremultiplyOverflow { max, requested_max } => {
+ if max == requested_max {
+ write!(
+ f,
+ "premultiplication of states requires the ability to \
+ represent a state ID greater than what can fit on \
+ this platform's usize, which is {}",
+ ::std::usize::MAX,
+ )
+ } else {
+ write!(
+ f,
+ "premultiplication of states requires the ability to \
+ represent at least a state ID of {}, but the chosen \
+ representation only permits a maximum state ID of {}",
+ requested_max, max,
+ )
+ }
+ }
+ }
+ }
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..4d3e9c1
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,360 @@
+/*!
+A low level regular expression library that uses deterministic finite automata.
+It supports a rich syntax with Unicode support, has extensive options for
+configuring the best space vs time trade off for your use case and provides
+support for cheap deserialization of automata for use in `no_std` environments.
+
+# Overview
+
+This section gives a brief overview of the primary types in this crate:
+
+* A [`Regex`](struct.Regex.html) provides a way to search for matches of a
+ regular expression. This includes iterating over matches with both the start
+ and end positions of each match.
+* A [`RegexBuilder`](struct.RegexBuilder.html) provides a way to configure many
+ compilation options for a regex.
+* A [`DenseDFA`](enum.DenseDFA.html) provides low level access to a DFA that
+ uses a dense representation (uses lots of space, but fast searching).
+* A [`SparseDFA`](enum.SparseDFA.html) provides the same API as a `DenseDFA`,
+ but uses a sparse representation (uses less space, but slower matching).
+* A [`DFA`](trait.DFA.html) trait that defines an interface that all DFAs must
+ implement.
+* Both dense DFAs and sparse DFAs support
+ [serialization to raw bytes](enum.DenseDFA.html#method.to_bytes_little_endian)
+ and
+ [cheap deserialization](enum.DenseDFA.html#method.from_bytes).
+
+# Example: basic regex searching
+
+This example shows how to compile a regex using the default configuration
+and then use it to find matches in a byte string:
+
+```
+use regex_automata::Regex;
+
+let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
+let text = b"2018-12-24 2016-10-08";
+let matches: Vec<(usize, usize)> = re.find_iter(text).collect();
+assert_eq!(matches, vec![(0, 10), (11, 21)]);
+```
+
+# Example: use sparse DFAs
+
+By default, compiling a regex will use dense DFAs internally. This uses more
+memory, but executes searches more quickly. If you can abide slower searches
+(somewhere around 3-5x), then sparse DFAs might make more sense since they can
+use significantly less space.
+
+Using sparse DFAs is as easy as using `Regex::new_sparse` instead of
+`Regex::new`:
+
+```
+use regex_automata::Regex;
+
+# fn example() -> Result<(), regex_automata::Error> {
+let re = Regex::new_sparse(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
+let text = b"2018-12-24 2016-10-08";
+let matches: Vec<(usize, usize)> = re.find_iter(text).collect();
+assert_eq!(matches, vec![(0, 10), (11, 21)]);
+# Ok(()) }; example().unwrap()
+```
+
+If you already have dense DFAs for some reason, they can be converted to sparse
+DFAs and used to build a new `Regex`. For example:
+
+```
+use regex_automata::Regex;
+
+# fn example() -> Result<(), regex_automata::Error> {
+let dense_re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
+let sparse_re = Regex::from_dfas(
+ dense_re.forward().to_sparse()?,
+ dense_re.reverse().to_sparse()?,
+);
+let text = b"2018-12-24 2016-10-08";
+let matches: Vec<(usize, usize)> = sparse_re.find_iter(text).collect();
+assert_eq!(matches, vec![(0, 10), (11, 21)]);
+# Ok(()) }; example().unwrap()
+```
+
+# Example: deserialize a DFA
+
+This shows how to first serialize a DFA into raw bytes, and then deserialize
+those raw bytes back into a DFA. While this particular example is a bit
+contrived, this same technique can be used in your program to deserialize a
+DFA at start up time or by memory mapping a file. In particular,
+deserialization is guaranteed to be cheap because it will always be a constant
+time operation.
+
+```
+use regex_automata::{DenseDFA, Regex};
+
+# fn example() -> Result<(), regex_automata::Error> {
+let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
+// serialize both the forward and reverse DFAs, see note below
+let fwd_bytes = re1.forward().to_u16()?.to_bytes_native_endian()?;
+let rev_bytes = re1.reverse().to_u16()?.to_bytes_native_endian()?;
+// now deserialize both---we need to specify the correct type!
+let fwd: DenseDFA<&[u16], u16> = unsafe { DenseDFA::from_bytes(&fwd_bytes) };
+let rev: DenseDFA<&[u16], u16> = unsafe { DenseDFA::from_bytes(&rev_bytes) };
+// finally, reconstruct our regex
+let re2 = Regex::from_dfas(fwd, rev);
+
+// we can use it like normal
+let text = b"2018-12-24 2016-10-08";
+let matches: Vec<(usize, usize)> = re2.find_iter(text).collect();
+assert_eq!(matches, vec![(0, 10), (11, 21)]);
+# Ok(()) }; example().unwrap()
+```
+
+There are a few points worth noting here:
+
+* We need to extract the raw DFAs used by the regex and serialize those. You
+ can build the DFAs manually yourself using
+ [`dense::Builder`](dense/struct.Builder.html), but using the DFAs from a
+ `Regex` guarantees that the DFAs are built correctly.
+* We specifically convert the dense DFA to a representation that uses `u16`
+ for its state identifiers using
+ [`DenseDFA::to_u16`](enum.DenseDFA.html#method.to_u16). While this isn't
+ strictly necessary, if we skipped this step, then the serialized bytes would
+ use `usize` for state identifiers, which does not have a fixed size. Using
+ `u16` ensures that we can deserialize this DFA even on platforms with a
+ smaller pointer size. If our DFA is too big for `u16` state identifiers, then
+ one can use `u32` or `u64`.
+* To convert the DFA to raw bytes, we use the `to_bytes_native_endian`
+ method. In practice, you'll want to use either
+ [`DenseDFA::to_bytes_little_endian`](enum.DenseDFA.html#method.to_bytes_little_endian)
+ or
+ [`DenseDFA::to_bytes_big_endian`](enum.DenseDFA.html#method.to_bytes_big_endian),
+ depending on which platform you're deserializing your DFA from. If you intend
+ to deserialize on either platform, then you'll need to serialize both and
+ deserialize the right one depending on your target's endianness.
+* Deserializing a DFA requires the use of `unsafe` because the raw bytes must
+ be *trusted*. In particular, while some degree of sanity checks are
+ performed, nothing guarantees the integrity of the DFA's transition table
+ since deserialization is a constant time operation. Since searching with a
+ DFA must be able to follow transitions blindly for performance reasons,
+ giving incorrect bytes to the deserialization API can result in memory
+ unsafety.
+
+The same process can be achieved with sparse DFAs as well:
+
+```
+use regex_automata::{SparseDFA, Regex};
+
+# fn example() -> Result<(), regex_automata::Error> {
+let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
+// serialize both
+let fwd_bytes = re1.forward().to_u16()?.to_sparse()?.to_bytes_native_endian()?;
+let rev_bytes = re1.reverse().to_u16()?.to_sparse()?.to_bytes_native_endian()?;
+// now deserialize both---we need to specify the correct type!
+let fwd: SparseDFA<&[u8], u16> = unsafe { SparseDFA::from_bytes(&fwd_bytes) };
+let rev: SparseDFA<&[u8], u16> = unsafe { SparseDFA::from_bytes(&rev_bytes) };
+// finally, reconstruct our regex
+let re2 = Regex::from_dfas(fwd, rev);
+
+// we can use it like normal
+let text = b"2018-12-24 2016-10-08";
+let matches: Vec<(usize, usize)> = re2.find_iter(text).collect();
+assert_eq!(matches, vec![(0, 10), (11, 21)]);
+# Ok(()) }; example().unwrap()
+```
+
+Note that unlike dense DFAs, sparse DFAs have no alignment requirements.
+Conversely, dense DFAs must be aligned to the same alignment as their
+state identifier representation.
+
+# Support for `no_std`
+
+This crate comes with a `std` feature that is enabled by default. When the
+`std` feature is enabled, the API of this crate will include the facilities
+necessary for compiling, serializing, deserializing and searching with regular
+expressions. When the `std` feature is disabled, the API of this crate will
+shrink such that it only includes the facilities necessary for deserializing
+and searching with regular expressions.
+
+The intended workflow for `no_std` environments is thus as follows:
+
+* Write a program with the `std` feature that compiles and serializes a
+ regular expression. Serialization should only happen after first converting
+ the DFAs to use a fixed size state identifier instead of the default `usize`.
+ You may also need to serialize both little and big endian versions of each
+ DFA. (So that's 4 DFAs in total for each regex.)
+* In your `no_std` environment, follow the examples above for deserializing
+ your previously serialized DFAs into regexes. You can then search with them
+ as you would any regex.
+
+Deserialization can happen anywhere. For example, with bytes embedded into a
+binary or with a file memory mapped at runtime.
+
+Note that the
+[`ucd-generate`](https://github.com/BurntSushi/ucd-generate)
+tool will do the first step for you with its `dfa` or `regex` sub-commands.
+
+# Syntax
+
+This crate supports the same syntax as the `regex` crate, since they share the
+same parser. You can find an exhaustive list of supported syntax in the
+[documentation for the `regex` crate](https://docs.rs/regex/1.1/regex/#syntax).
+
+Currently, there are a couple limitations. In general, this crate does not
+support zero-width assertions, although they may be added in the future. This
+includes:
+
+* Anchors such as `^`, `$`, `\A` and `\z`.
+* Word boundary assertions such as `\b` and `\B`.
+
+It is possible to run a search that is anchored at the beginning of the input.
+To do that, set the
+[`RegexBuilder::anchored`](struct.RegexBuilder.html#method.anchored)
+option when building a regex. By default, all searches are unanchored.
+
+# Differences with the regex crate
+
+The main goal of the [`regex`](https://docs.rs/regex) crate is to serve as a
+general purpose regular expression engine. It aims to automatically balance low
+compile times, fast search times and low memory usage, while also providing
+a convenient API for users. In contrast, this crate provides a lower level
+regular expression interface that is a bit less convenient while providing more
+explicit control over memory usage and search times.
+
+Here are some specific negative differences:
+
+* **Compilation can take an exponential amount of time and space** in the size
+ of the regex pattern. While most patterns do not exhibit worst case
+ exponential time, such patterns do exist. For example, `[01]*1[01]{N}` will
+ build a DFA with `2^(N+1)` states. For this reason, untrusted patterns should
+ not be compiled with this library. (In the future, the API may expose an
+ option to return an error if the DFA gets too big.)
+* This crate does not support sub-match extraction, which can be achieved with
+ the regex crate's "captures" API. This may be added in the future, but is
+ unlikely.
+* While the regex crate doesn't necessarily sport fast compilation times, the
+ regexes in this crate are almost universally slow to compile, especially when
+ they contain large Unicode character classes. For example, on my system,
+ compiling `\w{3}` with byte classes enabled takes just over 1 second and
+ almost 5MB of memory! (Compiling a sparse regex takes about the same time
+  but only uses about 500KB of memory.) Conversely, compiling the same regex
+ without Unicode support, e.g., `(?-u)\w{3}`, takes under 1 millisecond and
+ less than 5KB of memory. For this reason, you should only use Unicode
+ character classes if you absolutely need them!
+* This crate does not support regex sets.
+* This crate does not support zero-width assertions such as `^`, `$`, `\b` or
+ `\B`.
+* As a lower level crate, this library does not do literal optimizations. In
+ exchange, you get predictable performance regardless of input. The
+ philosophy here is that literal optimizations should be applied at a higher
+ level, although there is no easy support for this in the ecosystem yet.
+* There is no `&str` API like in the regex crate. In this crate, all APIs
+ operate on `&[u8]`. By default, match indices are guaranteed to fall on
+ UTF-8 boundaries, unless
+ [`RegexBuilder::allow_invalid_utf8`](struct.RegexBuilder.html#method.allow_invalid_utf8)
+ is enabled.
+
+With some of the downsides out of the way, here are some positive differences:
+
+* Both dense and sparse DFAs can be serialized to raw bytes, and then cheaply
+ deserialized. Deserialization always takes constant time since searching can
+ be performed directly on the raw serialized bytes of a DFA.
+* This crate was specifically designed so that the searching phase of a DFA has
+ minimal runtime requirements, and can therefore be used in `no_std`
+ environments. While `no_std` environments cannot compile regexes, they can
+ deserialize pre-compiled regexes.
+* Since this crate builds DFAs ahead of time, it will generally out-perform
+ the `regex` crate on equivalent tasks. The performance difference is likely
+ not large. However, because of a complex set of optimizations in the regex
+ crate (like literal optimizations), an accurate performance comparison may be
+ difficult to do.
+* Sparse DFAs provide a way to build a DFA ahead of time that sacrifices search
+ performance a small amount, but uses much less storage space. Potentially
+ even less than what the regex crate uses.
+* This crate exposes DFAs directly, such as
+ [`DenseDFA`](enum.DenseDFA.html) and [`SparseDFA`](enum.SparseDFA.html),
+ which enables one to do less work in some cases. For example, if you only
+ need the end of a match and not the start of a match, then you can use a DFA
+ directly without building a `Regex`, which always requires a second DFA to
+ find the start of a match.
+* Aside from choosing between dense and sparse DFAs, there are several options
+ for configuring the space usage vs search time trade off. These include
+ things like choosing a smaller state identifier representation, to
+ premultiplying state identifiers and splitting a DFA's alphabet into
+ equivalence classes. Finally, DFA minimization is also provided, but can
+ increase compilation times dramatically.
+*/
+
+#![deny(missing_docs)]
+#![cfg_attr(not(feature = "std"), no_std)]
+
+#[cfg(feature = "std")]
+extern crate core;
+
+#[cfg(all(test, feature = "transducer"))]
+extern crate bstr;
+extern crate byteorder;
+#[cfg(feature = "transducer")]
+extern crate fst;
+#[cfg(feature = "std")]
+extern crate regex_syntax;
+
+pub use dense::DenseDFA;
+pub use dfa::DFA;
+#[cfg(feature = "std")]
+pub use error::{Error, ErrorKind};
+pub use regex::Regex;
+#[cfg(feature = "std")]
+pub use regex::RegexBuilder;
+pub use sparse::SparseDFA;
+pub use state_id::StateID;
+
+mod classes;
+#[path = "dense.rs"]
+mod dense_imp;
+#[cfg(feature = "std")]
+mod determinize;
+mod dfa;
+#[cfg(feature = "std")]
+mod error;
+#[cfg(feature = "std")]
+mod minimize;
+#[cfg(feature = "std")]
+#[doc(hidden)]
+pub mod nfa;
+mod regex;
+#[path = "sparse.rs"]
+mod sparse_imp;
+#[cfg(feature = "std")]
+mod sparse_set;
+mod state_id;
+#[cfg(feature = "transducer")]
+mod transducer;
+
+/// Types and routines specific to dense DFAs.
+///
+/// This module is the home of [`DenseDFA`](enum.DenseDFA.html) and each of its
+/// corresponding variant DFA types, such as [`Standard`](struct.Standard.html)
+/// and [`ByteClass`](struct.ByteClass.html).
+///
+/// This module also contains a [builder](struct.Builder.html) for
+/// configuring the construction of a dense DFA.
+pub mod dense {
+ pub use dense_imp::*;
+}
+
+/// Types and routines specific to sparse DFAs.
+///
+/// This module is the home of [`SparseDFA`](enum.SparseDFA.html) and each of
+/// its corresponding variant DFA types, such as
+/// [`Standard`](struct.Standard.html) and
+/// [`ByteClass`](struct.ByteClass.html).
+///
+/// Unlike the [`dense`](../dense/index.html) module, this module does not
+/// contain a builder specific for sparse DFAs. Instead, the intended way to
+/// build a sparse DFA is either by using a default configuration with its
+/// [constructor](enum.SparseDFA.html#method.new),
+/// or by first
+/// [configuring the construction of a dense DFA](../dense/struct.Builder.html)
+/// and then calling
+/// [`DenseDFA::to_sparse`](../enum.DenseDFA.html#method.to_sparse).
+pub mod sparse {
+ pub use sparse_imp::*;
+}
diff --git a/src/minimize.rs b/src/minimize.rs
new file mode 100644
index 0000000..ededa5f
--- /dev/null
+++ b/src/minimize.rs
@@ -0,0 +1,373 @@
+use std::cell::RefCell;
+use std::fmt;
+use std::mem;
+use std::rc::Rc;
+
+use dense;
+use state_id::{dead_id, StateID};
+
+type DFARepr<S> = dense::Repr<Vec<S>, S>;
+
+/// An implementation of Hopcroft's algorithm for minimizing DFAs.
+///
+/// The algorithm implemented here is mostly taken from Wikipedia:
+/// https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft's_algorithm
+///
+/// This code has had some light optimization attention paid to it,
+/// particularly in the form of reducing allocation as much as possible.
+/// However, it is still generally slow. Future optimization work should
+/// probably focus on the bigger picture rather than micro-optimizations. For
+/// example:
+///
+/// 1. Figure out how to more intelligently create initial partitions. That is,
+/// Hopcroft's algorithm starts by creating two partitions of DFA states
+/// that are known to NOT be equivalent: match states and non-match states.
+/// The algorithm proceeds by progressively refining these partitions into
+/// smaller partitions. If we could start with more partitions, then we
+/// could reduce the amount of work that Hopcroft's algorithm needs to do.
+/// 2. For every partition that we visit, we find all incoming transitions to
+/// every state in the partition for *every* element in the alphabet. (This
+/// is why using byte classes can significantly decrease minimization times,
+/// since byte classes shrink the alphabet.) This is quite costly and there
+/// is perhaps some redundant work being performed depending on the specific
+/// states in the set. For example, we might be able to only visit some
+/// elements of the alphabet based on the transitions.
+/// 3. Move parts of minimization into determinization. If minimization has
+/// fewer states to deal with, then it should run faster. A prime example
+///    of this might be large Unicode classes, which are generated in a way that
+/// can create a lot of redundant states. (Some work has been done on this
+/// point during NFA compilation via the algorithm described in the
+///    "Incremental Construction of Minimal Acyclic Finite-State Automata"
+/// paper.)
+pub(crate) struct Minimizer<'a, S: 'a> {
+ dfa: &'a mut DFARepr<S>,
+ in_transitions: Vec<Vec<Vec<S>>>,
+ partitions: Vec<StateSet<S>>,
+ waiting: Vec<StateSet<S>>,
+}
+
+impl<'a, S: StateID> fmt::Debug for Minimizer<'a, S> {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ f.debug_struct("Minimizer")
+ .field("dfa", &self.dfa)
+ .field("in_transitions", &self.in_transitions)
+ .field("partitions", &self.partitions)
+ .field("waiting", &self.waiting)
+ .finish()
+ }
+}
+
+/// A set of states. A state set makes up a single partition in Hopcroft's
+/// algorithm.
+///
+/// It is represented by an ordered set of state identifiers. We use shared
+/// ownership so that a single state set can be in both the set of partitions
+/// and in the set of waiting sets simultaneously without an additional
+/// allocation. Generally, once a state set is built, it becomes immutable.
+///
+/// We use this representation because it avoids the overhead of more
+/// traditional set data structures (HashSet/BTreeSet), and also because
+/// computing intersection/subtraction on this representation is especially
+/// fast.
+#[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
+struct StateSet<S>(Rc<RefCell<Vec<S>>>);
+
+impl<'a, S: StateID> Minimizer<'a, S> {
+ pub fn new(dfa: &'a mut DFARepr<S>) -> Minimizer<'a, S> {
+ let in_transitions = Minimizer::incoming_transitions(dfa);
+ let partitions = Minimizer::initial_partitions(dfa);
+ let waiting = vec![partitions[0].clone()];
+
+ Minimizer { dfa, in_transitions, partitions, waiting }
+ }
+
+ pub fn run(mut self) {
+ let mut incoming = StateSet::empty();
+ let mut scratch1 = StateSet::empty();
+ let mut scratch2 = StateSet::empty();
+ let mut newparts = vec![];
+
+ while let Some(set) = self.waiting.pop() {
+ for b in (0..self.dfa.alphabet_len()).map(|b| b as u8) {
+ self.find_incoming_to(b, &set, &mut incoming);
+
+ for p in 0..self.partitions.len() {
+ self.partitions[p].intersection(&incoming, &mut scratch1);
+ if scratch1.is_empty() {
+ newparts.push(self.partitions[p].clone());
+ continue;
+ }
+
+ self.partitions[p].subtract(&incoming, &mut scratch2);
+ if scratch2.is_empty() {
+ newparts.push(self.partitions[p].clone());
+ continue;
+ }
+
+ let (x, y) =
+ (scratch1.deep_clone(), scratch2.deep_clone());
+ newparts.push(x.clone());
+ newparts.push(y.clone());
+ match self.find_waiting(&self.partitions[p]) {
+ Some(i) => {
+ self.waiting[i] = x;
+ self.waiting.push(y);
+ }
+ None => {
+ if x.len() <= y.len() {
+ self.waiting.push(x);
+ } else {
+ self.waiting.push(y);
+ }
+ }
+ }
+ }
+ newparts = mem::replace(&mut self.partitions, newparts);
+ newparts.clear();
+ }
+ }
+
+ // At this point, we now have a minimal partitioning of states, where
+ // each partition is an equivalence class of DFA states. Now we need to
+        // use this partitioning to update the DFA to only contain one state for
+ // each partition.
+
+ // Create a map from DFA state ID to the representative ID of the
+ // equivalence class to which it belongs. The representative ID of an
+ // equivalence class of states is the minimum ID in that class.
+ let mut state_to_part = vec![dead_id(); self.dfa.state_count()];
+ for p in &self.partitions {
+ p.iter(|id| state_to_part[id.to_usize()] = p.min());
+ }
+
+ // Generate a new contiguous sequence of IDs for minimal states, and
+ // create a map from equivalence IDs to the new IDs. Thus, the new
+ // minimal ID of *any* state in the unminimized DFA can be obtained
+ // with minimals_ids[state_to_part[old_id]].
+ let mut minimal_ids = vec![dead_id(); self.dfa.state_count()];
+ let mut new_id = S::from_usize(0);
+ for (id, _) in self.dfa.states() {
+ if state_to_part[id.to_usize()] == id {
+ minimal_ids[id.to_usize()] = new_id;
+ new_id = S::from_usize(new_id.to_usize() + 1);
+ }
+ }
+ // The total number of states in the minimal DFA.
+ let minimal_count = new_id.to_usize();
+
+ // Re-map this DFA in place such that the only states remaining
+ // correspond to the representative states of every equivalence class.
+ for id in (0..self.dfa.state_count()).map(S::from_usize) {
+ // If this state isn't a representative for an equivalence class,
+ // then we skip it since it won't appear in the minimal DFA.
+ if state_to_part[id.to_usize()] != id {
+ continue;
+ }
+ for (_, next) in self.dfa.get_state_mut(id).iter_mut() {
+ *next = minimal_ids[state_to_part[next.to_usize()].to_usize()];
+ }
+ self.dfa.swap_states(id, minimal_ids[id.to_usize()]);
+ }
+ // Trim off all unused states from the pre-minimized DFA. This
+ // represents all states that were merged into a non-singleton
+ // equivalence class of states, and appeared after the first state
+ // in each such class. (Because the state with the smallest ID in each
+ // equivalence class is its representative ID.)
+ self.dfa.truncate_states(minimal_count);
+
+ // Update the new start state, which is now just the minimal ID of
+ // whatever state the old start state was collapsed into.
+ let old_start = self.dfa.start_state();
+ self.dfa.set_start_state(
+ minimal_ids[state_to_part[old_start.to_usize()].to_usize()],
+ );
+
+ // In order to update the ID of the maximum match state, we need to
+ // find the maximum ID among all of the match states in the minimized
+ // DFA. This is not necessarily the new ID of the unminimized maximum
+ // match state, since that could have been collapsed with a much
+ // earlier match state. Therefore, to find the new max match state,
+ // we iterate over all previous match states, find their corresponding
+ // new minimal ID, and take the maximum of those.
+ let old_max = self.dfa.max_match_state();
+ self.dfa.set_max_match_state(dead_id());
+ for id in (0..(old_max.to_usize() + 1)).map(S::from_usize) {
+ let part = state_to_part[id.to_usize()];
+ let new_id = minimal_ids[part.to_usize()];
+ if new_id > self.dfa.max_match_state() {
+ self.dfa.set_max_match_state(new_id);
+ }
+ }
+ }
+
+ fn find_waiting(&self, set: &StateSet<S>) -> Option<usize> {
+ self.waiting.iter().position(|s| s == set)
+ }
+
+ fn find_incoming_to(
+ &self,
+ b: u8,
+ set: &StateSet<S>,
+ incoming: &mut StateSet<S>,
+ ) {
+ incoming.clear();
+ set.iter(|id| {
+ for &inid in &self.in_transitions[id.to_usize()][b as usize] {
+ incoming.add(inid);
+ }
+ });
+ incoming.canonicalize();
+ }
+
+ fn initial_partitions(dfa: &DFARepr<S>) -> Vec<StateSet<S>> {
+ let mut is_match = StateSet::empty();
+ let mut no_match = StateSet::empty();
+ for (id, _) in dfa.states() {
+ if dfa.is_match_state(id) {
+ is_match.add(id);
+ } else {
+ no_match.add(id);
+ }
+ }
+
+ let mut sets = vec![is_match];
+ if !no_match.is_empty() {
+ sets.push(no_match);
+ }
+ sets.sort_by_key(|s| s.len());
+ sets
+ }
+
+ fn incoming_transitions(dfa: &DFARepr<S>) -> Vec<Vec<Vec<S>>> {
+ let mut incoming = vec![];
+ for _ in dfa.states() {
+ incoming.push(vec![vec![]; dfa.alphabet_len()]);
+ }
+ for (id, state) in dfa.states() {
+ for (b, next) in state.transitions() {
+ incoming[next.to_usize()][b as usize].push(id);
+ }
+ }
+ incoming
+ }
+}
+
+impl<S: StateID> StateSet<S> {
+ fn empty() -> StateSet<S> {
+ StateSet(Rc::new(RefCell::new(vec![])))
+ }
+
+ fn add(&mut self, id: S) {
+ self.0.borrow_mut().push(id);
+ }
+
+ fn min(&self) -> S {
+ self.0.borrow()[0]
+ }
+
+ fn canonicalize(&mut self) {
+ self.0.borrow_mut().sort();
+ self.0.borrow_mut().dedup();
+ }
+
+ fn clear(&mut self) {
+ self.0.borrow_mut().clear();
+ }
+
+ fn len(&self) -> usize {
+ self.0.borrow().len()
+ }
+
+ fn is_empty(&self) -> bool {
+ self.len() == 0
+ }
+
+ fn deep_clone(&self) -> StateSet<S> {
+ let ids = self.0.borrow().iter().cloned().collect();
+ StateSet(Rc::new(RefCell::new(ids)))
+ }
+
+ fn iter<F: FnMut(S)>(&self, mut f: F) {
+ for &id in self.0.borrow().iter() {
+ f(id);
+ }
+ }
+
+ fn intersection(&self, other: &StateSet<S>, dest: &mut StateSet<S>) {
+ dest.clear();
+ if self.is_empty() || other.is_empty() {
+ return;
+ }
+
+ let (seta, setb) = (self.0.borrow(), other.0.borrow());
+ let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
+ let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
+ loop {
+ if a == b {
+ dest.add(a);
+ a = match ita.next() {
+ None => break,
+ Some(a) => a,
+ };
+ b = match itb.next() {
+ None => break,
+ Some(b) => b,
+ };
+ } else if a < b {
+ a = match ita.next() {
+ None => break,
+ Some(a) => a,
+ };
+ } else {
+ b = match itb.next() {
+ None => break,
+ Some(b) => b,
+ };
+ }
+ }
+ }
+
+ fn subtract(&self, other: &StateSet<S>, dest: &mut StateSet<S>) {
+ dest.clear();
+ if self.is_empty() || other.is_empty() {
+ self.iter(|s| dest.add(s));
+ return;
+ }
+
+ let (seta, setb) = (self.0.borrow(), other.0.borrow());
+ let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
+ let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
+ loop {
+ if a == b {
+ a = match ita.next() {
+ None => break,
+ Some(a) => a,
+ };
+ b = match itb.next() {
+ None => {
+ dest.add(a);
+ break;
+ }
+ Some(b) => b,
+ };
+ } else if a < b {
+ dest.add(a);
+ a = match ita.next() {
+ None => break,
+ Some(a) => a,
+ };
+ } else {
+ b = match itb.next() {
+ None => {
+ dest.add(a);
+ break;
+ }
+ Some(b) => b,
+ };
+ }
+ }
+ for a in ita {
+ dest.add(a);
+ }
+ }
+}
diff --git a/src/nfa/compiler.rs b/src/nfa/compiler.rs
new file mode 100644
index 0000000..d9b3945
--- /dev/null
+++ b/src/nfa/compiler.rs
@@ -0,0 +1,1193 @@
+// This module provides an NFA compiler using Thompson's construction
+// algorithm. The compiler takes a regex-syntax::Hir as input and emits an NFA
+// graph as output. The NFA graph is structured in a way that permits it to be
+// executed by a virtual machine and also used to efficiently build a DFA.
+//
+// The compiler deals with a slightly expanded set of NFA states that notably
+// includes an empty node that has exactly one epsilon transition to the next
+// state. In other words, it's a "goto" instruction if one views Thompson's NFA
+// as a set of bytecode instructions. These goto instructions are removed in
+// a subsequent phase before returning the NFA to the caller. The purpose of
+// these empty nodes is that they make the construction algorithm substantially
+// simpler to implement. We remove them before returning to the caller because
+// they can represent substantial overhead when traversing the NFA graph
+// (either while searching using the NFA directly or while building a DFA).
+//
+// In the future, it would be nice to provide a Glushkov compiler as well,
+// as it would work well as a bit-parallel NFA for smaller regexes. But
+// the Thompson construction is one I'm more familiar with and seems more
+// straight-forward to deal with when it comes to large Unicode character
+// classes.
+//
+// Internally, the compiler uses interior mutability to improve composition
+// in the face of the borrow checker. In particular, we'd really like to be
+// able to write things like this:
+//
+// self.c_concat(exprs.iter().map(|e| self.c(e)))
+//
+// Which elegantly uses iterators to build up a sequence of compiled regex
+// sub-expressions and then hands it off to the concatenating compiler
+// routine. Without interior mutability, the borrow checker won't let us
+// borrow `self` mutably both inside and outside the closure at the same
+// time.
+
+use std::cell::RefCell;
+use std::mem;
+
+use regex_syntax::hir::{self, Hir, HirKind};
+use regex_syntax::utf8::{Utf8Range, Utf8Sequences};
+
+use classes::ByteClassSet;
+use error::{Error, Result};
+use nfa::map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap};
+use nfa::range_trie::RangeTrie;
+use nfa::{State, StateID, Transition, NFA};
+
+/// Config knobs for the NFA compiler. See the builder's methods for more
+/// docs on each one.
+#[derive(Clone, Copy, Debug)]
+struct Config {
+ /// Whether matches must begin at the start of the input.
+ anchored: bool,
+ /// Whether the compiled NFA may match invalid UTF-8.
+ allow_invalid_utf8: bool,
+ /// Whether to compile the NFA in reverse (for right-to-left matching).
+ reverse: bool,
+ /// Whether to spend extra effort shrinking the NFA.
+ shrink: bool,
+}
+
+impl Default for Config {
+ /// The default configuration: unanchored, valid-UTF-8-only, forward,
+ /// with shrinking enabled.
+ fn default() -> Config {
+ Config {
+ anchored: false,
+ allow_invalid_utf8: false,
+ reverse: false,
+ shrink: true,
+ }
+ }
+}
+
+/// A builder for compiling an NFA.
+#[derive(Clone, Debug)]
+pub struct Builder {
+ /// The configuration handed to the compiler on each build.
+ config: Config,
+}
+
+impl Builder {
+ /// Create a new NFA builder with its default configuration.
+ pub fn new() -> Builder {
+ Builder { config: Config::default() }
+ }
+
+ /// Compile the given high level intermediate representation of a regular
+ /// expression into an NFA.
+ ///
+ /// If there was a problem building the NFA, then an error is returned.
+ /// For example, if the regex uses unsupported features (such as zero-width
+ /// assertions), then an error is returned.
+ pub fn build(&self, expr: &Hir) -> Result<NFA> {
+ // Seed with a trivial NFA; `build_with` overwrites it entirely.
+ let mut nfa = NFA::always_match();
+ self.build_with(&mut Compiler::new(), &mut nfa, expr)?;
+ Ok(nfa)
+ }
+
+ /// Compile the given high level intermediate representation of a regular
+ /// expression into the NFA given using the given compiler. Callers may
+ /// prefer this over `build` if they would like to reuse allocations while
+ /// compiling many regular expressions.
+ ///
+ /// On success, the given NFA is completely overwritten with the NFA
+ /// produced by the compiler.
+ ///
+ /// If there was a problem building the NFA, then an error is returned. For
+ /// example, if the regex uses unsupported features (such as zero-width
+ /// assertions), then an error is returned. When an error is returned,
+ /// the contents of `nfa` are unspecified and should not be relied upon.
+ /// However, it can still be reused in subsequent calls to this method.
+ pub fn build_with(
+ &self,
+ compiler: &mut Compiler,
+ nfa: &mut NFA,
+ expr: &Hir,
+ ) -> Result<()> {
+ compiler.clear();
+ compiler.configure(self.config);
+ compiler.compile(nfa, expr)
+ }
+
+ /// Set whether matching must be anchored at the beginning of the input.
+ ///
+ /// When enabled, a match must begin at the start of the input. When
+ /// disabled, the NFA will act as if the pattern started with a `.*?`,
+ /// which enables a match to appear anywhere.
+ ///
+ /// By default this is disabled.
+ pub fn anchored(&mut self, yes: bool) -> &mut Builder {
+ self.config.anchored = yes;
+ self
+ }
+
+ /// When enabled, the builder will permit the construction of an NFA that
+ /// may match invalid UTF-8.
+ ///
+ /// When disabled (the default), the builder is guaranteed to produce a
+ /// regex that will only ever match valid UTF-8 (otherwise, the builder
+ /// will return an error).
+ pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut Builder {
+ self.config.allow_invalid_utf8 = yes;
+ self
+ }
+
+ /// Reverse the NFA.
+ ///
+ /// A NFA reversal is performed by reversing all of the concatenated
+ /// sub-expressions in the original pattern, recursively. The resulting
+ /// NFA can be used to match the pattern starting from the end of a string
+ /// instead of the beginning of a string.
+ ///
+ /// Reversing the NFA is useful for building a reverse DFA, which is most
+ /// useful for finding the start of a match.
+ pub fn reverse(&mut self, yes: bool) -> &mut Builder {
+ self.config.reverse = yes;
+ self
+ }
+
+ /// Apply best effort heuristics to shrink the NFA at the expense of more
+ /// time/memory.
+ ///
+ /// This is enabled by default. Generally speaking, if one is using an NFA
+ /// to compile DFA, then the extra time used to shrink the NFA will be
+ /// more than made up for during DFA construction (potentially by a lot).
+ /// In other words, enabling this can substantially decrease the overall
+ /// amount of time it takes to build a DFA.
+ ///
+ /// The only reason to disable this is if you want to compile an NFA and
+ /// start using it as quickly as possible without needing to build a DFA.
+ pub fn shrink(&mut self, yes: bool) -> &mut Builder {
+ self.config.shrink = yes;
+ self
+ }
+}
+
+/// A compiler that converts a regex abstract syntax to an NFA via Thompson's
+/// construction. Namely, this compiler permits epsilon transitions between
+/// states.
+///
+/// Users of this crate cannot use a compiler directly. Instead, all one can
+/// do is create one and use it via the
+/// [`Builder::build_with`](struct.Builder.html#method.build_with)
+/// method. This permits callers to reuse compilers in order to amortize
+/// allocations.
+#[derive(Clone, Debug)]
+pub struct Compiler {
+ /// The set of compiled NFA states. Once a state is compiled, it is
+ /// assigned a state ID equivalent to its index in this list. Subsequent
+ /// compilation can modify previous states by adding new transitions.
+ ///
+ /// All fields below use `RefCell` so that compilation routines can take
+ /// `&self`, which is what makes the iterator-based composition in
+ /// `c_concat`/`c_alternation` possible (see the module comment).
+ states: RefCell<Vec<CState>>,
+ /// The configuration from the builder.
+ config: Config,
+ /// State used for compiling character classes to UTF-8 byte automata.
+ /// State is not retained between character class compilations. This just
+ /// serves to amortize allocation to the extent possible.
+ utf8_state: RefCell<Utf8State>,
+ /// State used for arranging character classes in reverse into a trie.
+ trie_state: RefCell<RangeTrie>,
+ /// State used for caching common suffixes when compiling reverse UTF-8
+ /// automata (for Unicode character classes).
+ utf8_suffix: RefCell<Utf8SuffixMap>,
+ /// A map used to re-map state IDs when translating the compiler's internal
+ /// NFA state representation to the external NFA representation.
+ remap: RefCell<Vec<StateID>>,
+ /// A set of compiler internal state IDs that correspond to states that are
+ /// exclusively epsilon transitions, i.e., goto instructions, combined with
+ /// the state that they point to. This is used to record said states while
+ /// transforming the compiler's internal NFA representation to the external
+ /// form.
+ empties: RefCell<Vec<(StateID, StateID)>>,
+}
+
+/// A compiler intermediate state representation for an NFA that is only used
+/// during compilation. Once compilation is done, `CState`s are converted to
+/// `State`s, which have a much simpler representation.
+#[derive(Clone, Debug, Eq, PartialEq)]
+enum CState {
+ /// An empty state whose only purpose is to forward the automaton to
+ /// another state via an epsilon transition. These are useful during
+ /// compilation but are otherwise removed at the end.
+ Empty { next: StateID },
+ /// A state that only transitions to `next` if the current input byte is
+ /// in the range `[start, end]` (inclusive on both ends).
+ Range { range: Transition },
+ /// A state with possibly many transitions, represented in a sparse
+ /// fashion. Transitions are ordered lexicographically by input range.
+ /// As such, this may only be used when every transition has equal
+ /// priority. (In practice, this is only used for encoding large UTF-8
+ /// automata.)
+ Sparse { ranges: Vec<Transition> },
+ /// An alternation such that there exists an epsilon transition to all
+ /// states in `alternates`, where matches found via earlier transitions
+ /// are preferred over later transitions.
+ Union { alternates: Vec<StateID> },
+ /// An alternation such that there exists an epsilon transition to all
+ /// states in `alternates`, where matches found via later transitions
+ /// are preferred over earlier transitions.
+ ///
+ /// This "reverse" state exists for convenience during compilation that
+ /// permits easy construction of non-greedy combinations of NFA states.
+ /// At the end of compilation, Union and UnionReverse states are merged
+ /// into one Union type of state, where the latter has its epsilon
+ /// transitions reversed to reflect the priority inversion.
+ UnionReverse { alternates: Vec<StateID> },
+ /// A match state. There is exactly one such occurrence of this state in
+ /// an NFA.
+ Match,
+}
+
+/// A value that represents the result of compiling a sub-expression of a
+/// regex's HIR. Specifically, this represents a sub-graph of the NFA that
+/// has an initial state at `start` and a final state at `end`.
+#[derive(Clone, Copy, Debug)]
+pub struct ThompsonRef {
+ /// The entry state of this sub-graph.
+ start: StateID,
+ /// The exit state of this sub-graph; typically patched to point at the
+ /// next sub-graph (or the match state) once its successor is known.
+ end: StateID,
+}
+
+impl Compiler {
+ /// Create a new compiler.
+ pub fn new() -> Compiler {
+ Compiler {
+ states: RefCell::new(vec![]),
+ config: Config::default(),
+ utf8_state: RefCell::new(Utf8State::new()),
+ trie_state: RefCell::new(RangeTrie::new()),
+ utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)),
+ remap: RefCell::new(vec![]),
+ empties: RefCell::new(vec![]),
+ }
+ }
+
+ /// Clear any memory used by this compiler such that it is ready to compile
+ /// a new regex.
+ ///
+ /// It is preferable to reuse a compiler if possible in order to reuse
+ /// allocations.
+ fn clear(&self) {
+ self.states.borrow_mut().clear();
+ // We don't need to clear anything else since they are cleared on
+ // their own and only when they are used.
+ }
+
+ /// Configure this compiler from the builder's knobs.
+ ///
+ /// The compiler is always reconfigured by the builder before using it to
+ /// build an NFA.
+ fn configure(&mut self, config: Config) {
+ self.config = config;
+ }
+
+ /// Convert the current intermediate NFA to its final compiled form.
+ fn compile(&self, nfa: &mut NFA, expr: &Hir) -> Result<()> {
+ nfa.anchored = self.config.anchored;
+
+ let mut start = self.add_empty();
+ if !nfa.anchored {
+ // Unanchored searches get an implicit non-greedy `(?s:.)*?`
+ // prefix so a match may begin anywhere.
+ let compiled = if self.config.allow_invalid_utf8 {
+ self.c_unanchored_prefix_invalid_utf8()?
+ } else {
+ self.c_unanchored_prefix_valid_utf8()?
+ };
+ self.patch(start, compiled.start);
+ start = compiled.end;
+ }
+ let compiled = self.c(&expr)?;
+ let match_id = self.add_match();
+ self.patch(start, compiled.start);
+ self.patch(compiled.end, match_id);
+ self.finish(nfa);
+ Ok(())
+ }
+
+ /// Finishes the compilation process and populates the provided NFA with
+ /// the final graph.
+ fn finish(&self, nfa: &mut NFA) {
+ let mut bstates = self.states.borrow_mut();
+ let mut remap = self.remap.borrow_mut();
+ remap.resize(bstates.len(), 0);
+ let mut empties = self.empties.borrow_mut();
+ empties.clear();
+
+ // We don't reuse allocations here because this is what we're
+ // returning.
+ nfa.states.clear();
+ let mut byteset = ByteClassSet::new();
+
+ // The idea here is to convert our intermediate states to their final
+ // form. The only real complexity here is the process of converting
+ // transitions, which are expressed in terms of state IDs. The new
+ // set of states will be smaller because of partial epsilon removal,
+ // so the state IDs will not be the same.
+ for (id, bstate) in bstates.iter_mut().enumerate() {
+ match *bstate {
+ CState::Empty { next } => {
+ // Since we're removing empty states, we need to handle
+ // them later since we don't yet know which new state this
+ // empty state will be mapped to.
+ empties.push((id, next));
+ }
+ CState::Range { ref range } => {
+ remap[id] = nfa.states.len();
+ byteset.set_range(range.start, range.end);
+ nfa.states.push(State::Range { range: range.clone() });
+ }
+ CState::Sparse { ref mut ranges } => {
+ remap[id] = nfa.states.len();
+
+ let ranges = mem::replace(ranges, vec![]);
+ for r in &ranges {
+ byteset.set_range(r.start, r.end);
+ }
+ nfa.states.push(State::Sparse {
+ ranges: ranges.into_boxed_slice(),
+ });
+ }
+ CState::Union { ref mut alternates } => {
+ remap[id] = nfa.states.len();
+
+ let alternates = mem::replace(alternates, vec![]);
+ nfa.states.push(State::Union {
+ alternates: alternates.into_boxed_slice(),
+ });
+ }
+ CState::UnionReverse { ref mut alternates } => {
+ remap[id] = nfa.states.len();
+
+ // Reversing here is what collapses UnionReverse into a
+ // plain Union while preserving its inverted priority.
+ let mut alternates = mem::replace(alternates, vec![]);
+ alternates.reverse();
+ nfa.states.push(State::Union {
+ alternates: alternates.into_boxed_slice(),
+ });
+ }
+ CState::Match => {
+ remap[id] = nfa.states.len();
+ nfa.states.push(State::Match);
+ }
+ }
+ }
+ for &(empty_id, mut empty_next) in empties.iter() {
+ // empty states can point to other empty states, forming a chain.
+ // So we must follow the chain until the end, which must end at
+ // a non-empty state, and therefore, a state that is correctly
+ // remapped. We are guaranteed to terminate because our compiler
+ // never builds a loop among empty states.
+ while let CState::Empty { next } = bstates[empty_next] {
+ empty_next = next;
+ }
+ remap[empty_id] = remap[empty_next];
+ }
+ for state in &mut nfa.states {
+ state.remap(&remap);
+ }
+ // The compiler always begins the NFA at the first state.
+ nfa.start = remap[0];
+ nfa.byte_classes = byteset.byte_classes();
+ }
+
+ /// Compile the given HIR expression, returning the start and end states
+ /// of the corresponding NFA sub-graph.
+ fn c(&self, expr: &Hir) -> Result<ThompsonRef> {
+ match *expr.kind() {
+ HirKind::Empty => {
+ let id = self.add_empty();
+ Ok(ThompsonRef { start: id, end: id })
+ }
+ HirKind::Literal(hir::Literal::Unicode(ch)) => {
+ // A Unicode scalar value is compiled as the concatenation of
+ // its UTF-8 encoded bytes.
+ let mut buf = [0; 4];
+ let it = ch
+ .encode_utf8(&mut buf)
+ .as_bytes()
+ .iter()
+ .map(|&b| Ok(self.c_range(b, b)));
+ self.c_concat(it)
+ }
+ HirKind::Literal(hir::Literal::Byte(b)) => Ok(self.c_range(b, b)),
+ HirKind::Class(hir::Class::Bytes(ref cls)) => {
+ self.c_byte_class(cls)
+ }
+ HirKind::Class(hir::Class::Unicode(ref cls)) => {
+ self.c_unicode_class(cls)
+ }
+ HirKind::Repetition(ref rep) => self.c_repetition(rep),
+ HirKind::Group(ref group) => self.c(&*group.hir),
+ HirKind::Concat(ref exprs) => {
+ self.c_concat(exprs.iter().map(|e| self.c(e)))
+ }
+ HirKind::Alternation(ref exprs) => {
+ self.c_alternation(exprs.iter().map(|e| self.c(e)))
+ }
+ HirKind::Anchor(_) => Err(Error::unsupported_anchor()),
+ HirKind::WordBoundary(_) => Err(Error::unsupported_word()),
+ }
+ }
+
+ /// Compile a concatenation of already-lazily-compiled sub-expressions.
+ /// When the compiler is configured in reverse mode, the sub-expressions
+ /// are consumed (and therefore stitched together) back-to-front.
+ fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef>
+ where
+ I: DoubleEndedIterator<Item = Result<ThompsonRef>>,
+ {
+ let first =
+ if self.config.reverse { it.next_back() } else { it.next() };
+ let ThompsonRef { start, mut end } = match first {
+ Some(result) => result?,
+ None => return Ok(self.c_empty()),
+ };
+ loop {
+ let next =
+ if self.config.reverse { it.next_back() } else { it.next() };
+ let compiled = match next {
+ Some(result) => result?,
+ None => break,
+ };
+ self.patch(end, compiled.start);
+ end = compiled.end;
+ }
+ Ok(ThompsonRef { start, end })
+ }
+
+ /// Compile an alternation of already-lazily-compiled sub-expressions,
+ /// where earlier alternates are preferred over later ones.
+ fn c_alternation<I>(&self, mut it: I) -> Result<ThompsonRef>
+ where
+ I: Iterator<Item = Result<ThompsonRef>>,
+ {
+ let first = it.next().expect("alternations must be non-empty")?;
+ let second = match it.next() {
+ None => return Ok(first),
+ Some(result) => result?,
+ };
+
+ let union = self.add_union();
+ let end = self.add_empty();
+ self.patch(union, first.start);
+ self.patch(first.end, end);
+ self.patch(union, second.start);
+ self.patch(second.end, end);
+ for result in it {
+ let compiled = result?;
+ self.patch(union, compiled.start);
+ self.patch(compiled.end, end);
+ }
+ Ok(ThompsonRef { start: union, end })
+ }
+
+ /// Compile a repetition by dispatching on its kind (`?`, `*`, `+`,
+ /// `{n}`, `{n,}` or `{n,m}`).
+ fn c_repetition(&self, rep: &hir::Repetition) -> Result<ThompsonRef> {
+ match rep.kind {
+ hir::RepetitionKind::ZeroOrOne => {
+ self.c_zero_or_one(&rep.hir, rep.greedy)
+ }
+ hir::RepetitionKind::ZeroOrMore => {
+ self.c_at_least(&rep.hir, rep.greedy, 0)
+ }
+ hir::RepetitionKind::OneOrMore => {
+ self.c_at_least(&rep.hir, rep.greedy, 1)
+ }
+ hir::RepetitionKind::Range(ref rng) => match *rng {
+ hir::RepetitionRange::Exactly(count) => {
+ self.c_exactly(&rep.hir, count)
+ }
+ hir::RepetitionRange::AtLeast(m) => {
+ self.c_at_least(&rep.hir, rep.greedy, m)
+ }
+ hir::RepetitionRange::Bounded(min, max) => {
+ self.c_bounded(&rep.hir, rep.greedy, min, max)
+ }
+ },
+ }
+ }
+
+ /// Compile a bounded repetition `{min,max}` of `expr`.
+ fn c_bounded(
+ &self,
+ expr: &Hir,
+ greedy: bool,
+ min: u32,
+ max: u32,
+ ) -> Result<ThompsonRef> {
+ let prefix = self.c_exactly(expr, min)?;
+ if min == max {
+ return Ok(prefix);
+ }
+
+ // It is tempting here to compile the rest here as a concatenation
+ // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it
+ // were `aaa?a?a?`. The problem here is that it leads to this program:
+ //
+ // >000000: 61 => 01
+ // 000001: 61 => 02
+ // 000002: alt(03, 04)
+ // 000003: 61 => 04
+ // 000004: alt(05, 06)
+ // 000005: 61 => 06
+ // 000006: alt(07, 08)
+ // 000007: 61 => 08
+ // 000008: MATCH
+ //
+ // And effectively, once you hit state 2, the epsilon closure will
+ // include states 3, 5, 5, 6, 7 and 8, which is quite a bit. It is
+ // better to instead compile it like so:
+ //
+ // >000000: 61 => 01
+ // 000001: 61 => 02
+ // 000002: alt(03, 08)
+ // 000003: 61 => 04
+ // 000004: alt(05, 08)
+ // 000005: 61 => 06
+ // 000006: alt(07, 08)
+ // 000007: 61 => 08
+ // 000008: MATCH
+ //
+ // So that the epsilon closure of state 2 is now just 3 and 8.
+ let empty = self.add_empty();
+ let mut prev_end = prefix.end;
+ for _ in min..max {
+ let union = if greedy {
+ self.add_union()
+ } else {
+ self.add_reverse_union()
+ };
+ let compiled = self.c(expr)?;
+ self.patch(prev_end, union);
+ self.patch(union, compiled.start);
+ self.patch(union, empty);
+ prev_end = compiled.end;
+ }
+ self.patch(prev_end, empty);
+ Ok(ThompsonRef { start: prefix.start, end: empty })
+ }
+
+ /// Compile an unbounded repetition `{n,}` of `expr`. `n == 0`
+ /// corresponds to `*` and `n == 1` to `+`.
+ fn c_at_least(
+ &self,
+ expr: &Hir,
+ greedy: bool,
+ n: u32,
+ ) -> Result<ThompsonRef> {
+ if n == 0 {
+ let union = if greedy {
+ self.add_union()
+ } else {
+ self.add_reverse_union()
+ };
+ let compiled = self.c(expr)?;
+ self.patch(union, compiled.start);
+ self.patch(compiled.end, union);
+ Ok(ThompsonRef { start: union, end: union })
+ } else if n == 1 {
+ let compiled = self.c(expr)?;
+ let union = if greedy {
+ self.add_union()
+ } else {
+ self.add_reverse_union()
+ };
+ self.patch(compiled.end, union);
+ self.patch(union, compiled.start);
+ Ok(ThompsonRef { start: compiled.start, end: union })
+ } else {
+ // Compile `n - 1` mandatory copies, then one final copy that
+ // loops back on itself.
+ let prefix = self.c_exactly(expr, n - 1)?;
+ let last = self.c(expr)?;
+ let union = if greedy {
+ self.add_union()
+ } else {
+ self.add_reverse_union()
+ };
+ self.patch(prefix.end, last.start);
+ self.patch(last.end, union);
+ self.patch(union, last.start);
+ Ok(ThompsonRef { start: prefix.start, end: union })
+ }
+ }
+
+ /// Compile `expr?`, i.e., zero or one occurrence of `expr`.
+ fn c_zero_or_one(&self, expr: &Hir, greedy: bool) -> Result<ThompsonRef> {
+ let union =
+ if greedy { self.add_union() } else { self.add_reverse_union() };
+ let compiled = self.c(expr)?;
+ let empty = self.add_empty();
+ self.patch(union, compiled.start);
+ self.patch(union, empty);
+ self.patch(compiled.end, empty);
+ Ok(ThompsonRef { start: union, end: empty })
+ }
+
+ /// Compile exactly `n` concatenated copies of `expr`.
+ fn c_exactly(&self, expr: &Hir, n: u32) -> Result<ThompsonRef> {
+ let it = (0..n).map(|_| self.c(expr));
+ self.c_concat(it)
+ }
+
+ /// Compile a byte-oriented character class as a single sparse state.
+ fn c_byte_class(&self, cls: &hir::ClassBytes) -> Result<ThompsonRef> {
+ let end = self.add_empty();
+ let mut trans = Vec::with_capacity(cls.ranges().len());
+ for r in cls.iter() {
+ trans.push(Transition {
+ start: r.start(),
+ end: r.end(),
+ next: end,
+ });
+ }
+ Ok(ThompsonRef { start: self.add_sparse(trans), end })
+ }
+
+ /// Compile a Unicode character class into a byte-oriented UTF-8
+ /// automaton, choosing a strategy based on the class contents and the
+ /// compiler configuration (forward vs. reverse, shrink on/off).
+ fn c_unicode_class(&self, cls: &hir::ClassUnicode) -> Result<ThompsonRef> {
+ // If all we have are ASCII ranges wrapped in a Unicode package, then
+ // there is zero reason to bring out the big guns. We can fit all ASCII
+ // ranges within a single sparse transition.
+ if cls.is_all_ascii() {
+ let end = self.add_empty();
+ let mut trans = Vec::with_capacity(cls.ranges().len());
+ for r in cls.iter() {
+ assert!(r.start() <= '\x7F');
+ assert!(r.end() <= '\x7F');
+ trans.push(Transition {
+ start: r.start() as u8,
+ end: r.end() as u8,
+ next: end,
+ });
+ }
+ Ok(ThompsonRef { start: self.add_sparse(trans), end })
+ } else if self.config.reverse {
+ if !self.config.shrink {
+ // When we don't want to spend the extra time shrinking, we
+ // compile the UTF-8 automaton in reverse using something like
+ // the "naive" approach, but will attempt to re-use common
+ // suffixes.
+ self.c_unicode_class_reverse_with_suffix(cls)
+ } else {
+ // When we want to shrink our NFA for reverse UTF-8 automata,
+ // we cannot feed UTF-8 sequences directly to the UTF-8
+ // compiler, since the UTF-8 compiler requires all sequences
+ // to be lexicographically sorted. Instead, we organize our
+ // sequences into a range trie, which can then output our
+ // sequences in the correct order. Unfortunately, building the
+ // range trie is fairly expensive (but not nearly as expensive
+ // as building a DFA). Hence the reason why the 'shrink' option
+ // exists, so that this path can be toggled off.
+ let mut trie = self.trie_state.borrow_mut();
+ trie.clear();
+
+ for rng in cls.iter() {
+ for mut seq in Utf8Sequences::new(rng.start(), rng.end()) {
+ seq.reverse();
+ trie.insert(seq.as_slice());
+ }
+ }
+ let mut utf8_state = self.utf8_state.borrow_mut();
+ let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state);
+ trie.iter(|seq| {
+ utf8c.add(&seq);
+ });
+ Ok(utf8c.finish())
+ }
+ } else {
+ // In the forward direction, we always shrink our UTF-8 automata
+ // because we can stream it right into the UTF-8 compiler. There
+ // is almost no downside (in either memory or time) to using this
+ // approach.
+ let mut utf8_state = self.utf8_state.borrow_mut();
+ let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state);
+ for rng in cls.iter() {
+ for seq in Utf8Sequences::new(rng.start(), rng.end()) {
+ utf8c.add(seq.as_slice());
+ }
+ }
+ Ok(utf8c.finish())
+ }
+
+ // For reference, the code below is the "naive" version of compiling a
+ // UTF-8 automaton. It is deliciously simple (and works for both the
+ // forward and reverse cases), but will unfortunately produce very
+ // large NFAs. When compiling a forward automaton, the size difference
+ // can sometimes be an order of magnitude. For example, the '\w' regex
+ // will generate about ~3000 NFA states using the naive approach below,
+ // but only 283 states when using the approach above. This is because
+ // the approach above actually compiles a *minimal* (or near minimal,
+ // because of the bounded hashmap) UTF-8 automaton.
+ //
+ // The code below is kept as a reference point in order to make it
+ // easier to understand the higher level goal here.
+ /*
+ let it = cls
+ .iter()
+ .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end()))
+ .map(|seq| {
+ let it = seq
+ .as_slice()
+ .iter()
+ .map(|rng| Ok(self.c_range(rng.start, rng.end)));
+ self.c_concat(it)
+ });
+ self.c_alternation(it);
+ */
+ }
+
+ /// Compile a reverse UTF-8 automaton for a Unicode class without a range
+ /// trie, deduplicating common suffixes via a bounded cache.
+ fn c_unicode_class_reverse_with_suffix(
+ &self,
+ cls: &hir::ClassUnicode,
+ ) -> Result<ThompsonRef> {
+ // N.B. It would likely be better to cache common *prefixes* in the
+ // reverse direction, but it's not quite clear how to do that. The
+ // advantage of caching suffixes is that it does give us a win, and
+ // has a very small additional overhead.
+ let mut cache = self.utf8_suffix.borrow_mut();
+ cache.clear();
+
+ let union = self.add_union();
+ let alt_end = self.add_empty();
+ for urng in cls.iter() {
+ for seq in Utf8Sequences::new(urng.start(), urng.end()) {
+ // Build each byte sequence back-to-front, re-using any state
+ // previously built for the same (suffix, range) pair.
+ let mut end = alt_end;
+ for brng in seq.as_slice() {
+ let key = Utf8SuffixKey {
+ from: end,
+ start: brng.start,
+ end: brng.end,
+ };
+ let hash = cache.hash(&key);
+ if let Some(id) = cache.get(&key, hash) {
+ end = id;
+ continue;
+ }
+
+ let compiled = self.c_range(brng.start, brng.end);
+ self.patch(compiled.end, end);
+ end = compiled.start;
+ cache.set(key, hash, end);
+ }
+ self.patch(union, end);
+ }
+ }
+ Ok(ThompsonRef { start: union, end: alt_end })
+ }
+
+ /// Compile a single byte-range transition `[start, end]`.
+ fn c_range(&self, start: u8, end: u8) -> ThompsonRef {
+ let id = self.add_range(start, end);
+ ThompsonRef { start: id, end: id }
+ }
+
+ /// Compile the empty expression as a single empty state.
+ fn c_empty(&self) -> ThompsonRef {
+ let id = self.add_empty();
+ ThompsonRef { start: id, end: id }
+ }
+
+ /// Compile the implicit `(?s:.)*?` prefix used by unanchored searches
+ /// that must only match valid UTF-8.
+ fn c_unanchored_prefix_valid_utf8(&self) -> Result<ThompsonRef> {
+ self.c(&Hir::repetition(hir::Repetition {
+ kind: hir::RepetitionKind::ZeroOrMore,
+ greedy: false,
+ hir: Box::new(Hir::any(false)),
+ }))
+ }
+
+ /// Compile the implicit `(?s-u:.)*?` prefix used by unanchored searches
+ /// that are permitted to match invalid UTF-8.
+ fn c_unanchored_prefix_invalid_utf8(&self) -> Result<ThompsonRef> {
+ self.c(&Hir::repetition(hir::Repetition {
+ kind: hir::RepetitionKind::ZeroOrMore,
+ greedy: false,
+ hir: Box::new(Hir::any(true)),
+ }))
+ }
+
+ /// Set the outgoing transition of `from` to point at `to`. For union
+ /// states this appends an epsilon alternate; for a match state it is a
+ /// no-op; patching a sparse state is a bug and panics.
+ fn patch(&self, from: StateID, to: StateID) {
+ match self.states.borrow_mut()[from] {
+ CState::Empty { ref mut next } => {
+ *next = to;
+ }
+ CState::Range { ref mut range } => {
+ range.next = to;
+ }
+ CState::Sparse { .. } => {
+ panic!("cannot patch from a sparse NFA state")
+ }
+ CState::Union { ref mut alternates } => {
+ alternates.push(to);
+ }
+ CState::UnionReverse { ref mut alternates } => {
+ alternates.push(to);
+ }
+ CState::Match => {}
+ }
+ }
+
+ /// Add an empty (epsilon/goto) state and return its ID. Its target is
+ /// filled in later via `patch`.
+ fn add_empty(&self) -> StateID {
+ let id = self.states.borrow().len();
+ self.states.borrow_mut().push(CState::Empty { next: 0 });
+ id
+ }
+
+ /// Add a byte-range state `[start, end]` and return its ID. Its `next`
+ /// pointer is filled in later via `patch`.
+ fn add_range(&self, start: u8, end: u8) -> StateID {
+ let id = self.states.borrow().len();
+ let trans = Transition { start, end, next: 0 };
+ let state = CState::Range { start, end, next: 0 };
+ self.states.borrow_mut().push(state);
+ id
+ }
+
+ /// Add a sparse state with the given (already sorted) transitions and
+ /// return its ID. A single-transition set degrades to a Range state.
+ fn add_sparse(&self, ranges: Vec<Transition>) -> StateID {
+ if ranges.len() == 1 {
+ let id = self.states.borrow().len();
+ let state = CState::Range { range: ranges[0] };
+ self.states.borrow_mut().push(state);
+ return id;
+ }
+ let id = self.states.borrow().len();
+ let state = CState::Sparse { ranges };
+ self.states.borrow_mut().push(state);
+ id
+ }
+
+ /// Add an empty (greedy) union state and return its ID. Alternates are
+ /// appended later via `patch`.
+ fn add_union(&self) -> StateID {
+ let id = self.states.borrow().len();
+ let state = CState::Union { alternates: vec![] };
+ self.states.borrow_mut().push(state);
+ id
+ }
+
+ /// Add an empty reverse (non-greedy) union state and return its ID.
+ fn add_reverse_union(&self) -> StateID {
+ let id = self.states.borrow().len();
+ let state = CState::UnionReverse { alternates: vec![] };
+ self.states.borrow_mut().push(state);
+ id
+ }
+
+ /// Add the match state and return its ID.
+ fn add_match(&self) -> StateID {
+ let id = self.states.borrow().len();
+ self.states.borrow_mut().push(CState::Match);
+ id
+ }
+}
+
+/// A helper that incrementally compiles lexicographically sorted UTF-8 byte
+/// sequences into a minimal-ish byte automaton, re-using identical states
+/// via the bounded map in `Utf8State`.
+#[derive(Debug)]
+struct Utf8Compiler<'a> {
+ /// The NFA compiler into which finished states are emitted.
+ nfac: &'a Compiler,
+ /// Scratch state, borrowed from the compiler to amortize allocations.
+ state: &'a mut Utf8State,
+ /// The shared final state that every compiled sequence ends at.
+ target: StateID,
+}
+
+/// Reusable scratch state for `Utf8Compiler`.
+#[derive(Clone, Debug)]
+struct Utf8State {
+ /// A bounded map from compiled transition sets to their state IDs,
+ /// used to deduplicate identical states.
+ compiled: Utf8BoundedMap,
+ /// The stack of nodes not yet frozen into NFA states.
+ uncompiled: Vec<Utf8Node>,
+}
+
+/// A partially-built automaton node: finished transitions plus an optional
+/// last transition whose target is not yet known.
+#[derive(Clone, Debug)]
+struct Utf8Node {
+ trans: Vec<Transition>,
+ last: Option<Utf8LastTransition>,
+}
+
+/// A pending transition's byte range; its target state is supplied later by
+/// `Utf8Node::set_last_transition`.
+#[derive(Clone, Debug)]
+struct Utf8LastTransition {
+ start: u8,
+ end: u8,
+}
+
+impl Utf8State {
+ /// Create fresh scratch state with a bounded dedup map of 5000 entries.
+ fn new() -> Utf8State {
+ Utf8State { compiled: Utf8BoundedMap::new(5000), uncompiled: vec![] }
+ }
+
+ /// Reset the scratch state for a new character class compilation.
+ fn clear(&mut self) {
+ self.compiled.clear();
+ self.uncompiled.clear();
+ }
+}
+
+impl<'a> Utf8Compiler<'a> {
+ /// Create a new UTF-8 compiler that emits states into `nfac`, with a
+ /// freshly cleared scratch `state` and a new shared target state.
+ fn new(nfac: &'a Compiler, state: &'a mut Utf8State) -> Utf8Compiler<'a> {
+ let target = nfac.add_empty();
+ state.clear();
+ let mut utf8c = Utf8Compiler { nfac, state, target };
+ utf8c.add_empty();
+ utf8c
+ }
+
+ /// Flush all remaining uncompiled nodes and return the sub-graph's
+ /// start/end states.
+ fn finish(&mut self) -> ThompsonRef {
+ self.compile_from(0);
+ let node = self.pop_root();
+ let start = self.compile(node);
+ ThompsonRef { start, end: self.target }
+ }
+
+ /// Add the next (lexicographically larger) UTF-8 sequence of byte
+ /// ranges, sharing the longest common prefix with what is already on
+ /// the uncompiled stack.
+ fn add(&mut self, ranges: &[Utf8Range]) {
+ let prefix_len = ranges
+ .iter()
+ .zip(&self.state.uncompiled)
+ .take_while(|&(range, node)| {
+ node.last.as_ref().map_or(false, |t| {
+ (t.start, t.end) == (range.start, range.end)
+ })
+ })
+ .count();
+ assert!(prefix_len < ranges.len());
+ self.compile_from(prefix_len);
+ self.add_suffix(&ranges[prefix_len..]);
+ }
+
+ /// Freeze every uncompiled node deeper than `from` into NFA states,
+ /// threading each frozen node's ID into its parent's last transition.
+ fn compile_from(&mut self, from: usize) {
+ let mut next = self.target;
+ while from + 1 < self.state.uncompiled.len() {
+ let node = self.pop_freeze(next);
+ next = self.compile(node);
+ }
+ self.top_last_freeze(next);
+ }
+
+ /// Convert a finished transition set into an NFA state, deduplicating
+ /// via the bounded map.
+ fn compile(&mut self, node: Vec<Transition>) -> StateID {
+ let hash = self.state.compiled.hash(&node);
+ if let Some(id) = self.state.compiled.get(&node, hash) {
+ return id;
+ }
+ let id = self.nfac.add_sparse(node.clone());
+ self.state.compiled.set(node, hash, id);
+ id
+ }
+
+ /// Push the non-shared tail of a sequence onto the uncompiled stack.
+ fn add_suffix(&mut self, ranges: &[Utf8Range]) {
+ assert!(!ranges.is_empty());
+ let last = self
+ .state
+ .uncompiled
+ .len()
+ .checked_sub(1)
+ .expect("non-empty nodes");
+ assert!(self.state.uncompiled[last].last.is_none());
+ self.state.uncompiled[last].last = Some(Utf8LastTransition {
+ start: ranges[0].start,
+ end: ranges[0].end,
+ });
+ for r in &ranges[1..] {
+ self.state.uncompiled.push(Utf8Node {
+ trans: vec![],
+ last: Some(Utf8LastTransition { start: r.start, end: r.end }),
+ });
+ }
+ }
+
+ /// Push a fresh, empty node onto the uncompiled stack (the root).
+ fn add_empty(&mut self) {
+ self.state.uncompiled.push(Utf8Node { trans: vec![], last: None });
+ }
+
+ /// Pop the deepest uncompiled node, resolving its pending transition to
+ /// point at `next`, and return its finished transition set.
+ fn pop_freeze(&mut self, next: StateID) -> Vec<Transition> {
+ let mut uncompiled = self.state.uncompiled.pop().unwrap();
+ uncompiled.set_last_transition(next);
+ uncompiled.trans
+ }
+
+ /// Pop the root node; it must be the only node left and must have no
+ /// pending transition.
+ fn pop_root(&mut self) -> Vec<Transition> {
+ assert_eq!(self.state.uncompiled.len(), 1);
+ assert!(self.state.uncompiled[0].last.is_none());
+ self.state.uncompiled.pop().expect("non-empty nodes").trans
+ }
+
+ /// Resolve the pending transition of the deepest uncompiled node to
+ /// point at `next`, without popping it.
+ fn top_last_freeze(&mut self, next: StateID) {
+ let last = self
+ .state
+ .uncompiled
+ .len()
+ .checked_sub(1)
+ .expect("non-empty nodes");
+ self.state.uncompiled[last].set_last_transition(next);
+ }
+}
+
+impl Utf8Node {
+ /// Convert this node's pending last transition (if any) into a concrete
+ /// transition pointing at `next`.
+ fn set_last_transition(&mut self, next: StateID) {
+ if let Some(last) = self.last.take() {
+ self.trans.push(Transition {
+ start: last.start,
+ end: last.end,
+ next,
+ });
+ }
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use regex_syntax::hir::Hir;
+ use regex_syntax::ParserBuilder;
+
+ use super::{Builder, State, StateID, Transition, NFA};
+
+ /// Parse `pattern` into an HIR, panicking on invalid patterns.
+ fn parse(pattern: &str) -> Hir {
+ ParserBuilder::new().build().parse(pattern).unwrap()
+ }
+
+ /// Compile `pattern` into an anchored NFA, panicking on failure.
+ fn build(pattern: &str) -> NFA {
+ Builder::new().anchored(true).build(&parse(pattern)).unwrap()
+ }
+
+ /// Construct a Range state matching exactly `byte`, going to `next`.
+ fn s_byte(byte: u8, next: StateID) -> State {
+ let trans = Transition { start: byte, end: byte, next };
+ State::Range { range: trans }
+ }
+
+ /// Construct a Range state matching `[start, end]`, going to `next`.
+ fn s_range(start: u8, end: u8, next: StateID) -> State {
+ let trans = Transition { start, end, next };
+ State::Range { range: trans }
+ }
+
+ fn s_sparse(ranges: &[(u8, u8, StateID)]) -> State {
+ let ranges = ranges
+ .iter()
+ .map(|&(start, end, next)| Transition { start, end, next })
+ .collect();
+ State::Sparse { ranges }
+ }
+
+ fn s_union(alts: &[StateID]) -> State {
+ State::Union { alternates: alts.to_vec().into_boxed_slice() }
+ }
+
+ fn s_match() -> State {
+ State::Match
+ }
+
+ #[test]
+ fn errors() {
+ // unsupported anchors
+ assert!(Builder::new().build(&parse(r"^")).is_err());
+ assert!(Builder::new().build(&parse(r"$")).is_err());
+ assert!(Builder::new().build(&parse(r"\A")).is_err());
+ assert!(Builder::new().build(&parse(r"\z")).is_err());
+
+ // unsupported word boundaries
+ assert!(Builder::new().build(&parse(r"\b")).is_err());
+ assert!(Builder::new().build(&parse(r"\B")).is_err());
+ assert!(Builder::new().build(&parse(r"(?-u)\b")).is_err());
+ }
+
+ // Test that building an unanchored NFA has an appropriate `.*?` prefix.
+ #[test]
+ fn compile_unanchored_prefix() {
+ // When the machine can only match valid UTF-8.
+ let nfa = Builder::new().anchored(false).build(&parse(r"a")).unwrap();
+ // There should be many states since the `.` in `.*?` matches any
+ // Unicode scalar value.
+ assert_eq!(11, nfa.len());
+ assert_eq!(nfa.states[10], s_match());
+ assert_eq!(nfa.states[9], s_byte(b'a', 10));
+
+ // When the machine can match invalid UTF-8.
+ let nfa = Builder::new()
+ .anchored(false)
+ .allow_invalid_utf8(true)
+ .build(&parse(r"a"))
+ .unwrap();
+ assert_eq!(
+ nfa.states,
+ &[
+ s_union(&[2, 1]),
+ s_range(0, 255, 0),
+ s_byte(b'a', 3),
+ s_match(),
+ ]
+ );
+ }
+
+ #[test]
+ fn compile_empty() {
+ assert_eq!(build("").states, &[s_match(),]);
+ }
+
+ #[test]
+ fn compile_literal() {
+ assert_eq!(build("a").states, &[s_byte(b'a', 1), s_match(),]);
+ assert_eq!(
+ build("ab").states,
+ &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(),]
+ );
+ assert_eq!(
+ build("☃").states,
+ &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(),]
+ );
+
+ // Check that non-UTF-8 literals work.
+ let hir = ParserBuilder::new()
+ .allow_invalid_utf8(true)
+ .build()
+ .parse(r"(?-u)\xFF")
+ .unwrap();
+ let nfa = Builder::new()
+ .anchored(true)
+ .allow_invalid_utf8(true)
+ .build(&hir)
+ .unwrap();
+ assert_eq!(nfa.states, &[s_byte(b'\xFF', 1), s_match(),]);
+ }
+
+ #[test]
+ fn compile_class() {
+ assert_eq!(
+ build(r"[a-z]").states,
+ &[s_range(b'a', b'z', 1), s_match(),]
+ );
+ assert_eq!(
+ build(r"[x-za-c]").states,
+ &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match()]
+ );
+ assert_eq!(
+ build(r"[\u03B1-\u03B4]").states,
+ &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match()]
+ );
+ assert_eq!(
+ build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states,
+ &[
+ s_range(0xB1, 0xB4, 5),
+ s_range(0x99, 0x9E, 5),
+ s_byte(0xA4, 1),
+ s_byte(0x9F, 2),
+ s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]),
+ s_match(),
+ ]
+ );
+ assert_eq!(
+ build(r"[a-z☃]").states,
+ &[
+ s_byte(0x83, 3),
+ s_byte(0x98, 0),
+ s_sparse(&[(b'a', b'z', 3), (0xE2, 0xE2, 1)]),
+ s_match(),
+ ]
+ );
+ }
+
+ #[test]
+ fn compile_repetition() {
+ assert_eq!(
+ build(r"a?").states,
+ &[s_union(&[1, 2]), s_byte(b'a', 2), s_match(),]
+ );
+ assert_eq!(
+ build(r"a??").states,
+ &[s_union(&[2, 1]), s_byte(b'a', 2), s_match(),]
+ );
+ }
+
+ #[test]
+ fn compile_group() {
+ assert_eq!(
+ build(r"ab+").states,
+ &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[1, 3]), s_match(),]
+ );
+ assert_eq!(
+ build(r"(ab)").states,
+ &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(),]
+ );
+ assert_eq!(
+ build(r"(ab)+").states,
+ &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[0, 3]), s_match(),]
+ );
+ }
+
+ #[test]
+ fn compile_alternation() {
+ assert_eq!(
+ build(r"a|b").states,
+ &[s_byte(b'a', 3), s_byte(b'b', 3), s_union(&[0, 1]), s_match(),]
+ );
+ assert_eq!(
+ build(r"|b").states,
+ &[s_byte(b'b', 2), s_union(&[2, 0]), s_match(),]
+ );
+ assert_eq!(
+ build(r"a|").states,
+ &[s_byte(b'a', 2), s_union(&[0, 2]), s_match(),]
+ );
+ }
+}
diff --git a/src/nfa/map.rs b/src/nfa/map.rs
new file mode 100644
index 0000000..e636c0d
--- /dev/null
+++ b/src/nfa/map.rs
@@ -0,0 +1,282 @@
+// This module contains a couple simple and purpose built hash maps. The key
+// trade off they make is that they serve as caches rather than true maps. That
+// is, inserting a new entry may cause eviction of another entry. This gives
+// us two things. First, there's less overhead associated with inserts and
+// lookups. Secondly, it lets us control our memory usage.
+//
+// These maps are used in some fairly hot code when generating NFA states for
+// large Unicode character classes.
+//
+// Instead of exposing a rich hashmap entry API, we just permit the caller
+// to produce a hash of the key directly. The hash can then be reused for both
+// lookups and insertions at the cost of leaking things a bit. But these are
+// for internal use only, so it's fine.
+//
+// The Utf8BoundedMap is used for Daciuk's algorithm for constructing a
+// (almost) minimal DFA for large Unicode character classes in linear time.
+// (Daciuk's algorithm is always used when compiling forward NFAs. For reverse
+// NFAs, it's only used when the compiler is configured to 'shrink' the NFA,
+// since there's a bit more expense in the reverse direction.)
+//
+// The Utf8SuffixMap is used when compiling large Unicode character classes for
+// reverse NFAs when 'shrink' is disabled. Specifically, it augments the naive
+// construction of UTF-8 automata by caching common suffixes. This doesn't
+// get the same space savings as Daciuk's algorithm, but it's basically as
+// fast as the naive approach and typically winds up using less memory (since
+// it generates smaller NFAs) despite the presence of the cache.
+//
+// These maps effectively represent caching mechanisms for CState::Sparse and
+// CState::Range, respectively. The former represents a single NFA state with
+// many transitions of equivalent priority while the latter represents a single
+// NFA state with a single transition. (Neither state ever has or is an
+// epsilon transition.) Thus, they have different key types. It's likely we
+// could make one generic map, but the machinery didn't seem worth it. They
+// are simple enough.
+
+use nfa::{StateID, Transition};
+
+// Basic FNV-1a hash constants as described in:
+// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
+const PRIME: u64 = 1099511628211;
+const INIT: u64 = 14695981039346656037;
+
+/// A bounded hash map where the key is a sequence of NFA transitions and the
+/// value is a pre-existing NFA state ID.
+///
+/// std's hashmap can be used for this, however, this map has two important
+/// advantages. Firstly, it has lower overhead. Secondly, it permits us to
+/// control our memory usage by limiting the number of slots. In general, the
+/// cost here is that this map acts as a cache. That is, inserting a new entry
+/// may remove an old entry. We are okay with this, since it does not impact
+/// correctness in the cases where it is used. The only effect that dropping
+/// states from the cache has is that the resulting NFA generated may be bigger
+/// than it otherwise would be.
+///
+/// This improves benchmarks that compile large Unicode character classes,
+/// since it makes the generation of (almost) minimal UTF-8 automata faster.
+/// Specifically, one could observe the difference with std's hashmap via
+/// something like the following benchmark:
+///
+/// hyperfine "regex-automata-debug debug -acqr '\w{40} ecurB'"
+///
+/// But to observe that difference, you'd have to modify the code to use
+/// std's hashmap.
+///
+/// It is quite possible that there is a better way to approach this problem.
+/// For example, if there happens to be a very common state that collides with
+/// a lot of less frequent states, then we could wind up with very poor caching
+/// behavior. Alas, the effectiveness of this cache has not been measured.
+/// Instead, ad hoc experiments suggest that it is "good enough." Additional
+/// smarts (such as an LRU eviction policy) have to be weighed against the
+/// amount of extra time they cost.
+#[derive(Clone, Debug)]
+pub struct Utf8BoundedMap {
+ /// The current version of this map. Only entries with matching versions
+ /// are considered during lookups. If an entry is found with a mismatched
+ /// version, then the map behaves as if the entry does not exist.
+ version: u16,
+ /// The total number of entries this map can store.
+ capacity: usize,
+ /// The actual entries, keyed by hash. Collisions between different states
+ /// result in the old state being dropped.
+ map: Vec<Utf8BoundedEntry>,
+}
+
+/// An entry in this map.
+#[derive(Clone, Debug, Default)]
+struct Utf8BoundedEntry {
+ /// The version of the map used to produce this entry. If this entry's
+ /// version does not match the current version of the map, then the map
+ /// should behave as if this entry does not exist.
+ version: u16,
+ /// The key, which is a sorted sequence of non-overlapping NFA transitions.
+ key: Vec<Transition>,
+ /// The state ID corresponding to the state containing the transitions in
+ /// this entry.
+ val: StateID,
+}
+
+impl Utf8BoundedMap {
+ /// Create a new bounded map with the given capacity. The map will never
+ /// grow beyond the given size.
+ ///
+ /// Note that this does not allocate. Instead, callers must call `clear`
+ /// before using this map. `clear` will allocate space if necessary.
+ ///
+ /// This avoids the need to pay for the allocation of this map when
+ /// compiling regexes that lack large Unicode character classes.
+ pub fn new(capacity: usize) -> Utf8BoundedMap {
+ assert!(capacity > 0);
+ Utf8BoundedMap { version: 0, capacity, map: vec![] }
+ }
+
+ /// Clear this map of all entries, but permit the reuse of allocation
+ /// if possible.
+ ///
+ /// This must be called before the map can be used.
+ pub fn clear(&mut self) {
+ if self.map.is_empty() {
+ self.map = vec![Utf8BoundedEntry::default(); self.capacity];
+ } else {
+ self.version = self.version.wrapping_add(1);
+ if self.version == 0 {
+ self.map = vec![Utf8BoundedEntry::default(); self.capacity];
+ }
+ }
+ }
+
+ /// Return a hash of the given transitions.
+ pub fn hash(&self, key: &[Transition]) -> usize {
+ let mut h = INIT;
+ for t in key {
+ h = (h ^ (t.start as u64)).wrapping_mul(PRIME);
+ h = (h ^ (t.end as u64)).wrapping_mul(PRIME);
+ h = (h ^ (t.next as u64)).wrapping_mul(PRIME);
+ }
+ (h as usize) % self.map.len()
+ }
+
+ /// Retrieve the cached state ID corresponding to the given key. The hash
+ /// given must have been computed with `hash` using the same key value.
+ ///
+ /// If there is no cached state with the given transitions, then None is
+ /// returned.
+ pub fn get(&mut self, key: &[Transition], hash: usize) -> Option<StateID> {
+ let entry = &self.map[hash];
+ if entry.version != self.version {
+ return None;
+ }
+ // There may be a hash collision, so we need to confirm real equality.
+ if entry.key != key {
+ return None;
+ }
+ Some(entry.val)
+ }
+
+ /// Add a cached state to this map with the given key. Callers should
+ /// ensure that `state_id` points to a state that contains precisely the
+ /// NFA transitions given.
+ ///
+ /// `hash` must have been computed using the `hash` method with the same
+ /// key.
+ pub fn set(
+ &mut self,
+ key: Vec<Transition>,
+ hash: usize,
+ state_id: StateID,
+ ) {
+ self.map[hash] =
+ Utf8BoundedEntry { version: self.version, key, val: state_id };
+ }
+}
+
+/// A cache of suffixes used to modestly compress UTF-8 automata for large
+/// Unicode character classes.
+#[derive(Clone, Debug)]
+pub struct Utf8SuffixMap {
+ /// The current version of this map. Only entries with matching versions
+ /// are considered during lookups. If an entry is found with a mismatched
+ /// version, then the map behaves as if the entry does not exist.
+ version: u16,
+ /// The total number of entries this map can store.
+ capacity: usize,
+ /// The actual entries, keyed by hash. Collisions between different states
+ /// result in the old state being dropped.
+ map: Vec<Utf8SuffixEntry>,
+}
+
+/// A key that uniquely identifies an NFA state. It is a triple that represents
+/// a transition from one state for a particular byte range.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct Utf8SuffixKey {
+ pub from: StateID,
+ pub start: u8,
+ pub end: u8,
+}
+
+/// An entry in this map.
+#[derive(Clone, Debug, Default)]
+struct Utf8SuffixEntry {
+ /// The version of the map used to produce this entry. If this entry's
+ /// version does not match the current version of the map, then the map
+ /// should behave as if this entry does not exist.
+ version: u16,
+ /// The key, which consists of a transition in a particular state.
+ key: Utf8SuffixKey,
+ /// The identifier that the transition in the key maps to.
+ val: StateID,
+}
+
+impl Utf8SuffixMap {
+ /// Create a new bounded map with the given capacity. The map will never
+ /// grow beyond the given size.
+ ///
+ /// Note that this does not allocate. Instead, callers must call `clear`
+ /// before using this map. `clear` will allocate space if necessary.
+ ///
+ /// This avoids the need to pay for the allocation of this map when
+ /// compiling regexes that lack large Unicode character classes.
+ pub fn new(capacity: usize) -> Utf8SuffixMap {
+ assert!(capacity > 0);
+ Utf8SuffixMap { version: 0, capacity, map: vec![] }
+ }
+
+ /// Clear this map of all entries, but permit the reuse of allocation
+ /// if possible.
+ ///
+ /// This must be called before the map can be used.
+ pub fn clear(&mut self) {
+ if self.map.is_empty() {
+ self.map = vec![Utf8SuffixEntry::default(); self.capacity];
+ } else {
+ self.version = self.version.wrapping_add(1);
+ if self.version == 0 {
+ self.map = vec![Utf8SuffixEntry::default(); self.capacity];
+ }
+ }
+ }
+
+ /// Return a hash of the given transition.
+ pub fn hash(&self, key: &Utf8SuffixKey) -> usize {
+ // Basic FNV-1a hash as described:
+ // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
+ const PRIME: u64 = 1099511628211;
+ const INIT: u64 = 14695981039346656037;
+
+ let mut h = INIT;
+ h = (h ^ (key.from as u64)).wrapping_mul(PRIME);
+ h = (h ^ (key.start as u64)).wrapping_mul(PRIME);
+ h = (h ^ (key.end as u64)).wrapping_mul(PRIME);
+ (h as usize) % self.map.len()
+ }
+
+ /// Retrieve the cached state ID corresponding to the given key. The hash
+ /// given must have been computed with `hash` using the same key value.
+ ///
+ /// If there is no cached state with the given key, then None is returned.
+ pub fn get(
+ &mut self,
+ key: &Utf8SuffixKey,
+ hash: usize,
+ ) -> Option<StateID> {
+ let entry = &self.map[hash];
+ if entry.version != self.version {
+ return None;
+ }
+ if key != &entry.key {
+ return None;
+ }
+ Some(entry.val)
+ }
+
+ /// Add a cached state to this map with the given key. Callers should
+ /// ensure that `state_id` points to a state that contains precisely the
+ /// NFA transition given.
+ ///
+ /// `hash` must have been computed using the `hash` method with the same
+ /// key.
+ pub fn set(&mut self, key: Utf8SuffixKey, hash: usize, state_id: StateID) {
+ self.map[hash] =
+ Utf8SuffixEntry { version: self.version, key, val: state_id };
+ }
+}
diff --git a/src/nfa/mod.rs b/src/nfa/mod.rs
new file mode 100644
index 0000000..02d0501
--- /dev/null
+++ b/src/nfa/mod.rs
@@ -0,0 +1,252 @@
+use std::fmt;
+
+use classes::ByteClasses;
+pub use nfa::compiler::Builder;
+
+mod compiler;
+mod map;
+mod range_trie;
+
+/// The representation for an NFA state identifier.
+pub type StateID = usize;
+
+/// A final compiled NFA.
+///
+/// The states of the NFA are indexed by state IDs, which are how transitions
+/// are expressed.
+#[derive(Clone)]
+pub struct NFA {
+ /// Whether this NFA can only match at the beginning of input or not.
+ ///
+ /// When true, a match should only be reported if it begins at the 0th
+ /// index of the haystack.
+ anchored: bool,
+ /// The starting state of this NFA.
+ start: StateID,
+ /// The state list. This list is guaranteed to be indexable by the starting
+ /// state ID, and it is also guaranteed to contain exactly one `Match`
+ /// state.
+ states: Vec<State>,
+ /// A mapping from any byte value to its corresponding equivalence class
+ /// identifier. Two bytes in the same equivalence class cannot discriminate
+ /// between a match or a non-match. This map can be used to shrink the
+ /// total size of a DFA's transition table with a small match-time cost.
+ ///
+ /// Note that the NFA's transitions are *not* defined in terms of these
+ /// equivalence classes. The NFA's transitions are defined on the original
+ /// byte values. For the most part, this is because they wouldn't really
+ /// help the NFA much since the NFA already uses a sparse representation
+ /// to represent transitions. Byte classes are most effective in a dense
+ /// representation.
+ byte_classes: ByteClasses,
+}
+
+impl NFA {
+ /// Returns an NFA that always matches at every position.
+ pub fn always_match() -> NFA {
+ NFA {
+ anchored: false,
+ start: 0,
+ states: vec![State::Match],
+ byte_classes: ByteClasses::empty(),
+ }
+ }
+
+ /// Returns an NFA that never matches at any position.
+ pub fn never_match() -> NFA {
+ NFA {
+ anchored: false,
+ start: 0,
+ states: vec![State::Fail],
+ byte_classes: ByteClasses::empty(),
+ }
+ }
+
+ /// Returns true if and only if this NFA is anchored.
+ pub fn is_anchored(&self) -> bool {
+ self.anchored
+ }
+
+ /// Return the number of states in this NFA.
+ pub fn len(&self) -> usize {
+ self.states.len()
+ }
+
+ /// Return the ID of the initial state of this NFA.
+ pub fn start(&self) -> StateID {
+ self.start
+ }
+
+ /// Return the NFA state corresponding to the given ID.
+ pub fn state(&self, id: StateID) -> &State {
+ &self.states[id]
+ }
+
+ /// Return the set of equivalence classes for this NFA. The slice returned
+ /// always has length 256 and maps each possible byte value to its
+ /// corresponding equivalence class ID (which is never more than 255).
+ pub fn byte_classes(&self) -> &ByteClasses {
+ &self.byte_classes
+ }
+}
+
+impl fmt::Debug for NFA {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ for (i, state) in self.states.iter().enumerate() {
+ let status = if i == self.start { '>' } else { ' ' };
+ writeln!(f, "{}{:06}: {:?}", status, i, state)?;
+ }
+ Ok(())
+ }
+}
+
+/// A state in a final compiled NFA.
+#[derive(Clone, Eq, PartialEq)]
+pub enum State {
+ /// A state that transitions to `next` if and only if the current input
+ /// byte is in the range `[start, end]` (inclusive).
+ ///
+ /// This is a special case of Sparse in that it encodes only one transition
+ /// (and therefore avoids the allocation).
+ Range { range: Transition },
+ /// A state with possibly many transitions, represented in a sparse
+ /// fashion. Transitions are ordered lexicographically by input range.
+ /// As such, this may only be used when every transition has equal
+ /// priority. (In practice, this is only used for encoding large UTF-8
+ /// automata.)
+ Sparse { ranges: Box<[Transition]> },
+ /// An alternation such that there exists an epsilon transition to all
+ /// states in `alternates`, where matches found via earlier transitions
+ /// are preferred over later transitions.
+ Union { alternates: Box<[StateID]> },
+ /// A fail state. When encountered, the automaton is guaranteed to never
+ /// reach a match state.
+ Fail,
+ /// A match state. There is exactly one such occurrence of this state in
+ /// an NFA.
+ Match,
+}
+
+/// A transition to another state, only if the given byte falls in the
+/// inclusive range specified.
+#[derive(Clone, Copy, Eq, Hash, PartialEq)]
+pub struct Transition {
+ pub start: u8,
+ pub end: u8,
+ pub next: StateID,
+}
+
+impl State {
+ /// Returns true if and only if this state contains one or more epsilon
+ /// transitions.
+ pub fn is_epsilon(&self) -> bool {
+ match *self {
+ State::Range { .. }
+ | State::Sparse { .. }
+ | State::Fail
+ | State::Match => false,
+ State::Union { .. } => true,
+ }
+ }
+
+ /// Remap the transitions in this state using the given map. Namely, the
+ /// given map should be indexed according to the transitions currently
+ /// in this state.
+ ///
+ /// This is used during the final phase of the NFA compiler, which turns
+ /// its intermediate NFA into the final NFA.
+ fn remap(&mut self, remap: &[StateID]) {
+ match *self {
+ State::Range { ref mut range } => range.next = remap[range.next],
+ State::Sparse { ref mut ranges } => {
+ for r in ranges.iter_mut() {
+ r.next = remap[r.next];
+ }
+ }
+ State::Union { ref mut alternates } => {
+ for alt in alternates.iter_mut() {
+ *alt = remap[*alt];
+ }
+ }
+ State::Fail => {}
+ State::Match => {}
+ }
+ }
+}
+
+impl fmt::Debug for State {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ match *self {
+ State::Range { ref range } => range.fmt(f),
+ State::Sparse { ref ranges } => {
+ let rs = ranges
+ .iter()
+ .map(|t| format!("{:?}", t))
+ .collect::<Vec<String>>()
+ .join(", ");
+ write!(f, "sparse({})", rs)
+ }
+ State::Union { ref alternates } => {
+ let alts = alternates
+ .iter()
+ .map(|id| format!("{}", id))
+ .collect::<Vec<String>>()
+ .join(", ");
+ write!(f, "alt({})", alts)
+ }
+ State::Fail => write!(f, "FAIL"),
+ State::Match => write!(f, "MATCH"),
+ }
+ }
+}
+
+impl fmt::Debug for Transition {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ let Transition { start, end, next } = *self;
+ if self.start == self.end {
+ write!(f, "{} => {}", escape(start), next)
+ } else {
+ write!(f, "{}-{} => {}", escape(start), escape(end), next)
+ }
+ }
+}
+
+/// Return the given byte as its escaped string form.
+fn escape(b: u8) -> String {
+ use std::ascii;
+
+ String::from_utf8(ascii::escape_default(b).collect::<Vec<_>>()).unwrap()
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use dense;
+ use dfa::DFA;
+
+ #[test]
+ fn always_match() {
+ let nfa = NFA::always_match();
+ let dfa = dense::Builder::new().build_from_nfa::<usize>(&nfa).unwrap();
+
+ assert_eq!(Some(0), dfa.find_at(b"", 0));
+ assert_eq!(Some(0), dfa.find_at(b"a", 0));
+ assert_eq!(Some(1), dfa.find_at(b"a", 1));
+ assert_eq!(Some(0), dfa.find_at(b"ab", 0));
+ assert_eq!(Some(1), dfa.find_at(b"ab", 1));
+ assert_eq!(Some(2), dfa.find_at(b"ab", 2));
+ }
+
+ #[test]
+ fn never_match() {
+ let nfa = NFA::never_match();
+ let dfa = dense::Builder::new().build_from_nfa::<usize>(&nfa).unwrap();
+
+ assert_eq!(None, dfa.find_at(b"", 0));
+ assert_eq!(None, dfa.find_at(b"a", 0));
+ assert_eq!(None, dfa.find_at(b"a", 1));
+ assert_eq!(None, dfa.find_at(b"ab", 0));
+ assert_eq!(None, dfa.find_at(b"ab", 1));
+ assert_eq!(None, dfa.find_at(b"ab", 2));
+ }
+}
diff --git a/src/nfa/range_trie.rs b/src/nfa/range_trie.rs
new file mode 100644
index 0000000..50767c7
--- /dev/null
+++ b/src/nfa/range_trie.rs
@@ -0,0 +1,1048 @@
+// I've called the primary data structure in this module a "range trie." As far
+// as I can tell, there is no prior art on a data structure like this, however,
+// it's likely someone somewhere has built something like it. Searching for
+// "range trie" turns up the paper "Range Tries for Scalable Address Lookup,"
+// but it does not appear relevant.
+//
+// The range trie is just like a trie in that it is a special case of a
+// deterministic finite state machine. It has states and each state has a set
+// of transitions to other states. It is acyclic, and, like a normal trie,
+// it makes no attempt to reuse common suffixes among its elements. The key
+// difference between a normal trie and a range trie below is that a range trie
+// operates on *contiguous sequences* of bytes instead of singleton bytes.
+// One could say that our alphabet is ranges of bytes instead of bytes
+// themselves, except a key part of range trie construction is splitting ranges
+// apart to ensure there is at most one transition that can be taken for any
+// byte in a given state.
+//
+// I've tried to explain the details of how the range trie works below, so
+// for now, we are left with trying to understand what problem we're trying to
+// solve. Which is itself fairly involved!
+//
+// At the highest level, here's what we want to do. We want to convert a
+// sequence of Unicode codepoints into a finite state machine whose transitions
+// are over *bytes* and *not* Unicode codepoints. We want this because it makes
+// said finite state machines much smaller and much faster to execute. As a
+// simple example, consider a byte oriented automaton for all Unicode scalar
+// values (0x00 through 0x10FFFF, not including surrogate codepoints):
+//
+// [00-7F]
+// [C2-DF][80-BF]
+// [E0-E0][A0-BF][80-BF]
+// [E1-EC][80-BF][80-BF]
+// [ED-ED][80-9F][80-BF]
+// [EE-EF][80-BF][80-BF]
+// [F0-F0][90-BF][80-BF][80-BF]
+// [F1-F3][80-BF][80-BF][80-BF]
+// [F4-F4][80-8F][80-BF][80-BF]
+//
+// (These byte ranges are generated via the regex-syntax::utf8 module, which
+// was based on Russ Cox's code in RE2, which was in turn based on Ken
+// Thompson's implementation of the same idea in his Plan9 implementation of
+// grep.)
+//
+// It should be fairly straight-forward to see how one could compile this into
+// a DFA. The sequences are sorted and non-overlapping. Essentially, you could
+// build a trie from this fairly easily. The problem comes when your initial
+// range (in this case, 0x00-0x10FFFF) isn't so nice. For example, the class
+// represented by '\w' contains only a tenth of the codepoints that
+// 0x00-0x10FFFF contains, but if we were to write out the byte based ranges
+// as we did above, the list would stretch to 892 entries! This turns into
+// quite a large NFA with a few thousand states. Turning this beast into a DFA
+// takes quite a bit of time. We are thus left with trying to trim down the
+// number of states we produce as early as possible.
+//
+// One approach (used by RE2 and still by the regex crate, at time of writing)
+// is to try to find common suffixes while building NFA states for the above
+// and reuse them. This is very cheap to do and one can control precisely how
+// much extra memory you want to use for the cache.
+//
+// Another approach, however, is to reuse an algorithm for constructing a
+// *minimal* DFA from a sorted sequence of inputs. I don't want to go into
+// the full details here, but I explain it in more depth in my blog post on
+// FSTs[1]. Note that the algorithm not invented by me, but was published
+// in a paper by Daciuk et al. in 2000 called "Incremental Construction of
+// Minimal Acyclic Finite-State Automata." Like the suffix cache approach above,
+// it is also possible to control the amount of extra memory one uses, although
+// this usually comes with the cost of sacrificing true minimality. (But it's
+// typically close enough with a reasonably sized cache of states.)
+//
+// The catch is that Daciuk's algorithm only works if you add your keys in
+// lexicographic ascending order. In our case, since we're dealing with ranges,
+// we also need the additional requirement that ranges are either equivalent
+// or do not overlap at all. For example, if one were given the following byte
+// ranges:
+//
+// [BC-BF][80-BF]
+// [BC-BF][90-BF]
+//
+// Then Daciuk's algorithm also would not work, since there is nothing to
+// handle the fact that the ranges overlap. They would need to be split apart.
+// Thankfully, Thompson's algorithm for producing byte ranges for Unicode
+// codepoint ranges meets both of our requirements.
+//
+// ... however, we would also like to be able to compile UTF-8 automata in
+// reverse. We want this because in order to find the starting location of a
+// match using a DFA, we need to run a second DFA---a reversed version of the
+// forward DFA---backwards to discover the match location. Unfortunately, if
+// we reverse our byte sequences for 0x00-0x10FFFF, we get sequences that
+// can overlap, even if they are sorted:
+//
+// [00-7F]
+// [80-BF][80-9F][ED-ED]
+// [80-BF][80-BF][80-8F][F4-F4]
+// [80-BF][80-BF][80-BF][F1-F3]
+// [80-BF][80-BF][90-BF][F0-F0]
+// [80-BF][80-BF][E1-EC]
+// [80-BF][80-BF][EE-EF]
+// [80-BF][A0-BF][E0-E0]
+// [80-BF][C2-DF]
+//
+// For example, '[80-BF][80-BF][EE-EF]' and '[80-BF][A0-BF][E0-E0]' have
+// overlapping ranges between '[80-BF]' and '[A0-BF]'. Thus, there is no
+// simple way to apply Daciuk's algorithm.
+//
+// And thus, the range trie was born. The range trie's only purpose is to take
+// sequences of byte ranges like the ones above, collect them into a trie and
+// then spit them in a sorted fashion with no overlapping ranges. For example,
+// 0x00-0x10FFFF gets translated to:
+//
+// [0-7F]
+// [80-BF][80-9F][80-8F][F1-F3]
+// [80-BF][80-9F][80-8F][F4]
+// [80-BF][80-9F][90-BF][F0]
+// [80-BF][80-9F][90-BF][F1-F3]
+// [80-BF][80-9F][E1-EC]
+// [80-BF][80-9F][ED]
+// [80-BF][80-9F][EE-EF]
+// [80-BF][A0-BF][80-8F][F1-F3]
+// [80-BF][A0-BF][80-8F][F4]
+// [80-BF][A0-BF][90-BF][F0]
+// [80-BF][A0-BF][90-BF][F1-F3]
+// [80-BF][A0-BF][E0]
+// [80-BF][A0-BF][E1-EC]
+// [80-BF][A0-BF][EE-EF]
+// [80-BF][C2-DF]
+//
+// We've thus satisfied our requirements for running Daciuk's algorithm. All
+// sequences of ranges are sorted, and any corresponding ranges are either
+// exactly equivalent or non-overlapping.
+//
+// In effect, a range trie is building a DFA from a sequence of arbitrary
+// byte ranges. But it uses an algorithm custom tailored to its input, so it
+// is not as costly as traditional DFA construction. While it is still quite
+// a bit more costly than the forward's case (which only needs Daciuk's
+// algorithm), it winds up saving a substantial amount of time if one is doing
+// a full DFA powerset construction later by virtue of producing a much much
+// smaller NFA.
+//
+// [1] - https://blog.burntsushi.net/transducers/
+// [2] - https://www.mitpressjournals.org/doi/pdfplus/10.1162/089120100561601
+
+use std::cell::RefCell;
+use std::fmt;
+use std::mem;
+use std::ops::RangeInclusive;
+use std::u32;
+
+use regex_syntax::utf8::Utf8Range;
+
+/// A smaller state ID means more effective use of the CPU cache and less
+/// time spent copying. The implementation below will panic if the state ID
+/// space is exhausted, but in order for that to happen, the range trie itself
+/// would use well over 100GB of memory. Moreover, it's likely impossible
+/// for the state ID space to get that big. In fact, it's likely that even a
+/// u16 would be good enough here. But it's not quite clear how to prove this.
+type StateID = u32;
+
+/// There is only one final state in this trie. Every sequence of byte ranges
+/// added shares the same final state.
+const FINAL: StateID = 0;
+
+/// The root state of the trie.
+const ROOT: StateID = 1;
+
+/// A range trie represents an ordered set of sequences of bytes.
+///
+/// A range trie accepts as input a sequence of byte ranges and merges
+/// them into the existing set such that the trie can produce a sorted
+/// non-overlapping sequence of byte ranges. The sequence emitted corresponds
+/// precisely to the sequence of bytes matched by the given keys, although the
+/// byte ranges themselves may be split at different boundaries.
+///
+/// The order complexity of this data structure seems difficult to analyze.
+/// If the size of a byte is held as a constant, then insertion is clearly
+/// O(n) where n is the number of byte ranges in the input key. However, if
+/// k=256 is our alphabet size, then insertion could be O(k^2 * n). In
+/// particular it seems possible for pathological inputs to cause insertion
+/// to do a lot of work. However, for what we use this data structure for,
+/// there should be no pathological inputs since the ultimate source is always
+/// a sorted set of Unicode scalar value ranges.
+///
+/// Internally, this trie is setup like a finite state machine. Note though
+/// that it is acyclic.
+#[derive(Clone)]
+pub struct RangeTrie {
+ /// The states in this trie. The first is always the shared final state.
+ /// The second is always the root state. Otherwise, there is no
+ /// particular order.
+ states: Vec<State>,
+ /// A free-list of states. When a range trie is cleared, all of its states
+ /// are added to list. Creating a new state reuses states from this list
+ /// before allocating a new one.
+ free: Vec<State>,
+ /// A stack for traversing this trie to yield sequences of byte ranges in
+ /// lexicographic order.
+ iter_stack: RefCell<Vec<NextIter>>,
+    /// A buffer that stores the current sequence during iteration.
+ iter_ranges: RefCell<Vec<Utf8Range>>,
+ /// A stack used for traversing the trie in order to (deeply) duplicate
+ /// a state.
+ dupe_stack: Vec<NextDupe>,
+ /// A stack used for traversing the trie during insertion of a new
+ /// sequence of byte ranges.
+ insert_stack: Vec<NextInsert>,
+}
+
+/// A single state in this trie.
+#[derive(Clone)]
+struct State {
+ /// A sorted sequence of non-overlapping transitions to other states. Each
+ /// transition corresponds to a single range of bytes.
+ transitions: Vec<Transition>,
+}
+
+/// A transition is a single range of bytes. If a particular byte is in this
+/// range, then the corresponding machine may transition to the state pointed
+/// to by `next_id`.
+#[derive(Clone)]
+struct Transition {
+ /// The byte range.
+ range: Utf8Range,
+ /// The next state to transition to.
+ next_id: StateID,
+}
+
+impl RangeTrie {
+ /// Create a new empty range trie.
+ pub fn new() -> RangeTrie {
+ let mut trie = RangeTrie {
+ states: vec![],
+ free: vec![],
+ iter_stack: RefCell::new(vec![]),
+ iter_ranges: RefCell::new(vec![]),
+ dupe_stack: vec![],
+ insert_stack: vec![],
+ };
+ trie.clear();
+ trie
+ }
+
+ /// Clear this range trie such that it is empty. Clearing a range trie
+    /// and reusing it can be beneficial because this may reuse allocations.
+ pub fn clear(&mut self) {
+ self.free.extend(self.states.drain(..));
+ self.add_empty(); // final
+ self.add_empty(); // root
+ }
+
+ /// Iterate over all of the sequences of byte ranges in this trie, and
+ /// call the provided function for each sequence. Iteration occurs in
+ /// lexicographic order.
+ pub fn iter<F: FnMut(&[Utf8Range])>(&self, mut f: F) {
+ let mut stack = self.iter_stack.borrow_mut();
+ stack.clear();
+ let mut ranges = self.iter_ranges.borrow_mut();
+ ranges.clear();
+
+ // We do iteration in a way that permits us to use a single buffer
+ // for our keys. We iterate in a depth first fashion, while being
+ // careful to expand our frontier as we move deeper in the trie.
+ stack.push(NextIter { state_id: ROOT, tidx: 0 });
+ while let Some(NextIter { mut state_id, mut tidx }) = stack.pop() {
+ // This could be implemented more simply without an inner loop
+ // here, but at the cost of more stack pushes.
+ loop {
+ let state = self.state(state_id);
+                // If we've visited all transitions in this state, then pop
+ // back to the parent state.
+ if tidx >= state.transitions.len() {
+ ranges.pop();
+ break;
+ }
+
+ let t = &state.transitions[tidx];
+ ranges.push(t.range);
+ if t.next_id == FINAL {
+ f(&ranges);
+ ranges.pop();
+ tidx += 1;
+ } else {
+ // Expand our frontier. Once we come back to this state
+ // via the stack, start in on the next transition.
+ stack.push(NextIter { state_id, tidx: tidx + 1 });
+ // Otherwise, move to the first transition of the next
+ // state.
+ state_id = t.next_id;
+ tidx = 0;
+ }
+ }
+ }
+ }
+
+ /// Inserts a new sequence of ranges into this trie.
+ ///
+ /// The sequence given must be non-empty and must not have a length
+ /// exceeding 4.
+ pub fn insert(&mut self, ranges: &[Utf8Range]) {
+ assert!(!ranges.is_empty());
+ assert!(ranges.len() <= 4);
+
+ let mut stack = mem::replace(&mut self.insert_stack, vec![]);
+ stack.clear();
+
+ stack.push(NextInsert::new(ROOT, ranges));
+ while let Some(next) = stack.pop() {
+ let (state_id, ranges) = (next.state_id(), next.ranges());
+ assert!(!ranges.is_empty());
+
+ let (mut new, rest) = (ranges[0], &ranges[1..]);
+
+ // i corresponds to the position of the existing transition on
+ // which we are operating. Typically, the result is to remove the
+ // transition and replace it with two or more new transitions
+ // corresponding to the partitions generated by splitting the
+ // 'new' with the ith transition's range.
+ let mut i = self.state(state_id).find(new);
+
+ // In this case, there is no overlap *and* the new range is greater
+ // than all existing ranges. So we can just add it to the end.
+ if i == self.state(state_id).transitions.len() {
+ let next_id = NextInsert::push(self, &mut stack, rest);
+ self.add_transition(state_id, new, next_id);
+ continue;
+ }
+
+            // The need for this loop is a bit subtle, but basically, after
+ // we've handled the partitions from our initial split, it's
+ // possible that there will be a partition leftover that overlaps
+ // with a subsequent transition. If so, then we have to repeat
+ // the split process again with the leftovers and that subsequent
+ // transition.
+ 'OUTER: loop {
+ let old = self.state(state_id).transitions[i].clone();
+ let split = match Split::new(old.range, new) {
+ Some(split) => split,
+ None => {
+ let next_id = NextInsert::push(self, &mut stack, rest);
+ self.add_transition_at(i, state_id, new, next_id);
+ continue;
+ }
+ };
+ let splits = split.as_slice();
+ // If we only have one partition, then the ranges must be
+ // equivalent. There's nothing to do here for this state, so
+ // just move on to the next one.
+ if splits.len() == 1 {
+ // ... but only if we have anything left to do.
+ if !rest.is_empty() {
+ stack.push(NextInsert::new(old.next_id, rest));
+ }
+ break;
+ }
+ // At this point, we know that 'split' is non-empty and there
+ // must be some overlap AND that the two ranges are not
+ // equivalent. Therefore, the existing range MUST be removed
+ // and split up somehow. Instead of actually doing the removal
+ // and then a subsequent insertion---with all the memory
+ // shuffling that entails---we simply overwrite the transition
+ // at position `i` for the first new transition we want to
+ // insert. After that, we're forced to do expensive inserts.
+ let mut first = true;
+ let mut add_trans =
+ |trie: &mut RangeTrie, pos, from, range, to| {
+ if first {
+ trie.set_transition_at(pos, from, range, to);
+ first = false;
+ } else {
+ trie.add_transition_at(pos, from, range, to);
+ }
+ };
+ for (j, &srange) in splits.iter().enumerate() {
+ match srange {
+ SplitRange::Old(r) => {
+ // Deep clone the state pointed to by the ith
+ // transition. This is always necessary since 'old'
+ // is always coupled with at least a 'both'
+ // partition. We don't want any new changes made
+ // via the 'both' partition to impact the part of
+ // the transition that doesn't overlap with the
+ // new range.
+ let dup_id = self.duplicate(old.next_id);
+ add_trans(self, i, state_id, r, dup_id);
+ }
+ SplitRange::New(r) => {
+ // This is a bit subtle, but if this happens to be
+ // the last partition in our split, it is possible
+ // that this overlaps with a subsequent transition.
+ // If it does, then we must repeat the whole
+ // splitting process over again with `r` and the
+ // subsequent transition.
+ {
+ let trans = &self.state(state_id).transitions;
+ if j + 1 == splits.len()
+ && i < trans.len()
+ && intersects(r, trans[i].range)
+ {
+ new = r;
+ continue 'OUTER;
+ }
+ }
+
+ // ... otherwise, setup exploration for a new
+ // empty state and add a brand new transition for
+ // this new range.
+ let next_id =
+ NextInsert::push(self, &mut stack, rest);
+ add_trans(self, i, state_id, r, next_id);
+ }
+ SplitRange::Both(r) => {
+ // Continue adding the remaining ranges on this
+ // path and update the transition with the new
+ // range.
+ if !rest.is_empty() {
+ stack.push(NextInsert::new(old.next_id, rest));
+ }
+ add_trans(self, i, state_id, r, old.next_id);
+ }
+ }
+ i += 1;
+ }
+ // If we've reached this point, then we know that there are
+ // no subsequent transitions with any overlap. Therefore, we
+ // can stop processing this range and move on to the next one.
+ break;
+ }
+ }
+ self.insert_stack = stack;
+ }
+
+ pub fn add_empty(&mut self) -> StateID {
+ if self.states.len() as u64 > u32::MAX as u64 {
+ // This generally should not happen since a range trie is only
+ // ever used to compile a single sequence of Unicode scalar values.
+ // If we ever got to this point, we would, at *minimum*, be using
+ // 96GB in just the range trie alone.
+ panic!("too many sequences added to range trie");
+ }
+ let id = self.states.len() as StateID;
+ // If we have some free states available, then use them to avoid
+ // more allocations.
+ if let Some(mut state) = self.free.pop() {
+ state.clear();
+ self.states.push(state);
+ } else {
+ self.states.push(State { transitions: vec![] });
+ }
+ id
+ }
+
+ /// Performs a deep clone of the given state and returns the duplicate's
+ /// state ID.
+ ///
+ /// A "deep clone" in this context means that the state given along with
+ /// recursively all states that it points to are copied. Once complete,
+ /// the given state ID and the returned state ID share nothing.
+ ///
+ /// This is useful during range trie insertion when a new range overlaps
+ /// with an existing range that is bigger than the new one. The part of
+ /// the existing range that does *not* overlap with the new one is that
+ /// duplicated so that adding the new range to the overlap doesn't disturb
+ /// the non-overlapping portion.
+ ///
+ /// There's one exception: if old_id is the final state, then it is not
+ /// duplicated and the same final state is returned. This is because all
+ /// final states in this trie are equivalent.
+ fn duplicate(&mut self, old_id: StateID) -> StateID {
+ if old_id == FINAL {
+ return FINAL;
+ }
+
+ let mut stack = mem::replace(&mut self.dupe_stack, vec![]);
+ stack.clear();
+
+ let new_id = self.add_empty();
+ // old_id is the state we're cloning and new_id is the ID of the
+ // duplicated state for old_id.
+ stack.push(NextDupe { old_id, new_id });
+ while let Some(NextDupe { old_id, new_id }) = stack.pop() {
+ for i in 0..self.state(old_id).transitions.len() {
+ let t = self.state(old_id).transitions[i].clone();
+ if t.next_id == FINAL {
+ // All final states are the same, so there's no need to
+ // duplicate it.
+ self.add_transition(new_id, t.range, FINAL);
+ continue;
+ }
+
+ let new_child_id = self.add_empty();
+ self.add_transition(new_id, t.range, new_child_id);
+ stack.push(NextDupe {
+ old_id: t.next_id,
+ new_id: new_child_id,
+ });
+ }
+ }
+ self.dupe_stack = stack;
+ new_id
+ }
+
+ /// Adds the given transition to the given state.
+ ///
+ /// Callers must ensure that all previous transitions in this state
+ /// are lexicographically smaller than the given range.
+ fn add_transition(
+ &mut self,
+ from_id: StateID,
+ range: Utf8Range,
+ next_id: StateID,
+ ) {
+ self.state_mut(from_id)
+ .transitions
+ .push(Transition { range, next_id });
+ }
+
+ /// Like `add_transition`, except this inserts the transition just before
+ /// the ith transition.
+ fn add_transition_at(
+ &mut self,
+ i: usize,
+ from_id: StateID,
+ range: Utf8Range,
+ next_id: StateID,
+ ) {
+ self.state_mut(from_id)
+ .transitions
+ .insert(i, Transition { range, next_id });
+ }
+
+ /// Overwrites the transition at position i with the given transition.
+ fn set_transition_at(
+ &mut self,
+ i: usize,
+ from_id: StateID,
+ range: Utf8Range,
+ next_id: StateID,
+ ) {
+ self.state_mut(from_id).transitions[i] = Transition { range, next_id };
+ }
+
+ /// Return an immutable borrow for the state with the given ID.
+ fn state(&self, id: StateID) -> &State {
+ &self.states[id as usize]
+ }
+
+ /// Return a mutable borrow for the state with the given ID.
+ fn state_mut(&mut self, id: StateID) -> &mut State {
+ &mut self.states[id as usize]
+ }
+}
+
+impl State {
+ /// Find the position at which the given range should be inserted in this
+ /// state.
+ ///
+ /// The position returned is always in the inclusive range
+ /// [0, transitions.len()]. If 'transitions.len()' is returned, then the
+ /// given range overlaps with no other range in this state *and* is greater
+ /// than all of them.
+ ///
+ /// For all other possible positions, the given range either overlaps
+ /// with the transition at that position or is otherwise less than it
+ /// with no overlap (and is greater than the previous transition). In the
+ /// former case, careful attention must be paid to inserting this range
+ /// as a new transition. In the latter case, the range can be inserted as
+ /// a new transition at the given position without disrupting any other
+ /// transitions.
+ fn find(&self, range: Utf8Range) -> usize {
+ /// Returns the position `i` at which `pred(xs[i])` first returns true
+ /// such that for all `j >= i`, `pred(xs[j]) == true`. If `pred` never
+ /// returns true, then `xs.len()` is returned.
+ ///
+ /// We roll our own binary search because it doesn't seem like the
+ /// standard library's binary search can be used here. Namely, if
+ /// there is an overlapping range, then we want to find the first such
+ /// occurrence, but there may be many. Or at least, it's not quite
+ /// clear to me how to do it.
+ fn binary_search<T, F>(xs: &[T], mut pred: F) -> usize
+ where
+ F: FnMut(&T) -> bool,
+ {
+ let (mut left, mut right) = (0, xs.len());
+ while left < right {
+ // Overflow is impossible because xs.len() <= 256.
+ let mid = (left + right) / 2;
+ if pred(&xs[mid]) {
+ right = mid;
+ } else {
+ left = mid + 1;
+ }
+ }
+ left
+ }
+
+ // Benchmarks suggest that binary search is just a bit faster than
+ // straight linear search. Specifically when using the debug tool:
+ //
+ // hyperfine "regex-automata-debug debug -acqr '\w{40} ecurB'"
+ binary_search(&self.transitions, |t| range.start <= t.range.end)
+ }
+
+ /// Clear this state such that it has zero transitions.
+ fn clear(&mut self) {
+ self.transitions.clear();
+ }
+}
+
+/// The next state to process during duplication.
+#[derive(Clone, Debug)]
+struct NextDupe {
+ /// The state we want to duplicate.
+ old_id: StateID,
+ /// The ID of the new state that is a duplicate of old_id.
+ new_id: StateID,
+}
+
+/// The next state (and its corresponding transition) that we want to visit
+/// during iteration in lexicographic order.
+#[derive(Clone, Debug)]
+struct NextIter {
+ state_id: StateID,
+ tidx: usize,
+}
+
+/// The next state to process during insertion and any remaining ranges that we
+/// want to add for a particular sequence of ranges. The first such instance
+/// is always the root state along with all ranges given.
+#[derive(Clone, Debug)]
+struct NextInsert {
+ /// The next state to begin inserting ranges. This state should be the
+ /// state at which `ranges[0]` should be inserted.
+ state_id: StateID,
+ /// The ranges to insert. We used a fixed-size array here to avoid an
+ /// allocation.
+ ranges: [Utf8Range; 4],
+ /// The number of valid ranges in the above array.
+ len: u8,
+}
+
+impl NextInsert {
+ /// Create the next item to visit. The given state ID should correspond
+ /// to the state at which the first range in the given slice should be
+ /// inserted. The slice given must not be empty and it must be no longer
+ /// than 4.
+ fn new(state_id: StateID, ranges: &[Utf8Range]) -> NextInsert {
+ let len = ranges.len();
+ assert!(len > 0);
+ assert!(len <= 4);
+
+ let mut tmp = [Utf8Range { start: 0, end: 0 }; 4];
+ tmp[..len].copy_from_slice(ranges);
+ NextInsert { state_id, ranges: tmp, len: len as u8 }
+ }
+
+ /// Push a new empty state to visit along with any remaining ranges that
+ /// still need to be inserted. The ID of the new empty state is returned.
+ ///
+ /// If ranges is empty, then no new state is created and FINAL is returned.
+ fn push(
+ trie: &mut RangeTrie,
+ stack: &mut Vec<NextInsert>,
+ ranges: &[Utf8Range],
+ ) -> StateID {
+ if ranges.is_empty() {
+ FINAL
+ } else {
+ let next_id = trie.add_empty();
+ stack.push(NextInsert::new(next_id, ranges));
+ next_id
+ }
+ }
+
+ /// Return the ID of the state to visit.
+ fn state_id(&self) -> StateID {
+ self.state_id
+ }
+
+ /// Return the remaining ranges to insert.
+ fn ranges(&self) -> &[Utf8Range] {
+ &self.ranges[..self.len as usize]
+ }
+}
+
+/// Split represents a partitioning of two ranges into one or more ranges. This
+/// is the secret sauce that makes a range trie work, as it's what tells us
+/// how to deal with two overlapping but unequal ranges during insertion.
+///
+/// Essentially, either two ranges overlap or they don't. If they don't, then
+/// handling insertion is easy: just insert the new range into its
+/// lexicographically correct position. Since it does not overlap with anything
+/// else, no other transitions are impacted by the new range.
+///
+/// If they do overlap though, there are generally three possible cases to
+/// handle:
+///
+/// 1. The part where the two ranges actually overlap. i.e., The intersection.
+/// 2. The part of the existing range that is not in the new range.
+/// 3. The part of the new range that is not in the old range.
+///
+/// (1) is guaranteed to always occur since all overlapping ranges have a
+/// non-empty intersection. If the two ranges are not equivalent, then at
+/// least one of (2) or (3) is guaranteed to occur as well. In some cases,
+/// e.g., `[0-4]` and `[4-9]`, all three cases will occur.
+///
+/// This `Split` type is responsible for providing (1), (2) and (3) for any
+/// possible pair of byte ranges.
+///
+/// As for insertion, for the overlap in (1), the remaining ranges to insert
+/// should be added by following the corresponding transition. However, this
+/// should only be done for the overlapping parts of the range. If there was
+/// a part of the existing range that was not in the new range, then that
+/// existing part must be split off from the transition and duplicated. The
+/// remaining parts of the overlap can then be added to using the new ranges
+/// without disturbing the existing range.
+///
+/// Handling the case for the part of a new range that is not in an existing
+/// range is seemingly easy. Just treat it as if it were a non-overlapping
+/// range. The problem here is that if this new non-overlapping range occurs
+/// after both (1) and (2), then it's possible that it can overlap with the
+/// next transition in the current state. If it does, then the whole process
+/// must be repeated!
+///
+/// # Details of the 3 cases
+///
+/// The following details the various cases that are implemented in code
+/// below. It's plausible that the number of cases is not actually minimal,
+/// but it's important for this code to remain at least somewhat readable.
+///
+/// Given [a,b] and [x,y], where a <= b, x <= y, b < 256 and y < 256, we define
+/// the following distinct relationships where at least one must apply. The order
+/// of these matters, since multiple can match. The first to match applies.
+///
+/// 1. b < x <=> [a,b] < [x,y]
+/// 2. y < a <=> [x,y] < [a,b]
+///
+/// In the case of (1) and (2), these are the only cases where there is no
+/// overlap. Or otherwise, the intersection of [a,b] and [x,y] is empty. In
+/// order to compute the intersection, one can do [max(a,x), min(b,y)]. The
+/// intersection in all of the following cases is non-empty.
+///
+/// 3. a = x && b = y <=> [a,b] == [x,y]
+/// 4. a = x && b < y <=> [x,y] right-extends [a,b]
+/// 5. b = y && a > x <=> [x,y] left-extends [a,b]
+/// 6. x = a && y < b <=> [a,b] right-extends [x,y]
+/// 7. y = b && x > a <=> [a,b] left-extends [x,y]
+/// 8. a > x && b < y <=> [x,y] covers [a,b]
+/// 9. x > a && y < b <=> [a,b] covers [x,y]
+/// 10. b = x && a < y <=> [a,b] is left-adjacent to [x,y]
+/// 11. y = a && x < b <=> [x,y] is left-adjacent to [a,b]
+/// 12. b > x && b < y <=> [a,b] left-overlaps [x,y]
+/// 13. y > a && y < b <=> [x,y] left-overlaps [a,b]
+///
+/// In cases 3-13, we can form rules that partition the ranges into a
+/// non-overlapping ordered sequence of ranges:
+///
+/// 3. [a,b]
+/// 4. [a,b], [b+1,y]
+/// 5. [x,a-1], [a,b]
+/// 6. [x,y], [y+1,b]
+/// 7. [a,x-1], [x,y]
+/// 8. [x,a-1], [a,b], [b+1,y]
+/// 9. [a,x-1], [x,y], [y+1,b]
+/// 10. [a,b-1], [b,b], [b+1,y]
+/// 11. [x,y-1], [y,y], [y+1,b]
+/// 12. [a,x-1], [x,b], [b+1,y]
+/// 13. [x,a-1], [a,y], [y+1,b]
+///
+/// In the code below, we go a step further and identify each of the above
+/// outputs as belonging either to the overlap of the two ranges or to one
+/// of [a,b] or [x,y] exclusively.
+#[derive(Clone, Debug, Eq, PartialEq)]
+struct Split {
+ partitions: [SplitRange; 3],
+ len: usize,
+}
+
+/// A tagged range indicating how it was derived from a pair of ranges.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+enum SplitRange {
+ Old(Utf8Range),
+ New(Utf8Range),
+ Both(Utf8Range),
+}
+
+impl Split {
+ /// Create a partitioning of the given ranges.
+ ///
+ /// If the given ranges have an empty intersection, then None is returned.
+ fn new(o: Utf8Range, n: Utf8Range) -> Option<Split> {
+ let range = |r: RangeInclusive<u8>| Utf8Range {
+ start: *r.start(),
+ end: *r.end(),
+ };
+ let old = |r| SplitRange::Old(range(r));
+ let new = |r| SplitRange::New(range(r));
+ let both = |r| SplitRange::Both(range(r));
+
+ // Use same names as the comment above to make it easier to compare.
+ let (a, b, x, y) = (o.start, o.end, n.start, n.end);
+
+ if b < x || y < a {
+ // case 1, case 2
+ None
+ } else if a == x && b == y {
+ // case 3
+ Some(Split::parts1(both(a..=b)))
+ } else if a == x && b < y {
+ // case 4
+ Some(Split::parts2(both(a..=b), new(b + 1..=y)))
+ } else if b == y && a > x {
+ // case 5
+ Some(Split::parts2(new(x..=a - 1), both(a..=b)))
+ } else if x == a && y < b {
+ // case 6
+ Some(Split::parts2(both(x..=y), old(y + 1..=b)))
+ } else if y == b && x > a {
+ // case 7
+ Some(Split::parts2(old(a..=x - 1), both(x..=y)))
+ } else if a > x && b < y {
+ // case 8
+ Some(Split::parts3(new(x..=a - 1), both(a..=b), new(b + 1..=y)))
+ } else if x > a && y < b {
+ // case 9
+ Some(Split::parts3(old(a..=x - 1), both(x..=y), old(y + 1..=b)))
+ } else if b == x && a < y {
+ // case 10
+ Some(Split::parts3(old(a..=b - 1), both(b..=b), new(b + 1..=y)))
+ } else if y == a && x < b {
+ // case 11
+ Some(Split::parts3(new(x..=y - 1), both(y..=y), old(y + 1..=b)))
+ } else if b > x && b < y {
+ // case 12
+ Some(Split::parts3(old(a..=x - 1), both(x..=b), new(b + 1..=y)))
+ } else if y > a && y < b {
+ // case 13
+ Some(Split::parts3(new(x..=a - 1), both(a..=y), old(y + 1..=b)))
+ } else {
+ unreachable!()
+ }
+ }
+
+ /// Create a new split with a single partition. This only occurs when two
+ /// ranges are equivalent.
+ fn parts1(r1: SplitRange) -> Split {
+ // This value doesn't matter since it is never accessed: `len` is 1,
+ // so `as_slice` never exposes the filler partitions.
+ let nada = SplitRange::Old(Utf8Range { start: 0, end: 0 });
+ Split { partitions: [r1, nada, nada], len: 1 }
+ }
+
+ /// Create a new split with two partitions.
+ fn parts2(r1: SplitRange, r2: SplitRange) -> Split {
+ // This value doesn't matter since it is never accessed: `len` is 2,
+ // so `as_slice` never exposes the filler partition.
+ let nada = SplitRange::Old(Utf8Range { start: 0, end: 0 });
+ Split { partitions: [r1, r2, nada], len: 2 }
+ }
+
+ /// Create a new split with three partitions. All slots are used, so no
+ /// filler value is needed.
+ fn parts3(r1: SplitRange, r2: SplitRange, r3: SplitRange) -> Split {
+ Split { partitions: [r1, r2, r3], len: 3 }
+ }
+
+ /// Return the partitions in this split as a slice. Only the first `len`
+ /// slots are meaningful; any filler values beyond that are excluded.
+ fn as_slice(&self) -> &[SplitRange] {
+ &self.partitions[..self.len]
+ }
+}
+
+impl fmt::Debug for RangeTrie {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ // Emit a leading newline so the state listing starts on its own line.
+ // NOTE(review): `writeln!(f)` is the more idiomatic form here.
+ writeln!(f, "")?;
+ for (i, state) in self.states.iter().enumerate() {
+ // Mark the FINAL state with a '*' so it stands out in the dump.
+ let status = if i == FINAL as usize { '*' } else { ' ' };
+ writeln!(f, "{}{:06}: {:?}", status, i, state)?;
+ }
+ Ok(())
+ }
+}
+
+impl fmt::Debug for State {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ // Render a state as a comma separated list of its transitions.
+ let rs = self
+ .transitions
+ .iter()
+ .map(|t| format!("{:?}", t))
+ .collect::<Vec<String>>()
+ .join(", ");
+ write!(f, "{}", rs)
+ }
+}
+
+impl fmt::Debug for Transition {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ if self.range.start == self.range.end {
+ // A one-byte range is printed as a single hex byte.
+ write!(f, "{:02X} => {:02X}", self.range.start, self.next_id)
+ } else {
+ // A multi-byte range is printed as an inclusive hex span.
+ write!(
+ f,
+ "{:02X}-{:02X} => {:02X}",
+ self.range.start, self.range.end, self.next_id
+ )
+ }
+ }
+}
+
+/// Returns true if and only if the given ranges intersect.
+///
+/// Both ranges are treated as inclusive on both endpoints.
+fn intersects(r1: Utf8Range, r2: Utf8Range) -> bool {
+ !(r1.end < r2.start || r2.end < r1.start)
+}
+
+#[cfg(test)]
+mod tests {
+ use std::ops::RangeInclusive;
+
+ use regex_syntax::utf8::Utf8Range;
+
+ use super::*;
+
+ /// Convenience for building a Utf8Range from an inclusive byte range.
+ fn r(range: RangeInclusive<u8>) -> Utf8Range {
+ Utf8Range { start: *range.start(), end: *range.end() }
+ }
+
+ /// Compute the split of `old` and `new`, or None if they don't
+ /// intersect.
+ fn split_maybe(
+ old: RangeInclusive<u8>,
+ new: RangeInclusive<u8>,
+ ) -> Option<Split> {
+ Split::new(r(old), r(new))
+ }
+
+ /// Like `split_maybe`, but unwraps the result and returns the
+ /// partitions as an owned Vec for easy comparison.
+ fn split(
+ old: RangeInclusive<u8>,
+ new: RangeInclusive<u8>,
+ ) -> Vec<SplitRange> {
+ split_maybe(old, new).unwrap().as_slice().to_vec()
+ }
+
+ #[test]
+ fn no_splits() {
+ // case 1
+ assert_eq!(None, split_maybe(0..=1, 2..=3));
+ // case 2
+ assert_eq!(None, split_maybe(2..=3, 0..=1));
+ }
+
+ #[test]
+ fn splits() {
+ let range = |r: RangeInclusive<u8>| Utf8Range {
+ start: *r.start(),
+ end: *r.end(),
+ };
+ let old = |r| SplitRange::Old(range(r));
+ let new = |r| SplitRange::New(range(r));
+ let both = |r| SplitRange::Both(range(r));
+
+ // case 3
+ assert_eq!(split(0..=0, 0..=0), vec![both(0..=0)]);
+ assert_eq!(split(9..=9, 9..=9), vec![both(9..=9)]);
+
+ // case 4
+ assert_eq!(split(0..=5, 0..=6), vec![both(0..=5), new(6..=6)]);
+ assert_eq!(split(0..=5, 0..=8), vec![both(0..=5), new(6..=8)]);
+ assert_eq!(split(5..=5, 5..=8), vec![both(5..=5), new(6..=8)]);
+
+ // case 5
+ assert_eq!(split(1..=5, 0..=5), vec![new(0..=0), both(1..=5)]);
+ assert_eq!(split(3..=5, 0..=5), vec![new(0..=2), both(3..=5)]);
+ assert_eq!(split(5..=5, 0..=5), vec![new(0..=4), both(5..=5)]);
+
+ // case 6
+ assert_eq!(split(0..=6, 0..=5), vec![both(0..=5), old(6..=6)]);
+ assert_eq!(split(0..=8, 0..=5), vec![both(0..=5), old(6..=8)]);
+ assert_eq!(split(5..=8, 5..=5), vec![both(5..=5), old(6..=8)]);
+
+ // case 7
+ assert_eq!(split(0..=5, 1..=5), vec![old(0..=0), both(1..=5)]);
+ assert_eq!(split(0..=5, 3..=5), vec![old(0..=2), both(3..=5)]);
+ assert_eq!(split(0..=5, 5..=5), vec![old(0..=4), both(5..=5)]);
+
+ // case 8
+ assert_eq!(
+ split(3..=6, 2..=7),
+ vec![new(2..=2), both(3..=6), new(7..=7)],
+ );
+ assert_eq!(
+ split(3..=6, 1..=8),
+ vec![new(1..=2), both(3..=6), new(7..=8)],
+ );
+
+ // case 9
+ assert_eq!(
+ split(2..=7, 3..=6),
+ vec![old(2..=2), both(3..=6), old(7..=7)],
+ );
+ assert_eq!(
+ split(1..=8, 3..=6),
+ vec![old(1..=2), both(3..=6), old(7..=8)],
+ );
+
+ // case 10
+ assert_eq!(
+ split(3..=6, 6..=7),
+ vec![old(3..=5), both(6..=6), new(7..=7)],
+ );
+ assert_eq!(
+ split(3..=6, 6..=8),
+ vec![old(3..=5), both(6..=6), new(7..=8)],
+ );
+ assert_eq!(
+ split(5..=6, 6..=7),
+ vec![old(5..=5), both(6..=6), new(7..=7)],
+ );
+
+ // case 11
+ assert_eq!(
+ split(6..=7, 3..=6),
+ vec![new(3..=5), both(6..=6), old(7..=7)],
+ );
+ assert_eq!(
+ split(6..=8, 3..=6),
+ vec![new(3..=5), both(6..=6), old(7..=8)],
+ );
+ assert_eq!(
+ split(6..=7, 5..=6),
+ vec![new(5..=5), both(6..=6), old(7..=7)],
+ );
+
+ // case 12
+ assert_eq!(
+ split(3..=7, 5..=9),
+ vec![old(3..=4), both(5..=7), new(8..=9)],
+ );
+ assert_eq!(
+ split(3..=5, 4..=6),
+ vec![old(3..=3), both(4..=5), new(6..=6)],
+ );
+
+ // case 13
+ assert_eq!(
+ split(5..=9, 3..=7),
+ vec![new(3..=4), both(5..=7), old(8..=9)],
+ );
+ assert_eq!(
+ split(4..=6, 3..=5),
+ vec![new(3..=3), both(4..=5), old(6..=6)],
+ );
+ }
+
+ // Arguably there should be more tests here, but in practice, this data
+ // structure is well covered by the huge number of regex tests.
+}
diff --git a/src/regex.rs b/src/regex.rs
new file mode 100644
index 0000000..47e1c58
--- /dev/null
+++ b/src/regex.rs
@@ -0,0 +1,771 @@
+#[cfg(feature = "std")]
+use dense::{self, DenseDFA};
+use dfa::DFA;
+#[cfg(feature = "std")]
+use error::Result;
+#[cfg(feature = "std")]
+use sparse::SparseDFA;
+#[cfg(feature = "std")]
+use state_id::StateID;
+
+/// A regular expression that uses deterministic finite automata for fast
+/// searching.
+///
+/// A regular expression is comprised of two DFAs, a "forward" DFA and a
+/// "reverse" DFA. The forward DFA is responsible for detecting the end of a
+/// match while the reverse DFA is responsible for detecting the start of a
+/// match. Thus, in order to find the bounds of any given match, a forward
+/// search must first be run followed by a reverse search. A match found by
+/// the forward DFA guarantees that the reverse DFA will also find a match.
+///
+/// The type of the DFA used by a `Regex` corresponds to the `D` type
+/// parameter, which must satisfy the [`DFA`](trait.DFA.html) trait. Typically,
+/// `D` is either a [`DenseDFA`](enum.DenseDFA.html) or a
+/// [`SparseDFA`](enum.SparseDFA.html), where dense DFAs use more memory but
+/// search faster, while sparse DFAs use less memory but search more slowly.
+///
+/// By default, a regex's DFA type parameter is set to
+/// `DenseDFA<Vec<usize>, usize>`. For most in-memory work loads, this is the
+/// most convenient type that gives the best search performance.
+///
+/// # Sparse DFAs
+///
+/// Since a `Regex` is generic over the `DFA` trait, it can be used with any
+/// kind of DFA. While this crate constructs dense DFAs by default, it is easy
+/// enough to build corresponding sparse DFAs, and then build a regex from
+/// them:
+///
+/// ```
+/// use regex_automata::Regex;
+///
+/// # fn example() -> Result<(), regex_automata::Error> {
+/// // First, build a regex that uses dense DFAs.
+/// let dense_re = Regex::new("foo[0-9]+")?;
+///
+/// // Second, build sparse DFAs from the forward and reverse dense DFAs.
+/// let fwd = dense_re.forward().to_sparse()?;
+/// let rev = dense_re.reverse().to_sparse()?;
+///
+/// // Third, build a new regex from the constituent sparse DFAs.
+/// let sparse_re = Regex::from_dfas(fwd, rev);
+///
+/// // A regex that uses sparse DFAs can be used just like with dense DFAs.
+/// assert_eq!(true, sparse_re.is_match(b"foo123"));
+/// # Ok(()) }; example().unwrap()
+/// ```
+#[cfg(feature = "std")]
+#[derive(Clone, Debug)]
+pub struct Regex<D: DFA = DenseDFA<Vec<usize>, usize>> {
+ /// The DFA used to find the end of a match.
+ forward: D,
+ /// The DFA used to find the start of a match.
+ reverse: D,
+}
+
+/// A regular expression that uses deterministic finite automata for fast
+/// searching.
+///
+/// A regular expression is comprised of two DFAs, a "forward" DFA and a
+/// "reverse" DFA. The forward DFA is responsible for detecting the end of a
+/// match while the reverse DFA is responsible for detecting the start of a
+/// match. Thus, in order to find the bounds of any given match, a forward
+/// search must first be run followed by a reverse search. A match found by
+/// the forward DFA guarantees that the reverse DFA will also find a match.
+///
+/// The type of the DFA used by a `Regex` corresponds to the `D` type
+/// parameter, which must satisfy the [`DFA`](trait.DFA.html) trait. Typically,
+/// `D` is either a [`DenseDFA`](enum.DenseDFA.html) or a
+/// [`SparseDFA`](enum.SparseDFA.html), where dense DFAs use more memory but
+/// search faster, while sparse DFAs use less memory but search more slowly.
+///
+/// When using this crate without the standard library, the `Regex` type has
+/// no default type parameter.
+///
+/// # Sparse DFAs
+///
+/// Since a `Regex` is generic over the `DFA` trait, it can be used with any
+/// kind of DFA. While this crate constructs dense DFAs by default, it is easy
+/// enough to build corresponding sparse DFAs, and then build a regex from
+/// them:
+///
+/// ```
+/// use regex_automata::Regex;
+///
+/// # fn example() -> Result<(), regex_automata::Error> {
+/// // First, build a regex that uses dense DFAs.
+/// let dense_re = Regex::new("foo[0-9]+")?;
+///
+/// // Second, build sparse DFAs from the forward and reverse dense DFAs.
+/// let fwd = dense_re.forward().to_sparse()?;
+/// let rev = dense_re.reverse().to_sparse()?;
+///
+/// // Third, build a new regex from the constituent sparse DFAs.
+/// let sparse_re = Regex::from_dfas(fwd, rev);
+///
+/// // A regex that uses sparse DFAs can be used just like with dense DFAs.
+/// assert_eq!(true, sparse_re.is_match(b"foo123"));
+/// # Ok(()) }; example().unwrap()
+/// ```
+#[cfg(not(feature = "std"))]
+#[derive(Clone, Debug)]
+pub struct Regex<D> {
+ /// The DFA used to find the end of a match.
+ forward: D,
+ /// The DFA used to find the start of a match.
+ reverse: D,
+}
+
+#[cfg(feature = "std")]
+impl Regex {
+ /// Parse the given regular expression using a default configuration and
+ /// return the corresponding regex.
+ ///
+ /// The default configuration uses `usize` for state IDs, premultiplies
+ /// them and reduces the alphabet size by splitting bytes into equivalence
+ /// classes. The underlying DFAs are *not* minimized.
+ ///
+ /// If you want a non-default configuration, then use the
+ /// [`RegexBuilder`](struct.RegexBuilder.html)
+ /// to set your own configuration.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::Regex;
+ ///
+ /// # fn example() -> Result<(), regex_automata::Error> {
+ /// let re = Regex::new("foo[0-9]+bar")?;
+ /// assert_eq!(Some((3, 14)), re.find(b"zzzfoo12345barzzz"));
+ /// # Ok(()) }; example().unwrap()
+ /// ```
+ pub fn new(pattern: &str) -> Result<Regex> {
+ // Delegate to the builder with its default configuration.
+ RegexBuilder::new().build(pattern)
+ }
+}
+
+#[cfg(feature = "std")]
+impl Regex<SparseDFA<Vec<u8>, usize>> {
+ /// Parse the given regular expression using a default configuration and
+ /// return the corresponding regex using sparse DFAs.
+ ///
+ /// The default configuration uses `usize` for state IDs, reduces the
+ /// alphabet size by splitting bytes into equivalence classes. The
+ /// underlying DFAs are *not* minimized.
+ ///
+ /// If you want a non-default configuration, then use the
+ /// [`RegexBuilder`](struct.RegexBuilder.html)
+ /// to set your own configuration.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::Regex;
+ ///
+ /// # fn example() -> Result<(), regex_automata::Error> {
+ /// let re = Regex::new_sparse("foo[0-9]+bar")?;
+ /// assert_eq!(Some((3, 14)), re.find(b"zzzfoo12345barzzz"));
+ /// # Ok(()) }; example().unwrap()
+ /// ```
+ pub fn new_sparse(
+ pattern: &str,
+ ) -> Result<Regex<SparseDFA<Vec<u8>, usize>>> {
+ // Delegate to the builder with its default configuration.
+ RegexBuilder::new().build_sparse(pattern)
+ }
+}
+
+impl<D: DFA> Regex<D> {
+ /// Returns true if and only if the given bytes match.
+ ///
+ /// This routine may short circuit if it knows that scanning future input
+ /// will never lead to a different result. In particular, if the underlying
+ /// DFA enters a match state or a dead state, then this routine will return
+ /// `true` or `false`, respectively, without inspecting any future input.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::Regex;
+ ///
+ /// # fn example() -> Result<(), regex_automata::Error> {
+ /// let re = Regex::new("foo[0-9]+bar")?;
+ /// assert_eq!(true, re.is_match(b"foo12345bar"));
+ /// assert_eq!(false, re.is_match(b"foobar"));
+ /// # Ok(()) }; example().unwrap()
+ /// ```
+ pub fn is_match(&self, input: &[u8]) -> bool {
+ self.is_match_at(input, 0)
+ }
+
+ /// Returns the first position at which a match is found.
+ ///
+ /// This routine stops scanning input in precisely the same circumstances
+ /// as `is_match`. The key difference is that this routine returns the
+ /// position at which it stopped scanning input if and only if a match
+ /// was found. If no match is found, then `None` is returned.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::Regex;
+ ///
+ /// # fn example() -> Result<(), regex_automata::Error> {
+ /// let re = Regex::new("foo[0-9]+")?;
+ /// assert_eq!(Some(4), re.shortest_match(b"foo12345"));
+ ///
+ /// // Normally, the end of the leftmost first match here would be 3,
+ /// // but the shortest match semantics detect a match earlier.
+ /// let re = Regex::new("abc|a")?;
+ /// assert_eq!(Some(1), re.shortest_match(b"abc"));
+ /// # Ok(()) }; example().unwrap()
+ /// ```
+ pub fn shortest_match(&self, input: &[u8]) -> Option<usize> {
+ self.shortest_match_at(input, 0)
+ }
+
+ /// Returns the start and end offset of the leftmost first match. If no
+ /// match exists, then `None` is returned.
+ ///
+ /// The "leftmost first" match corresponds to the match with the smallest
+ /// starting offset, but where the end offset is determined by preferring
+ /// earlier branches in the original regular expression. For example,
+ /// `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` will
+ /// match `Samwise` in `Samwise`.
+ ///
+ /// Generally speaking, the "leftmost first" match is how most backtracking
+ /// regular expressions tend to work. This is in contrast to POSIX-style
+ /// regular expressions that yield "leftmost longest" matches. Namely,
+ /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using
+ /// leftmost longest semantics.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::Regex;
+ ///
+ /// # fn example() -> Result<(), regex_automata::Error> {
+ /// let re = Regex::new("foo[0-9]+")?;
+ /// assert_eq!(Some((3, 11)), re.find(b"zzzfoo12345zzz"));
+ ///
+ /// // Even though a match is found after reading the first byte (`a`),
+ /// // the leftmost first match semantics demand that we find the earliest
+ /// // match that prefers earlier parts of the pattern over latter parts.
+ /// let re = Regex::new("abc|a")?;
+ /// assert_eq!(Some((0, 3)), re.find(b"abc"));
+ /// # Ok(()) }; example().unwrap()
+ /// ```
+ pub fn find(&self, input: &[u8]) -> Option<(usize, usize)> {
+ self.find_at(input, 0)
+ }
+
+ /// Returns the same as `is_match`, but starts the search at the given
+ /// offset.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, if the DFA is anchored, then
+ /// a match can only occur when `start == 0`.
+ pub fn is_match_at(&self, input: &[u8], start: usize) -> bool {
+ self.forward().is_match_at(input, start)
+ }
+
+ /// Returns the same as `shortest_match`, but starts the search at the
+ /// given offset.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, if the DFA is anchored, then
+ /// a match can only occur when `start == 0`.
+ pub fn shortest_match_at(
+ &self,
+ input: &[u8],
+ start: usize,
+ ) -> Option<usize> {
+ self.forward().shortest_match_at(input, start)
+ }
+
+ /// Returns the same as `find`, but starts the search at the given
+ /// offset.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, if the DFA is anchored, then
+ /// a match can only occur when `start == 0`.
+ pub fn find_at(
+ &self,
+ input: &[u8],
+ start: usize,
+ ) -> Option<(usize, usize)> {
+ // First find the end of the match with the forward DFA, then run the
+ // reverse DFA over the candidate span to recover the match start.
+ let end = match self.forward().find_at(input, start) {
+ None => return None,
+ Some(end) => end,
+ };
+ let start = self
+ .reverse()
+ .rfind(&input[start..end])
+ .map(|i| start + i)
+ .expect("reverse search must match if forward search does");
+ Some((start, end))
+ }
+
+ /// Returns an iterator over all non-overlapping leftmost first matches
+ /// in the given bytes. If no match exists, then the iterator yields no
+ /// elements.
+ ///
+ /// Note that if the regex can match the empty string, then it is
+ /// possible for the iterator to yield a zero-width match at a location
+ /// that is not a valid UTF-8 boundary (for example, between the code units
+ /// of a UTF-8 encoded codepoint). This can happen regardless of whether
+ /// [`allow_invalid_utf8`](struct.RegexBuilder.html#method.allow_invalid_utf8)
+ /// was enabled or not.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::Regex;
+ ///
+ /// # fn example() -> Result<(), regex_automata::Error> {
+ /// let re = Regex::new("foo[0-9]+")?;
+ /// let text = b"foo1 foo12 foo123";
+ /// let matches: Vec<(usize, usize)> = re.find_iter(text).collect();
+ /// assert_eq!(matches, vec![(0, 4), (5, 10), (11, 17)]);
+ /// # Ok(()) }; example().unwrap()
+ /// ```
+ pub fn find_iter<'r, 't>(&'r self, input: &'t [u8]) -> Matches<'r, 't, D> {
+ Matches::new(self, input)
+ }
+
+ /// Build a new regex from its constituent forward and reverse DFAs.
+ ///
+ /// This is useful when deserializing a regex from some arbitrary
+ /// memory region. This is also useful for building regexes from other
+ /// types of DFAs.
+ ///
+ /// # Example
+ ///
+ /// This example is a bit contrived. The usual use of these methods
+ /// would involve serializing `initial_re` somewhere and then deserializing
+ /// it later to build a regex.
+ ///
+ /// ```
+ /// use regex_automata::Regex;
+ ///
+ /// # fn example() -> Result<(), regex_automata::Error> {
+ /// let initial_re = Regex::new("foo[0-9]+")?;
+ /// assert_eq!(true, initial_re.is_match(b"foo123"));
+ ///
+ /// let (fwd, rev) = (initial_re.forward(), initial_re.reverse());
+ /// let re = Regex::from_dfas(fwd, rev);
+ /// assert_eq!(true, re.is_match(b"foo123"));
+ /// # Ok(()) }; example().unwrap()
+ /// ```
+ ///
+ /// This example shows how you might build smaller DFAs, and then use those
+ /// smaller DFAs to build a new regex.
+ ///
+ /// ```
+ /// use regex_automata::Regex;
+ ///
+ /// # fn example() -> Result<(), regex_automata::Error> {
+ /// let initial_re = Regex::new("foo[0-9]+")?;
+ /// assert_eq!(true, initial_re.is_match(b"foo123"));
+ ///
+ /// let fwd = initial_re.forward().to_u16()?;
+ /// let rev = initial_re.reverse().to_u16()?;
+ /// let re = Regex::from_dfas(fwd, rev);
+ /// assert_eq!(true, re.is_match(b"foo123"));
+ /// # Ok(()) }; example().unwrap()
+ /// ```
+ ///
+ /// This example shows how to build a `Regex` that uses sparse DFAs instead
+ /// of dense DFAs:
+ ///
+ /// ```
+ /// use regex_automata::Regex;
+ ///
+ /// # fn example() -> Result<(), regex_automata::Error> {
+ /// let initial_re = Regex::new("foo[0-9]+")?;
+ /// assert_eq!(true, initial_re.is_match(b"foo123"));
+ ///
+ /// let fwd = initial_re.forward().to_sparse()?;
+ /// let rev = initial_re.reverse().to_sparse()?;
+ /// let re = Regex::from_dfas(fwd, rev);
+ /// assert_eq!(true, re.is_match(b"foo123"));
+ /// # Ok(()) }; example().unwrap()
+ /// ```
+ pub fn from_dfas(forward: D, reverse: D) -> Regex<D> {
+ Regex { forward, reverse }
+ }
+
+ /// Return the underlying DFA responsible for forward matching.
+ pub fn forward(&self) -> &D {
+ &self.forward
+ }
+
+ /// Return the underlying DFA responsible for reverse matching.
+ pub fn reverse(&self) -> &D {
+ &self.reverse
+ }
+}
+
+/// An iterator over all non-overlapping matches for a particular search.
+///
+/// The iterator yields a `(usize, usize)` value until no more matches could be
+/// found. The first `usize` is the start of the match (inclusive) while the
+/// second `usize` is the end of the match (exclusive).
+///
+/// `D` is the type of the underlying DFA used by the regex. The lifetime
+/// variables are as follows:
+///
+/// * `'r` is the lifetime of the regular expression value itself.
+/// * `'t` is the lifetime of the text being searched.
+#[derive(Clone, Debug)]
+pub struct Matches<'r, 't, D: DFA + 'r> {
+ /// The regex used to execute each search.
+ re: &'r Regex<D>,
+ /// The text being searched.
+ text: &'t [u8],
+ /// The position at which the next search starts.
+ last_end: usize,
+ /// The end position of the most recently yielded match, if any.
+ last_match: Option<usize>,
+}
+
+impl<'r, 't, D: DFA> Matches<'r, 't, D> {
+ /// Create a new iterator of non-overlapping matches of `re` in `text`.
+ fn new(re: &'r Regex<D>, text: &'t [u8]) -> Matches<'r, 't, D> {
+ Matches { re, text, last_end: 0, last_match: None }
+ }
+}
+
+impl<'r, 't, D: DFA> Iterator for Matches<'r, 't, D> {
+ type Item = (usize, usize);
+
+ fn next(&mut self) -> Option<(usize, usize)> {
+ // `last_end` can exceed the text length after an empty match at the
+ // very end of the text; in that case, iteration is finished.
+ if self.last_end > self.text.len() {
+ return None;
+ }
+ let (s, e) = match self.re.find_at(self.text, self.last_end) {
+ None => return None,
+ Some((s, e)) => (s, e),
+ };
+ if s == e {
+ // This is an empty match. To ensure we make progress, start
+ // the next search at the smallest possible starting position
+ // of the next match following this one.
+ self.last_end = e + 1;
+ // Don't accept empty matches immediately following a match.
+ // Just move on to the next match. (The recursion is at most one
+ // level deep: the recursive call searches past `last_match`, so
+ // this branch cannot be taken again before returning.)
+ if Some(e) == self.last_match {
+ return self.next();
+ }
+ } else {
+ self.last_end = e;
+ }
+ self.last_match = Some(e);
+ Some((s, e))
+ }
+}
+
+/// A builder for a regex based on deterministic finite automatons.
+///
+/// This builder permits configuring several aspects of the construction
+/// process such as case insensitivity, Unicode support and various options
+/// that impact the size of the underlying DFAs. In some cases, options (like
+/// performing DFA minimization) can come with a substantial additional cost.
+///
+/// This builder generally constructs two DFAs, where one is responsible for
+/// finding the end of a match and the other is responsible for finding the
+/// start of a match. If you only need to detect whether something matched,
+/// or only the end of a match, then you should use a
+/// [`dense::Builder`](dense/struct.Builder.html)
+/// to construct a single DFA, which is cheaper than building two DFAs.
+#[cfg(feature = "std")]
+#[derive(Clone, Debug)]
+pub struct RegexBuilder {
+ /// The shared configuration used to build both underlying DFAs.
+ dfa: dense::Builder,
+}
+
+#[cfg(feature = "std")]
+impl RegexBuilder {
+ /// Create a new regex builder with the default configuration.
+ pub fn new() -> RegexBuilder {
+ RegexBuilder { dfa: dense::Builder::new() }
+ }
+
+ /// Build a regex from the given pattern.
+ ///
+ /// If there was a problem parsing or compiling the pattern, then an error
+ /// is returned.
+ pub fn build(&self, pattern: &str) -> Result<Regex> {
+ self.build_with_size::<usize>(pattern)
+ }
+
+ /// Build a regex from the given pattern using sparse DFAs.
+ ///
+ /// If there was a problem parsing or compiling the pattern, then an error
+ /// is returned.
+ pub fn build_sparse(
+ &self,
+ pattern: &str,
+ ) -> Result<Regex<SparseDFA<Vec<u8>, usize>>> {
+ self.build_with_size_sparse::<usize>(pattern)
+ }
+
+ /// Build a regex from the given pattern using a specific representation
+ /// for the underlying DFA state IDs.
+ ///
+ /// If there was a problem parsing or compiling the pattern, then an error
+ /// is returned.
+ ///
+ /// The representation of state IDs is determined by the `S` type
+ /// parameter. In general, `S` is usually one of `u8`, `u16`, `u32`, `u64`
+ /// or `usize`, where `usize` is the default used for `build`. The purpose
+ /// of specifying a representation for state IDs is to reduce the memory
+ /// footprint of the underlying DFAs.
+ ///
+ /// When using this routine, the chosen state ID representation will be
+ /// used throughout determinization and minimization, if minimization was
+ /// requested. Even if the minimized DFAs can fit into the chosen state ID
+ /// representation but the initial determinized DFA cannot, then this will
+ /// still return an error. To get a minimized DFA with a smaller state ID
+ /// representation, first build it with a bigger state ID representation,
+ /// and then shrink the sizes of the DFAs using one of its conversion
+ /// routines, such as [`DenseDFA::to_u16`](enum.DenseDFA.html#method.to_u16).
+ /// Finally, reconstitute the regex via
+ /// [`Regex::from_dfas`](struct.Regex.html#method.from_dfas).
+ pub fn build_with_size<S: StateID>(
+ &self,
+ pattern: &str,
+ ) -> Result<Regex<DenseDFA<Vec<S>, S>>> {
+ let forward = self.dfa.build_with_size(pattern)?;
+ // The reverse DFA is built from the same pattern, but anchored,
+ // reversed and with longest match semantics, so that it can recover
+ // the start position for an end found by the forward DFA.
+ let reverse = self
+ .dfa
+ .clone()
+ .anchored(true)
+ .reverse(true)
+ .longest_match(true)
+ .build_with_size(pattern)?;
+ Ok(Regex::from_dfas(forward, reverse))
+ }
+
+ /// Build a regex from the given pattern using a specific representation
+ /// for the underlying DFA state IDs using sparse DFAs.
+ pub fn build_with_size_sparse<S: StateID>(
+ &self,
+ pattern: &str,
+ ) -> Result<Regex<SparseDFA<Vec<u8>, S>>> {
+ // Build dense DFAs first, then convert both to their sparse form.
+ let re = self.build_with_size(pattern)?;
+ let fwd = re.forward().to_sparse()?;
+ let rev = re.reverse().to_sparse()?;
+ Ok(Regex::from_dfas(fwd, rev))
+ }
+
+ /// Set whether matching must be anchored at the beginning of the input.
+ ///
+ /// When enabled, a match must begin at the start of the input. When
+ /// disabled, the regex will act as if the pattern started with a `.*?`,
+ /// which enables a match to appear anywhere.
+ ///
+ /// By default this is disabled.
+ pub fn anchored(&mut self, yes: bool) -> &mut RegexBuilder {
+ self.dfa.anchored(yes);
+ self
+ }
+
+ /// Enable or disable the case insensitive flag by default.
+ ///
+ /// By default this is disabled. It may alternatively be selectively
+ /// enabled in the regular expression itself via the `i` flag.
+ pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder {
+ self.dfa.case_insensitive(yes);
+ self
+ }
+
+ /// Enable verbose mode in the regular expression.
+ ///
+ /// When enabled, verbose mode permits insignificant whitespace in many
+ /// places in the regular expression, as well as comments. Comments are
+ /// started using `#` and continue until the end of the line.
+ ///
+ /// By default, this is disabled. It may be selectively enabled in the
+ /// regular expression by using the `x` flag regardless of this setting.
+ pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder {
+ self.dfa.ignore_whitespace(yes);
+ self
+ }
+
+ /// Enable or disable the "dot matches any character" flag by default.
+ ///
+ /// By default this is disabled. It may alternatively be selectively
+ /// enabled in the regular expression itself via the `s` flag.
+ pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut RegexBuilder {
+ self.dfa.dot_matches_new_line(yes);
+ self
+ }
+
+ /// Enable or disable the "swap greed" flag by default.
+ ///
+ /// By default this is disabled. It may alternatively be selectively
+ /// enabled in the regular expression itself via the `U` flag.
+ pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
+ self.dfa.swap_greed(yes);
+ self
+ }
+
+ /// Enable or disable the Unicode flag (`u`) by default.
+ ///
+ /// By default this is **enabled**. It may alternatively be selectively
+ /// disabled in the regular expression itself via the `u` flag.
+ ///
+ /// Note that unless `allow_invalid_utf8` is enabled (it's disabled by
+ /// default), a regular expression will fail to parse if Unicode mode is
+ /// disabled and a sub-expression could possibly match invalid UTF-8.
+ pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
+ self.dfa.unicode(yes);
+ self
+ }
+
+ /// When enabled, the builder will permit the construction of a regular
+ /// expression that may match invalid UTF-8.
+ ///
+ /// When disabled (the default), the builder is guaranteed to produce a
+ /// regex that will only ever match valid UTF-8 (otherwise, the builder
+ /// will return an error).
+ pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut RegexBuilder {
+ self.dfa.allow_invalid_utf8(yes);
+ self
+ }
+
+ /// Set the nesting limit used for the regular expression parser.
+ ///
+ /// The nesting limit controls how deep the abstract syntax tree is allowed
+ /// to be. If the AST exceeds the given limit (e.g., with too many nested
+ /// groups), then an error is returned by the parser.
+ ///
+ /// The purpose of this limit is to act as a heuristic to prevent stack
+ /// overflow when building a finite automaton from a regular expression's
+ /// abstract syntax tree. In particular, construction currently uses
+ /// recursion. In the future, the implementation may stop using recursion
+ /// and this option will no longer be necessary.
+ ///
+ /// This limit is not checked until the entire AST is parsed. Therefore,
+ /// if callers want to put a limit on the amount of heap space used, then
+ /// they should impose a limit on the length, in bytes, of the concrete
+ /// pattern string. In particular, this is viable since the parser will
+ /// limit itself to heap space proportional to the length of the pattern
+ /// string.
+ ///
+ /// Note that a nest limit of `0` will return a nest limit error for most
+ /// patterns but not all. For example, a nest limit of `0` permits `a` but
+ /// not `ab`, since `ab` requires a concatenation AST item, which results
+ /// in a nest depth of `1`. In general, a nest limit is not something that
+ /// manifests in an obvious way in the concrete syntax, therefore, it
+ /// should not be used in a granular way.
+ pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
+ self.dfa.nest_limit(limit);
+ self
+ }
+
+ /// Minimize the underlying DFAs.
+ ///
+ /// When enabled, the DFAs powering the resulting regex will be minimized
+ /// such that it is as small as possible.
+ ///
+ /// Whether one enables minimization or not depends on the types of costs
+ /// you're willing to pay and how much you care about its benefits. In
+ /// particular, minimization has worst case `O(n*k*logn)` time and `O(k*n)`
+ /// space, where `n` is the number of DFA states and `k` is the alphabet
+ /// size. In practice, minimization can be quite costly in terms of both
+ /// space and time, so it should only be done if you're willing to wait
+ /// longer to produce a DFA. In general, you might want a minimal DFA in
+ /// the following circumstances:
+ ///
+ /// 1. You would like to optimize for the size of the automaton. This can
+ /// manifest in one of two ways. Firstly, if you're converting the
+ /// DFA into Rust code (or a table embedded in the code), then a minimal
+ /// DFA will translate into a corresponding reduction in code size, and
+ /// thus, also the final compiled binary size. Secondly, if you are
+ /// building many DFAs and putting them on the heap, you'll be able to
+ /// fit more if they are smaller. Note though that building a minimal
+ /// DFA itself requires additional space; you only realize the space
+ /// savings once the minimal DFA is constructed (at which point, the
+ /// space used for minimization is freed).
+ /// 2. You've observed that a smaller DFA results in faster match
+ /// performance. Naively, this isn't guaranteed since there is no
+ /// inherent difference between matching with a bigger-than-minimal
+ /// DFA and a minimal DFA. However, a smaller DFA may make use of your
+ /// CPU's cache more efficiently.
+ /// 3. You are trying to establish an equivalence between regular
+ /// languages. The standard method for this is to build a minimal DFA
+ /// for each language and then compare them. If the DFAs are equivalent
+ /// (up to state renaming), then the languages are equivalent.
+ ///
+ /// This option is disabled by default.
+ pub fn minimize(&mut self, yes: bool) -> &mut RegexBuilder {
+ self.dfa.minimize(yes);
+ self
+ }
+
+ /// Premultiply state identifiers in the underlying DFA transition tables.
+ ///
+ /// When enabled, state identifiers are premultiplied to point to their
+ /// corresponding row in the DFA's transition table. That is, given the
+ /// `i`th state, its corresponding premultiplied identifier is `i * k`
+ /// where `k` is the alphabet size of the DFA. (The alphabet size is at
+ /// most 256, but is in practice smaller if byte classes is enabled.)
+ ///
+ /// When state identifiers are not premultiplied, then the identifier of
+ /// the `i`th state is `i`.
+ ///
+ /// The advantage of premultiplying state identifiers is that is saves
+ /// a multiplication instruction per byte when searching with the DFA.
+ /// This has been observed to lead to a 20% performance benefit in
+ /// micro-benchmarks.
+ ///
+ /// The primary disadvantage of premultiplying state identifiers is
+ /// that they require a larger integer size to represent. For example,
+ /// if your DFA has 200 states, then its premultiplied form requires
+ /// 16 bits to represent every possible state identifier, where as its
+ /// non-premultiplied form only requires 8 bits.
+ ///
+ /// This option is enabled by default.
+ pub fn premultiply(&mut self, yes: bool) -> &mut RegexBuilder {
+ self.dfa.premultiply(yes);
+ self
+ }
+
+ /// Shrink the size of the underlying DFA alphabet by mapping bytes to
+ /// their equivalence classes.
+ ///
+ /// When enabled, each DFA will use a map from all possible bytes to their
+ /// corresponding equivalence class. Each equivalence class represents a
+ /// set of bytes that does not discriminate between a match and a non-match
+ /// in the DFA. For example, the pattern `[ab]+` has at least two
+ /// equivalence classes: a set containing `a` and `b` and a set containing
+ /// every byte except for `a` and `b`. `a` and `b` are in the same
+ /// equivalence classes because they never discriminate between a match
+ /// and a non-match.
+ ///
+ /// The advantage of this map is that the size of the transition table can
+ /// be reduced drastically from `#states * 256 * sizeof(id)` to
+ /// `#states * k * sizeof(id)` where `k` is the number of equivalence
+ /// classes. As a result, total space usage can decrease substantially.
+ /// Moreover, since a smaller alphabet is used, compilation becomes faster
+ /// as well.
+ ///
+ /// The disadvantage of this map is that every byte searched must be
+ /// passed through this map before it can be used to determine the next
+ /// transition. This has a small match time performance cost.
+ ///
+ /// This option is enabled by default.
+ pub fn byte_classes(&mut self, yes: bool) -> &mut RegexBuilder {
+ self.dfa.byte_classes(yes);
+ self
+ }
+
+ /// Apply best effort heuristics to shrink the NFA at the expense of more
+ /// time/memory.
+ ///
+ /// This may be exposed in the future, but for now is exported for use in
+ /// the `regex-automata-debug` tool.
+ #[doc(hidden)]
+ pub fn shrink(&mut self, yes: bool) -> &mut RegexBuilder {
+ self.dfa.shrink(yes);
+ self
+ }
+}
+
+#[cfg(feature = "std")]
+impl Default for RegexBuilder {
+ /// Equivalent to `RegexBuilder::new`.
+ fn default() -> RegexBuilder {
+ RegexBuilder::new()
+ }
+}
diff --git a/src/sparse.rs b/src/sparse.rs
new file mode 100644
index 0000000..d18024b
--- /dev/null
+++ b/src/sparse.rs
@@ -0,0 +1,1256 @@
+#[cfg(feature = "std")]
+use core::fmt;
+#[cfg(feature = "std")]
+use core::iter;
+use core::marker::PhantomData;
+use core::mem::size_of;
+#[cfg(feature = "std")]
+use std::collections::HashMap;
+
+#[cfg(feature = "std")]
+use byteorder::{BigEndian, LittleEndian};
+use byteorder::{ByteOrder, NativeEndian};
+
+use classes::ByteClasses;
+use dense;
+use dfa::DFA;
+#[cfg(feature = "std")]
+use error::{Error, Result};
+#[cfg(feature = "std")]
+use state_id::{dead_id, usize_to_state_id, write_state_id_bytes, StateID};
+#[cfg(not(feature = "std"))]
+use state_id::{dead_id, StateID};
+
+/// A sparse table-based deterministic finite automaton (DFA).
+///
+/// In contrast to a [dense DFA](enum.DenseDFA.html), a sparse DFA uses a
+/// more space efficient representation for its transition table. Consequently,
+/// sparse DFAs can use much less memory than dense DFAs, but this comes at a
+/// price. In particular, reading the more space efficient transitions takes
+/// more work, and consequently, searching using a sparse DFA is typically
+/// slower than a dense DFA.
+///
+/// A sparse DFA can be built using the default configuration via the
+/// [`SparseDFA::new`](enum.SparseDFA.html#method.new) constructor. Otherwise,
+/// one can configure various aspects of a dense DFA via
+/// [`dense::Builder`](dense/struct.Builder.html), and then convert a dense
+/// DFA to a sparse DFA using
+/// [`DenseDFA::to_sparse`](enum.DenseDFA.html#method.to_sparse).
+///
+/// In general, a sparse DFA supports all the same operations as a dense DFA.
+///
+/// Making the choice between a dense and sparse DFA depends on your specific
+/// work load. If you can sacrifice a bit of search time performance, then a
+/// sparse DFA might be the best choice. In particular, while sparse DFAs are
+/// probably always slower than dense DFAs, you may find that they are easily
+/// fast enough for your purposes!
+///
+/// # State size
+///
+/// A `SparseDFA` has two type parameters, `T` and `S`. `T` corresponds to
+/// the type of the DFA's transition table while `S` corresponds to the
+/// representation used for the DFA's state identifiers as described by the
+/// [`StateID`](trait.StateID.html) trait. This type parameter is typically
+/// `usize`, but other valid choices provided by this crate include `u8`,
+/// `u16`, `u32` and `u64`. The primary reason for choosing a different state
+/// identifier representation than the default is to reduce the amount of
+/// memory used by a DFA. Note though, that if the chosen representation cannot
+/// accommodate the size of your DFA, then building the DFA will fail and
+/// return an error.
+///
+/// While the reduction in heap memory used by a DFA is one reason for choosing
+/// a smaller state identifier representation, another possible reason is for
+/// decreasing the serialization size of a DFA, as returned by
+/// [`to_bytes_little_endian`](enum.SparseDFA.html#method.to_bytes_little_endian),
+/// [`to_bytes_big_endian`](enum.SparseDFA.html#method.to_bytes_big_endian)
+/// or
+/// [`to_bytes_native_endian`](enum.SparseDFA.html#method.to_bytes_native_endian).
+///
+/// The type of the transition table is typically either `Vec<u8>` or `&[u8]`,
+/// depending on where the transition table is stored. Note that this is
+/// different than a dense DFA, whose transition table is typically
+/// `Vec<S>` or `&[S]`. The reason for this is that a sparse DFA always reads
+/// its transition table from raw bytes because the table is compactly packed.
+///
+/// # Variants
+///
+/// This DFA is defined as a non-exhaustive enumeration of different types of
+/// sparse DFAs. All of the variants use the same internal representation
+/// for the transition table, but they vary in how the transition table is
+/// read. A DFA's specific variant depends on the configuration options set via
+/// [`dense::Builder`](dense/struct.Builder.html). The default variant is
+/// `ByteClass`.
+///
+/// # The `DFA` trait
+///
+/// This type implements the [`DFA`](trait.DFA.html) trait, which means it
+/// can be used for searching. For example:
+///
+/// ```
+/// use regex_automata::{DFA, SparseDFA};
+///
+/// # fn example() -> Result<(), regex_automata::Error> {
+/// let dfa = SparseDFA::new("foo[0-9]+")?;
+/// assert_eq!(Some(8), dfa.find(b"foo12345"));
+/// # Ok(()) }; example().unwrap()
+/// ```
+///
+/// The `DFA` trait also provides an assortment of other lower level methods
+/// for DFAs, such as `start_state` and `next_state`. While these are correctly
+/// implemented, it is an anti-pattern to use them in performance sensitive
+/// code on the `SparseDFA` type directly. Namely, each implementation requires
+/// a branch to determine which type of sparse DFA is being used. Instead,
+/// this branch should be pushed up a layer in the code since walking the
+/// transitions of a DFA is usually a hot path. If you do need to use these
+/// lower level methods in performance critical code, then you should match on
+/// the variants of this DFA and use each variant's implementation of the `DFA`
+/// trait directly.
+#[derive(Clone, Debug)]
+pub enum SparseDFA<T: AsRef<[u8]>, S: StateID = usize> {
+ /// A standard DFA that does not use byte classes.
+ Standard(Standard<T, S>),
+ /// A DFA that shrinks its alphabet to a set of equivalence classes instead
+ /// of using all possible byte values. Any two bytes belong to the same
+ /// equivalence class if and only if they can be used interchangeably
+ /// anywhere in the DFA while never discriminating between a match and a
+ /// non-match.
+ ///
+ /// Unlike dense DFAs, sparse DFAs do not tend to benefit nearly as much
+ /// from using byte classes. In some cases, using byte classes can even
+ /// marginally increase the size of a sparse DFA's transition table. The
+ /// reason for this is that a sparse DFA already compacts each state's
+ /// transitions independently of whether byte classes are used.
+ ByteClass(ByteClass<T, S>),
+ /// Hints that destructuring should not be exhaustive.
+ ///
+ /// This enum may grow additional variants, so this makes sure clients
+ /// don't count on exhaustive matching. (Otherwise, adding a new variant
+ /// could break existing code.)
+ #[doc(hidden)]
+ __Nonexhaustive,
+}
+
+#[cfg(feature = "std")]
+impl SparseDFA<Vec<u8>, usize> {
+ /// Parse the given regular expression using a default configuration and
+ /// return the corresponding sparse DFA.
+ ///
+ /// The default configuration uses `usize` for state IDs and reduces the
+ /// alphabet size by splitting bytes into equivalence classes. The
+ /// resulting DFA is *not* minimized.
+ ///
+ /// If you want a non-default configuration, then use the
+ /// [`dense::Builder`](dense/struct.Builder.html)
+ /// to set your own configuration, and then call
+ /// [`DenseDFA::to_sparse`](enum.DenseDFA.html#method.to_sparse)
+ /// to create a sparse DFA.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{DFA, SparseDFA};
+ ///
+ /// # fn example() -> Result<(), regex_automata::Error> {
+ /// let dfa = SparseDFA::new("foo[0-9]+bar")?;
+ /// assert_eq!(Some(11), dfa.find(b"foo12345bar"));
+ /// # Ok(()) }; example().unwrap()
+ /// ```
+ pub fn new(pattern: &str) -> Result<SparseDFA<Vec<u8>, usize>> {
+ dense::Builder::new()
+ .build(pattern)
+ .and_then(|dense| dense.to_sparse())
+ }
+}
+
+#[cfg(feature = "std")]
+impl<S: StateID> SparseDFA<Vec<u8>, S> {
+ /// Create a new empty sparse DFA that never matches any input.
+ ///
+ /// # Example
+ ///
+ /// In order to build an empty DFA, callers must provide a type hint
+ /// indicating their choice of state identifier representation.
+ ///
+ /// ```
+ /// use regex_automata::{DFA, SparseDFA};
+ ///
+ /// # fn example() -> Result<(), regex_automata::Error> {
+ /// let dfa: SparseDFA<Vec<u8>, usize> = SparseDFA::empty();
+ /// assert_eq!(None, dfa.find(b""));
+ /// assert_eq!(None, dfa.find(b"foo"));
+ /// # Ok(()) }; example().unwrap()
+ /// ```
+ pub fn empty() -> SparseDFA<Vec<u8>, S> {
+ dense::DenseDFA::empty().to_sparse().unwrap()
+ }
+
+ pub(crate) fn from_dense_sized<T: AsRef<[S]>, A: StateID>(
+ dfa: &dense::Repr<T, S>,
+ ) -> Result<SparseDFA<Vec<u8>, A>> {
+ Repr::from_dense_sized(dfa).map(|r| r.into_sparse_dfa())
+ }
+}
+
+impl<T: AsRef<[u8]>, S: StateID> SparseDFA<T, S> {
+ /// Cheaply return a borrowed version of this sparse DFA. Specifically, the
+ /// DFA returned always uses `&[u8]` for its transition table while keeping
+ /// the same state identifier representation.
+ pub fn as_ref<'a>(&'a self) -> SparseDFA<&'a [u8], S> {
+ match *self {
+ SparseDFA::Standard(Standard(ref r)) => {
+ SparseDFA::Standard(Standard(r.as_ref()))
+ }
+ SparseDFA::ByteClass(ByteClass(ref r)) => {
+ SparseDFA::ByteClass(ByteClass(r.as_ref()))
+ }
+ SparseDFA::__Nonexhaustive => unreachable!(),
+ }
+ }
+
+ /// Return an owned version of this sparse DFA. Specifically, the DFA
+ /// returned always uses `Vec<u8>` for its transition table while keeping
+ /// the same state identifier representation.
+ ///
+ /// Effectively, this returns a sparse DFA whose transition table lives
+ /// on the heap.
+ #[cfg(feature = "std")]
+ pub fn to_owned(&self) -> SparseDFA<Vec<u8>, S> {
+ match *self {
+ SparseDFA::Standard(Standard(ref r)) => {
+ SparseDFA::Standard(Standard(r.to_owned()))
+ }
+ SparseDFA::ByteClass(ByteClass(ref r)) => {
+ SparseDFA::ByteClass(ByteClass(r.to_owned()))
+ }
+ SparseDFA::__Nonexhaustive => unreachable!(),
+ }
+ }
+
+ /// Returns the memory usage, in bytes, of this DFA.
+ ///
+ /// The memory usage is computed based on the number of bytes used to
+ /// represent this DFA's transition table. This typically corresponds to
+ /// heap memory usage.
+ ///
+ /// This does **not** include the stack size used up by this DFA. To
+ /// compute that, use `std::mem::size_of::<SparseDFA>()`.
+ pub fn memory_usage(&self) -> usize {
+ self.repr().memory_usage()
+ }
+
+ fn repr(&self) -> &Repr<T, S> {
+ match *self {
+ SparseDFA::Standard(ref r) => &r.0,
+ SparseDFA::ByteClass(ref r) => &r.0,
+ SparseDFA::__Nonexhaustive => unreachable!(),
+ }
+ }
+}
+
+/// Routines for converting a sparse DFA to other representations, such as
+/// smaller state identifiers or raw bytes suitable for persistent storage.
+#[cfg(feature = "std")]
+impl<T: AsRef<[u8]>, S: StateID> SparseDFA<T, S> {
+ /// Create a new sparse DFA whose match semantics are equivalent to
+ /// this DFA, but attempt to use `u8` for the representation of state
+ /// identifiers. If `u8` is insufficient to represent all state identifiers
+ /// in this DFA, then this returns an error.
+ ///
+ /// This is a convenience routine for `to_sized::<u8>()`.
+ pub fn to_u8(&self) -> Result<SparseDFA<Vec<u8>, u8>> {
+ self.to_sized()
+ }
+
+ /// Create a new sparse DFA whose match semantics are equivalent to
+ /// this DFA, but attempt to use `u16` for the representation of state
+ /// identifiers. If `u16` is insufficient to represent all state
+ /// identifiers in this DFA, then this returns an error.
+ ///
+ /// This is a convenience routine for `to_sized::<u16>()`.
+ pub fn to_u16(&self) -> Result<SparseDFA<Vec<u8>, u16>> {
+ self.to_sized()
+ }
+
+ /// Create a new sparse DFA whose match semantics are equivalent to
+ /// this DFA, but attempt to use `u32` for the representation of state
+ /// identifiers. If `u32` is insufficient to represent all state
+ /// identifiers in this DFA, then this returns an error.
+ ///
+ /// This is a convenience routine for `to_sized::<u32>()`.
+ #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
+ pub fn to_u32(&self) -> Result<SparseDFA<Vec<u8>, u32>> {
+ self.to_sized()
+ }
+
+ /// Create a new sparse DFA whose match semantics are equivalent to
+ /// this DFA, but attempt to use `u64` for the representation of state
+ /// identifiers. If `u64` is insufficient to represent all state
+ /// identifiers in this DFA, then this returns an error.
+ ///
+ /// This is a convenience routine for `to_sized::<u64>()`.
+ #[cfg(target_pointer_width = "64")]
+ pub fn to_u64(&self) -> Result<SparseDFA<Vec<u8>, u64>> {
+ self.to_sized()
+ }
+
+ /// Create a new sparse DFA whose match semantics are equivalent to
+ /// this DFA, but attempt to use `A` for the representation of state
+ /// identifiers. If `A` is insufficient to represent all state identifiers
+ /// in this DFA, then this returns an error.
+ ///
+ /// An alternative way to construct such a DFA is to use
+ /// [`DenseDFA::to_sparse_sized`](enum.DenseDFA.html#method.to_sparse_sized).
+ /// In general, picking the appropriate size upon initial construction of
+ /// a sparse DFA is preferred, since it will do the conversion in one
+ /// step instead of two.
+ pub fn to_sized<A: StateID>(&self) -> Result<SparseDFA<Vec<u8>, A>> {
+ self.repr().to_sized().map(|r| r.into_sparse_dfa())
+ }
+
+ /// Serialize a sparse DFA to raw bytes in little endian format.
+ ///
+ /// If the state identifier representation of this DFA has a size different
+ /// than 1, 2, 4 or 8 bytes, then this returns an error. All
+ /// implementations of `StateID` provided by this crate satisfy this
+ /// requirement.
+ pub fn to_bytes_little_endian(&self) -> Result<Vec<u8>> {
+ self.repr().to_bytes::<LittleEndian>()
+ }
+
+ /// Serialize a sparse DFA to raw bytes in big endian format.
+ ///
+ /// If the state identifier representation of this DFA has a size different
+ /// than 1, 2, 4 or 8 bytes, then this returns an error. All
+ /// implementations of `StateID` provided by this crate satisfy this
+ /// requirement.
+ pub fn to_bytes_big_endian(&self) -> Result<Vec<u8>> {
+ self.repr().to_bytes::<BigEndian>()
+ }
+
+ /// Serialize a sparse DFA to raw bytes in native endian format.
+ /// Generally, it is better to pick an explicit endianness using either
+ /// `to_bytes_little_endian` or `to_bytes_big_endian`. This routine is
+ /// useful in tests where the DFA is serialized and deserialized on the
+ /// same platform.
+ ///
+ /// If the state identifier representation of this DFA has a size different
+ /// than 1, 2, 4 or 8 bytes, then this returns an error. All
+ /// implementations of `StateID` provided by this crate satisfy this
+ /// requirement.
+ pub fn to_bytes_native_endian(&self) -> Result<Vec<u8>> {
+ self.repr().to_bytes::<NativeEndian>()
+ }
+}
+
+impl<'a, S: StateID> SparseDFA<&'a [u8], S> {
+ /// Deserialize a sparse DFA with a specific state identifier
+ /// representation.
+ ///
+ /// Deserializing a DFA using this routine will never allocate heap memory.
+ /// This is also guaranteed to be a constant time operation that does not
+ /// vary with the size of the DFA.
+ ///
+ /// The bytes given should be generated by the serialization of a DFA with
+ /// either the
+ /// [`to_bytes_little_endian`](enum.SparseDFA.html#method.to_bytes_little_endian)
+ /// method or the
+ /// [`to_bytes_big_endian`](enum.SparseDFA.html#method.to_bytes_big_endian)
+ /// method, depending on the endianness of the machine you are
+ /// deserializing this DFA from.
+ ///
+ /// If the state identifier representation is `usize`, then deserialization
+ /// is dependent on the pointer size. For this reason, it is best to
+ /// serialize DFAs using a fixed size representation for your state
+ /// identifiers, such as `u8`, `u16`, `u32` or `u64`.
+ ///
+ /// # Panics
+ ///
+ /// The bytes given should be *trusted*. In particular, if the bytes
+ /// are not a valid serialization of a DFA, or if the endianness of the
+ /// serialized bytes is different than the endianness of the machine that
+ /// is deserializing the DFA, then this routine will panic. Moreover, it
+ /// is possible for this deserialization routine to succeed even if the
+ /// given bytes do not represent a valid serialized sparse DFA.
+ ///
+ /// # Safety
+ ///
+ /// This routine is unsafe because it permits callers to provide an
+ /// arbitrary transition table with possibly incorrect transitions. While
+ /// the various serialization routines will never return an incorrect
+ /// transition table, there is no guarantee that the bytes provided here
+ /// are correct. While deserialization does many checks (as documented
+ /// above in the panic conditions), this routine does not check that the
+ /// transition table is correct. Given an incorrect transition table, it is
+ /// possible for the search routines to access out-of-bounds memory because
+ /// of explicit bounds check elision.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to serialize a DFA to raw bytes, deserialize it
+ /// and then use it for searching. Note that we first convert the DFA to
+ /// using `u16` for its state identifier representation before serializing
+ /// it. While this isn't strictly necessary, it's good practice in order to
+ /// decrease the size of the DFA and to avoid platform specific pitfalls
+ /// such as differing pointer sizes.
+ ///
+ /// ```
+ /// use regex_automata::{DFA, DenseDFA, SparseDFA};
+ ///
+ /// # fn example() -> Result<(), regex_automata::Error> {
+ /// let sparse = SparseDFA::new("foo[0-9]+")?;
+ /// let bytes = sparse.to_u16()?.to_bytes_native_endian()?;
+ ///
+ /// let dfa: SparseDFA<&[u8], u16> = unsafe {
+ /// SparseDFA::from_bytes(&bytes)
+ /// };
+ ///
+ /// assert_eq!(Some(8), dfa.find(b"foo12345"));
+ /// # Ok(()) }; example().unwrap()
+ /// ```
+ pub unsafe fn from_bytes(buf: &'a [u8]) -> SparseDFA<&'a [u8], S> {
+ Repr::from_bytes(buf).into_sparse_dfa()
+ }
+}
+
+impl<T: AsRef<[u8]>, S: StateID> DFA for SparseDFA<T, S> {
+ type ID = S;
+
+ #[inline]
+ fn start_state(&self) -> S {
+ self.repr().start_state()
+ }
+
+ #[inline]
+ fn is_match_state(&self, id: S) -> bool {
+ self.repr().is_match_state(id)
+ }
+
+ #[inline]
+ fn is_dead_state(&self, id: S) -> bool {
+ self.repr().is_dead_state(id)
+ }
+
+ #[inline]
+ fn is_match_or_dead_state(&self, id: S) -> bool {
+ self.repr().is_match_or_dead_state(id)
+ }
+
+ #[inline]
+ fn is_anchored(&self) -> bool {
+ self.repr().is_anchored()
+ }
+
+ #[inline]
+ fn next_state(&self, current: S, input: u8) -> S {
+ match *self {
+ SparseDFA::Standard(ref r) => r.next_state(current, input),
+ SparseDFA::ByteClass(ref r) => r.next_state(current, input),
+ SparseDFA::__Nonexhaustive => unreachable!(),
+ }
+ }
+
+ #[inline]
+ unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
+ self.next_state(current, input)
+ }
+
+ // We specialize the following methods because it lets us lift the
+ // case analysis between the different types of sparse DFAs. Instead of
+ // doing the case analysis for every transition, we do it once before
+ // searching. For sparse DFAs, this doesn't seem to benefit performance as
+ // much as it does for the dense DFAs, but it's easy to do so we might as
+ // well do it.
+
+ #[inline]
+ fn is_match_at(&self, bytes: &[u8], start: usize) -> bool {
+ match *self {
+ SparseDFA::Standard(ref r) => r.is_match_at(bytes, start),
+ SparseDFA::ByteClass(ref r) => r.is_match_at(bytes, start),
+ SparseDFA::__Nonexhaustive => unreachable!(),
+ }
+ }
+
+ #[inline]
+ fn shortest_match_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
+ match *self {
+ SparseDFA::Standard(ref r) => r.shortest_match_at(bytes, start),
+ SparseDFA::ByteClass(ref r) => r.shortest_match_at(bytes, start),
+ SparseDFA::__Nonexhaustive => unreachable!(),
+ }
+ }
+
+ #[inline]
+ fn find_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
+ match *self {
+ SparseDFA::Standard(ref r) => r.find_at(bytes, start),
+ SparseDFA::ByteClass(ref r) => r.find_at(bytes, start),
+ SparseDFA::__Nonexhaustive => unreachable!(),
+ }
+ }
+
+ #[inline]
+ fn rfind_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
+ match *self {
+ SparseDFA::Standard(ref r) => r.rfind_at(bytes, start),
+ SparseDFA::ByteClass(ref r) => r.rfind_at(bytes, start),
+ SparseDFA::__Nonexhaustive => unreachable!(),
+ }
+ }
+}
+
+/// A standard sparse DFA that does not use byte classes.
+///
+/// Generally, it isn't necessary to use this type directly, since a
+/// `SparseDFA` can be used for searching directly. One possible reason why
+/// one might want to use this type directly is if you are implementing your
+/// own search routines by walking a DFA's transitions directly. In that case,
+/// you'll want to use this type (or any of the other DFA variant types)
+/// directly, since they implement `next_state` more efficiently.
+#[derive(Clone, Debug)]
+pub struct Standard<T: AsRef<[u8]>, S: StateID = usize>(Repr<T, S>);
+
+impl<T: AsRef<[u8]>, S: StateID> DFA for Standard<T, S> {
+ type ID = S;
+
+ #[inline]
+ fn start_state(&self) -> S {
+ self.0.start_state()
+ }
+
+ #[inline]
+ fn is_match_state(&self, id: S) -> bool {
+ self.0.is_match_state(id)
+ }
+
+ #[inline]
+ fn is_dead_state(&self, id: S) -> bool {
+ self.0.is_dead_state(id)
+ }
+
+ #[inline]
+ fn is_match_or_dead_state(&self, id: S) -> bool {
+ self.0.is_match_or_dead_state(id)
+ }
+
+ #[inline]
+ fn is_anchored(&self) -> bool {
+ self.0.is_anchored()
+ }
+
+ #[inline]
+ fn next_state(&self, current: S, input: u8) -> S {
+ self.0.state(current).next(input)
+ }
+
+ #[inline]
+ unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
+ self.next_state(current, input)
+ }
+}
+
+/// A sparse DFA that shrinks its alphabet.
+///
+/// Alphabet shrinking is achieved by using a set of equivalence classes
+/// instead of using all possible byte values. Any two bytes belong to the same
+/// equivalence class if and only if they can be used interchangeably anywhere
+/// in the DFA while never discriminating between a match and a non-match.
+///
+/// Unlike dense DFAs, sparse DFAs do not tend to benefit nearly as much from
+/// using byte classes. In some cases, using byte classes can even marginally
+/// increase the size of a sparse DFA's transition table. The reason for this
+/// is that a sparse DFA already compacts each state's transitions
+/// independently of whether byte classes are used.
+///
+/// Generally, it isn't necessary to use this type directly, since a
+/// `SparseDFA` can be used for searching directly. One possible reason why
+/// one might want to use this type directly is if you are implementing your
+/// own search routines by walking a DFA's transitions directly. In that case,
+/// you'll want to use this type (or any of the other DFA variant types)
+/// directly, since they implement `next_state` more efficiently.
+#[derive(Clone, Debug)]
+pub struct ByteClass<T: AsRef<[u8]>, S: StateID = usize>(Repr<T, S>);
+
+impl<T: AsRef<[u8]>, S: StateID> DFA for ByteClass<T, S> {
+ type ID = S;
+
+ #[inline]
+ fn start_state(&self) -> S {
+ self.0.start_state()
+ }
+
+ #[inline]
+ fn is_match_state(&self, id: S) -> bool {
+ self.0.is_match_state(id)
+ }
+
+ #[inline]
+ fn is_dead_state(&self, id: S) -> bool {
+ self.0.is_dead_state(id)
+ }
+
+ #[inline]
+ fn is_match_or_dead_state(&self, id: S) -> bool {
+ self.0.is_match_or_dead_state(id)
+ }
+
+ #[inline]
+ fn is_anchored(&self) -> bool {
+ self.0.is_anchored()
+ }
+
+ #[inline]
+ fn next_state(&self, current: S, input: u8) -> S {
+ let input = self.0.byte_classes.get(input);
+ self.0.state(current).next(input)
+ }
+
+ #[inline]
+ unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
+ self.next_state(current, input)
+ }
+}
+
+/// The underlying representation of a sparse DFA. This is shared by all of
+/// the different variants of a sparse DFA.
+#[derive(Clone)]
+#[cfg_attr(not(feature = "std"), derive(Debug))]
+struct Repr<T: AsRef<[u8]>, S: StateID = usize> {
+ anchored: bool,
+ start: S,
+ state_count: usize,
+ max_match: S,
+ byte_classes: ByteClasses,
+ trans: T,
+}
+
+impl<T: AsRef<[u8]>, S: StateID> Repr<T, S> {
+ fn into_sparse_dfa(self) -> SparseDFA<T, S> {
+ if self.byte_classes.is_singleton() {
+ SparseDFA::Standard(Standard(self))
+ } else {
+ SparseDFA::ByteClass(ByteClass(self))
+ }
+ }
+
+ fn as_ref<'a>(&'a self) -> Repr<&'a [u8], S> {
+ Repr {
+ anchored: self.anchored,
+ start: self.start,
+ state_count: self.state_count,
+ max_match: self.max_match,
+ byte_classes: self.byte_classes.clone(),
+ trans: self.trans(),
+ }
+ }
+
+ #[cfg(feature = "std")]
+ fn to_owned(&self) -> Repr<Vec<u8>, S> {
+ Repr {
+ anchored: self.anchored,
+ start: self.start,
+ state_count: self.state_count,
+ max_match: self.max_match,
+ byte_classes: self.byte_classes.clone(),
+ trans: self.trans().to_vec(),
+ }
+ }
+
+ /// Return a convenient representation of the given state.
+ ///
+ /// This is marked as inline because it doesn't seem to get inlined
+ /// otherwise, which leads to a fairly significant performance loss (~25%).
+ #[inline]
+ fn state<'a>(&'a self, id: S) -> State<'a, S> {
+ let mut pos = id.to_usize();
+ let ntrans = NativeEndian::read_u16(&self.trans()[pos..]) as usize;
+ pos += 2;
+ let input_ranges = &self.trans()[pos..pos + (ntrans * 2)];
+ pos += 2 * ntrans;
+ let next = &self.trans()[pos..pos + (ntrans * size_of::<S>())];
+ State { _state_id_repr: PhantomData, ntrans, input_ranges, next }
+ }
+
+ /// Return an iterator over all of the states in this DFA.
+ ///
+ /// The iterator returned yields tuples, where the first element is the
+ /// state ID and the second element is the state itself.
+ #[cfg(feature = "std")]
+ fn states<'a>(&'a self) -> StateIter<'a, T, S> {
+ StateIter { dfa: self, id: dead_id() }
+ }
+
+ fn memory_usage(&self) -> usize {
+ self.trans().len()
+ }
+
+ fn start_state(&self) -> S {
+ self.start
+ }
+
+ fn is_match_state(&self, id: S) -> bool {
+ self.is_match_or_dead_state(id) && !self.is_dead_state(id)
+ }
+
+ fn is_dead_state(&self, id: S) -> bool {
+ id == dead_id()
+ }
+
+ fn is_match_or_dead_state(&self, id: S) -> bool {
+ id <= self.max_match
+ }
+
+ fn is_anchored(&self) -> bool {
+ self.anchored
+ }
+
+ fn trans(&self) -> &[u8] {
+ self.trans.as_ref()
+ }
+
+ /// Create a new sparse DFA whose match semantics are equivalent to this
+ /// DFA, but attempt to use `A` for the representation of state
+ /// identifiers. If `A` is insufficient to represent all state identifiers
+ /// in this DFA, then this returns an error.
+ #[cfg(feature = "std")]
+ fn to_sized<A: StateID>(&self) -> Result<Repr<Vec<u8>, A>> {
+ // To build the new DFA, we proceed much like the initial construction
+ // of the sparse DFA. Namely, since the state ID size is changing,
+ // we don't actually know all of our state IDs until we've allocated
+ // all necessary space. So we do one pass that allocates all of the
+ // storage we need, and then another pass to fill in the transitions.
+
+ let mut trans = Vec::with_capacity(size_of::<A>() * self.state_count);
+ let mut map: HashMap<S, A> = HashMap::with_capacity(self.state_count);
+ for (old_id, state) in self.states() {
+ let pos = trans.len();
+ map.insert(old_id, usize_to_state_id(pos)?);
+
+ let n = state.ntrans;
+ let zeros = 2 + (n * 2) + (n * size_of::<A>());
+ trans.extend(iter::repeat(0).take(zeros));
+
+ NativeEndian::write_u16(&mut trans[pos..], n as u16);
+ let (s, e) = (pos + 2, pos + 2 + (n * 2));
+ trans[s..e].copy_from_slice(state.input_ranges);
+ }
+
+ let mut new = Repr {
+ anchored: self.anchored,
+ start: map[&self.start],
+ state_count: self.state_count,
+ max_match: map[&self.max_match],
+ byte_classes: self.byte_classes.clone(),
+ trans,
+ };
+ for (&old_id, &new_id) in map.iter() {
+ let old_state = self.state(old_id);
+ let mut new_state = new.state_mut(new_id);
+ for i in 0..new_state.ntrans {
+ let next = map[&old_state.next_at(i)];
+ new_state.set_next_at(i, usize_to_state_id(next.to_usize())?);
+ }
+ }
+ new.start = map[&self.start];
+ new.max_match = map[&self.max_match];
+ Ok(new)
+ }
+
+ /// Serialize a sparse DFA to raw bytes using the provided endianness.
+ ///
+ /// If the state identifier representation of this DFA has a size different
+ /// than 1, 2, 4 or 8 bytes, then this returns an error. All
+ /// implementations of `StateID` provided by this crate satisfy this
+ /// requirement.
+ ///
+ /// Unlike dense DFAs, the result is not necessarily aligned since a
+ /// sparse DFA's transition table is always read as a sequence of bytes.
+ #[cfg(feature = "std")]
+ fn to_bytes<A: ByteOrder>(&self) -> Result<Vec<u8>> {
+ let label = b"rust-regex-automata-sparse-dfa\x00";
+ let size =
+ // For human readable label.
+ label.len()
+ // endianness check, must be equal to 0xFEFF for native endian
+ + 2
+ // For version number.
+ + 2
+ // Size of state ID representation, in bytes.
+ // Must be 1, 2, 4 or 8.
+ + 2
+ // For DFA misc options. (Currently unused.)
+ + 2
+ // For start state.
+ + 8
+ // For state count.
+ + 8
+ // For max match state.
+ + 8
+ // For byte class map.
+ + 256
+ // For transition table.
+ + self.trans().len();
+
+ let mut i = 0;
+ let mut buf = vec![0; size];
+
+ // write label
+ for &b in label {
+ buf[i] = b;
+ i += 1;
+ }
+ // endianness check
+ A::write_u16(&mut buf[i..], 0xFEFF);
+ i += 2;
+ // version number
+ A::write_u16(&mut buf[i..], 1);
+ i += 2;
+ // size of state ID
+ let state_size = size_of::<S>();
+ if ![1, 2, 4, 8].contains(&state_size) {
+ return Err(Error::serialize(&format!(
+ "state size of {} not supported, must be 1, 2, 4 or 8",
+ state_size
+ )));
+ }
+ A::write_u16(&mut buf[i..], state_size as u16);
+ i += 2;
+ // DFA misc options
+ let mut options = 0u16;
+ if self.anchored {
+ options |= dense::MASK_ANCHORED;
+ }
+ A::write_u16(&mut buf[i..], options);
+ i += 2;
+ // start state
+ A::write_u64(&mut buf[i..], self.start.to_usize() as u64);
+ i += 8;
+ // state count
+ A::write_u64(&mut buf[i..], self.state_count as u64);
+ i += 8;
+ // max match state
+ A::write_u64(&mut buf[i..], self.max_match.to_usize() as u64);
+ i += 8;
+ // byte class map
+ for b in (0..256).map(|b| b as u8) {
+ buf[i] = self.byte_classes.get(b);
+ i += 1;
+ }
+ // transition table
+ for (_, state) in self.states() {
+ A::write_u16(&mut buf[i..], state.ntrans as u16);
+ i += 2;
+ buf[i..i + (state.ntrans * 2)].copy_from_slice(state.input_ranges);
+ i += state.ntrans * 2;
+ for j in 0..state.ntrans {
+ write_state_id_bytes::<A, _>(&mut buf[i..], state.next_at(j));
+ i += size_of::<S>();
+ }
+ }
+
+ assert_eq!(size, i, "expected to consume entire buffer");
+
+ Ok(buf)
+ }
+}
+
+impl<'a, S: StateID> Repr<&'a [u8], S> {
+ /// The implementation for deserializing a sparse DFA from raw bytes.
+ unsafe fn from_bytes(mut buf: &'a [u8]) -> Repr<&'a [u8], S> {
+ // skip over label
+ match buf.iter().position(|&b| b == b'\x00') {
+ None => panic!("could not find label"),
+ Some(i) => buf = &buf[i + 1..],
+ }
+
+ // check that current endianness is same as endianness of DFA
+ let endian_check = NativeEndian::read_u16(buf);
+ buf = &buf[2..];
+ if endian_check != 0xFEFF {
+ panic!(
+ "endianness mismatch, expected 0xFEFF but got 0x{:X}. \
+ are you trying to load a SparseDFA serialized with a \
+ different endianness?",
+ endian_check,
+ );
+ }
+
+ // check that the version number is supported
+ let version = NativeEndian::read_u16(buf);
+ buf = &buf[2..];
+ if version != 1 {
+ panic!(
+ "expected version 1, but found unsupported version {}",
+ version,
+ );
+ }
+
+ // read size of state
+ let state_size = NativeEndian::read_u16(buf) as usize;
+ if state_size != size_of::<S>() {
+ panic!(
+ "state size of SparseDFA ({}) does not match \
+ requested state size ({})",
+ state_size,
+ size_of::<S>(),
+ );
+ }
+ buf = &buf[2..];
+
+ // read miscellaneous options
+ let opts = NativeEndian::read_u16(buf);
+ buf = &buf[2..];
+
+ // read start state
+ let start = S::from_usize(NativeEndian::read_u64(buf) as usize);
+ buf = &buf[8..];
+
+ // read state count
+ let state_count = NativeEndian::read_u64(buf) as usize;
+ buf = &buf[8..];
+
+ // read max match state
+ let max_match = S::from_usize(NativeEndian::read_u64(buf) as usize);
+ buf = &buf[8..];
+
+ // read byte classes
+ let byte_classes = ByteClasses::from_slice(&buf[..256]);
+ buf = &buf[256..];
+
+ Repr {
+ anchored: opts & dense::MASK_ANCHORED > 0,
+ start,
+ state_count,
+ max_match,
+ byte_classes,
+ trans: buf,
+ }
+ }
+}
+
+#[cfg(feature = "std")]
+impl<S: StateID> Repr<Vec<u8>, S> {
+ /// The implementation for constructing a sparse DFA from a dense DFA.
+ fn from_dense_sized<T: AsRef<[S]>, A: StateID>(
+ dfa: &dense::Repr<T, S>,
+ ) -> Result<Repr<Vec<u8>, A>> {
+ // In order to build the transition table, we need to be able to write
+ // state identifiers for each of the "next" transitions in each state.
+ // Our state identifiers correspond to the byte offset in the
+ // transition table at which the state is encoded. Therefore, we do not
+ // actually know what the state identifiers are until we've allocated
+ // exactly as much space as we need for each state. Thus, construction
+ // of the transition table happens in two passes.
+ //
+ // In the first pass, we fill out the shell of each state, which
+ // includes the transition count, the input byte ranges and zero-filled
+ // space for the transitions. In this first pass, we also build up a
+ // map from the state identifier index of the dense DFA to the state
+ // identifier in this sparse DFA.
+ //
+ // In the second pass, we fill in the transitions based on the map
+ // built in the first pass.
+
+ let mut trans = Vec::with_capacity(size_of::<A>() * dfa.state_count());
+ let mut remap: Vec<A> = vec![dead_id(); dfa.state_count()];
+ for (old_id, state) in dfa.states() {
+ let pos = trans.len();
+
+ remap[dfa.state_id_to_index(old_id)] = usize_to_state_id(pos)?;
+ // zero-filled space for the transition count
+ trans.push(0);
+ trans.push(0);
+
+ let mut trans_count = 0;
+ for (b1, b2, _) in state.sparse_transitions() {
+ trans_count += 1;
+ trans.push(b1);
+ trans.push(b2);
+ }
+ // fill in the transition count
+ NativeEndian::write_u16(&mut trans[pos..], trans_count);
+
+ // zero-fill the actual transitions
+ let zeros = trans_count as usize * size_of::<A>();
+ trans.extend(iter::repeat(0).take(zeros));
+ }
+
+ let mut new = Repr {
+ anchored: dfa.is_anchored(),
+ start: remap[dfa.state_id_to_index(dfa.start_state())],
+ state_count: dfa.state_count(),
+ max_match: remap[dfa.state_id_to_index(dfa.max_match_state())],
+ byte_classes: dfa.byte_classes().clone(),
+ trans,
+ };
+ for (old_id, old_state) in dfa.states() {
+ let new_id = remap[dfa.state_id_to_index(old_id)];
+ let mut new_state = new.state_mut(new_id);
+ let sparse = old_state.sparse_transitions();
+ for (i, (_, _, next)) in sparse.enumerate() {
+ let next = remap[dfa.state_id_to_index(next)];
+ new_state.set_next_at(i, next);
+ }
+ }
+ Ok(new)
+ }
+
+ /// Return a convenient mutable representation of the given state.
+ fn state_mut<'a>(&'a mut self, id: S) -> StateMut<'a, S> {
+ let mut pos = id.to_usize();
+ let ntrans = NativeEndian::read_u16(&self.trans[pos..]) as usize;
+ pos += 2;
+
+ let size = (ntrans * 2) + (ntrans * size_of::<S>());
+ let ranges_and_next = &mut self.trans[pos..pos + size];
+ let (input_ranges, next) = ranges_and_next.split_at_mut(ntrans * 2);
+ StateMut { _state_id_repr: PhantomData, ntrans, input_ranges, next }
+ }
+}
+
+#[cfg(feature = "std")]
+impl<T: AsRef<[u8]>, S: StateID> fmt::Debug for Repr<T, S> {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ fn state_status<T: AsRef<[u8]>, S: StateID>(
+ dfa: &Repr<T, S>,
+ id: S,
+ ) -> &'static str {
+ if id == dead_id() {
+ if dfa.is_match_state(id) {
+ "D*"
+ } else {
+ "D "
+ }
+ } else if id == dfa.start_state() {
+ if dfa.is_match_state(id) {
+ ">*"
+ } else {
+ "> "
+ }
+ } else {
+ if dfa.is_match_state(id) {
+ " *"
+ } else {
+ " "
+ }
+ }
+ }
+
+ writeln!(f, "SparseDFA(")?;
+ for (id, state) in self.states() {
+ let status = state_status(self, id);
+ writeln!(f, "{}{:06}: {:?}", status, id.to_usize(), state)?;
+ }
+ writeln!(f, ")")?;
+ Ok(())
+ }
+}
+
+/// An iterator over all states in a sparse DFA.
+///
+/// This iterator yields tuples, where the first element is the state ID and
+/// the second element is the state itself.
+#[cfg(feature = "std")]
+#[derive(Debug)]
+struct StateIter<'a, T: AsRef<[u8]> + 'a, S: StateID + 'a = usize> {
+ dfa: &'a Repr<T, S>,
+ id: S,
+}
+
+#[cfg(feature = "std")]
+impl<'a, T: AsRef<[u8]>, S: StateID> Iterator for StateIter<'a, T, S> {
+ type Item = (S, State<'a, S>);
+
+ fn next(&mut self) -> Option<(S, State<'a, S>)> {
+ if self.id.to_usize() >= self.dfa.trans().len() {
+ return None;
+ }
+ let id = self.id;
+ let state = self.dfa.state(id);
+ self.id = S::from_usize(self.id.to_usize() + state.bytes());
+ Some((id, state))
+ }
+}
+
+/// A representation of a sparse DFA state that can be cheaply materialized
+/// from a state identifier.
+#[derive(Clone)]
+struct State<'a, S: StateID = usize> {
+ /// The state identifier representation used by the DFA from which this
+ /// state was extracted. Since our transition table is compacted in a
+ /// &[u8], we don't actually use the state ID type parameter explicitly
+ /// anywhere, so we fake it. This prevents callers from using an incorrect
+ /// state ID representation to read from this state.
+ _state_id_repr: PhantomData<S>,
+ /// The number of transitions in this state.
+ ntrans: usize,
+ /// Pairs of input ranges, where there is one pair for each transition.
+ /// Each pair specifies an inclusive start and end byte range for the
+ /// corresponding transition.
+ input_ranges: &'a [u8],
+ /// Transitions to the next state. This slice contains native endian
+ /// encoded state identifiers, with `S` as the representation. Thus, there
+ /// are `ntrans * size_of::<S>()` bytes in this slice.
+ next: &'a [u8],
+}
+
+impl<'a, S: StateID> State<'a, S> {
+ /// Searches for the next transition given an input byte. If no such
+ /// transition could be found, then a dead state is returned.
+ fn next(&self, input: u8) -> S {
+ // This straight linear search was observed to be much better than
+ // binary search on ASCII haystacks, likely because a binary search
+ // visits the ASCII case last but a linear search sees it first. A
+ // binary search does do a little better on non-ASCII haystacks, but
+ // not by much. There might be a better trade off lurking here.
+ for i in 0..self.ntrans {
+ let (start, end) = self.range(i);
+ if start <= input && input <= end {
+ return self.next_at(i);
+ }
+ // We could bail early with an extra branch: if input < b1, then
+ // we know we'll never find a matching transition. Interestingly,
+ // this extra branch seems to not help performance, or will even
+ // hurt it. It's likely very dependent on the DFA itself and what
+ // is being searched.
+ }
+ dead_id()
+ }
+
+ /// Returns the inclusive input byte range for the ith transition in this
+ /// state.
+ fn range(&self, i: usize) -> (u8, u8) {
+ (self.input_ranges[i * 2], self.input_ranges[i * 2 + 1])
+ }
+
+ /// Returns the next state for the ith transition in this state.
+ fn next_at(&self, i: usize) -> S {
+ S::read_bytes(&self.next[i * size_of::<S>()..])
+ }
+
+ /// Return the total number of bytes that this state consumes in its
+ /// encoded form.
+ #[cfg(feature = "std")]
+ fn bytes(&self) -> usize {
+ 2 + (self.ntrans * 2) + (self.ntrans * size_of::<S>())
+ }
+}
+
+#[cfg(feature = "std")]
+impl<'a, S: StateID> fmt::Debug for State<'a, S> {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ let mut transitions = vec![];
+ for i in 0..self.ntrans {
+ let next = self.next_at(i);
+ if next == dead_id() {
+ continue;
+ }
+
+ let (start, end) = self.range(i);
+ if start == end {
+ transitions.push(format!(
+ "{} => {}",
+ escape(start),
+ next.to_usize()
+ ));
+ } else {
+ transitions.push(format!(
+ "{}-{} => {}",
+ escape(start),
+ escape(end),
+ next.to_usize(),
+ ));
+ }
+ }
+ write!(f, "{}", transitions.join(", "))
+ }
+}
+
+/// A representation of a mutable sparse DFA state that can be cheaply
+/// materialized from a state identifier.
+#[cfg(feature = "std")]
+struct StateMut<'a, S: StateID = usize> {
+ /// The state identifier representation used by the DFA from which this
+ /// state was extracted. Since our transition table is compacted in a
+ /// &[u8], we don't actually use the state ID type parameter explicitly
+ /// anywhere, so we fake it. This prevents callers from using an incorrect
+ /// state ID representation to read from this state.
+ _state_id_repr: PhantomData<S>,
+ /// The number of transitions in this state.
+ ntrans: usize,
+ /// Pairs of input ranges, where there is one pair for each transition.
+ /// Each pair specifies an inclusive start and end byte range for the
+ /// corresponding transition.
+ input_ranges: &'a mut [u8],
+ /// Transitions to the next state. This slice contains native endian
+ /// encoded state identifiers, with `S` as the representation. Thus, there
+ /// are `ntrans * size_of::<S>()` bytes in this slice.
+ next: &'a mut [u8],
+}
+
+#[cfg(feature = "std")]
+impl<'a, S: StateID> StateMut<'a, S> {
+ /// Sets the ith transition to the given state.
+ fn set_next_at(&mut self, i: usize, next: S) {
+ next.write_bytes(&mut self.next[i * size_of::<S>()..]);
+ }
+}
+
+#[cfg(feature = "std")]
+impl<'a, S: StateID> fmt::Debug for StateMut<'a, S> {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ let state = State {
+ _state_id_repr: self._state_id_repr,
+ ntrans: self.ntrans,
+ input_ranges: self.input_ranges,
+ next: self.next,
+ };
+ fmt::Debug::fmt(&state, f)
+ }
+}
+
+/// Return the given byte as its escaped string form.
+#[cfg(feature = "std")]
+fn escape(b: u8) -> String {
+ use std::ascii;
+
+ String::from_utf8(ascii::escape_default(b).collect::<Vec<_>>()).unwrap()
+}
+
+/// A binary search routine specialized to a sparse DFA state's
+/// transitions. Specifically, the transitions are defined as a set of pairs
+/// of input bytes that delineate an inclusive range of bytes. If the input
+/// byte is in the range, then the corresponding transition is a match.
+///
+/// This binary search accepts a slice of these pairs and returns the position
+/// of the matching pair (the ith transition), or None if no matching pair
+/// could be found.
+///
+/// Note that this routine is not currently used since it was observed to
+/// either decrease performance when searching ASCII, or to not provide enough
+/// of a boost on non-ASCII haystacks to be worth it. However, we leave it here
+/// for posterity in case we can find a way to use it.
+///
+/// In theory, we could use the standard library's search routine if we could
+/// cast a `&[u8]` to a `&[(u8, u8)]`, but I don't believe this is currently
+/// guaranteed to be safe and is thus UB (since I don't think the in-memory
+/// representation of `(u8, u8)` has been nailed down).
+#[inline(always)]
+#[allow(dead_code)]
+fn binary_search_ranges(ranges: &[u8], needle: u8) -> Option<usize> {
+ debug_assert!(ranges.len() % 2 == 0, "ranges must have even length");
+ debug_assert!(ranges.len() <= 512, "ranges should be short");
+
+ let (mut left, mut right) = (0, ranges.len() / 2);
+ while left < right {
+ let mid = (left + right) / 2;
+ let (b1, b2) = (ranges[mid * 2], ranges[mid * 2 + 1]);
+ if needle < b1 {
+ right = mid;
+ } else if needle > b2 {
+ left = mid + 1;
+ } else {
+ return Some(mid);
+ }
+ }
+ None
+}
diff --git a/src/sparse_set.rs b/src/sparse_set.rs
new file mode 100644
index 0000000..6f145ba
--- /dev/null
+++ b/src/sparse_set.rs
@@ -0,0 +1,60 @@
+use std::slice;
+
+/// A sparse set used for representing ordered NFA states.
+///
+/// This supports constant time addition and membership testing. Clearing an
+/// entire set can also be done in constant time. Iteration yields elements
+/// in the order in which they were inserted.
+///
+/// The data structure is based on: http://research.swtch.com/sparse
+/// Note though that we don't actually use uninitialized memory. We generally
+/// reuse sparse sets, so the initial allocation cost is bearable. However, its
+/// other properties listed above are extremely useful.
+#[derive(Clone, Debug)]
+pub struct SparseSet {
+ /// Dense contains the instruction pointers in the order in which they
+ /// were inserted.
+ dense: Vec<usize>,
+ /// Sparse maps instruction pointers to their location in dense.
+ ///
+ /// An instruction pointer is in the set if and only if
+ /// sparse[ip] < dense.len() && ip == dense[sparse[ip]].
+ sparse: Box<[usize]>,
+}
+
+impl SparseSet {
+ pub fn new(size: usize) -> SparseSet {
+ SparseSet {
+ dense: Vec::with_capacity(size),
+ sparse: vec![0; size].into_boxed_slice(),
+ }
+ }
+
+ pub fn len(&self) -> usize {
+ self.dense.len()
+ }
+
+ pub fn insert(&mut self, value: usize) {
+ let i = self.len();
+ assert!(i < self.dense.capacity());
+ self.dense.push(value);
+ self.sparse[value] = i;
+ }
+
+ pub fn contains(&self, value: usize) -> bool {
+ let i = self.sparse[value];
+ self.dense.get(i) == Some(&value)
+ }
+
+ pub fn clear(&mut self) {
+ self.dense.clear();
+ }
+}
+
+impl<'a> IntoIterator for &'a SparseSet {
+ type Item = &'a usize;
+ type IntoIter = slice::Iter<'a, usize>;
+ fn into_iter(self) -> Self::IntoIter {
+ self.dense.iter()
+ }
+}
diff --git a/src/state_id.rs b/src/state_id.rs
new file mode 100644
index 0000000..c9bac19
--- /dev/null
+++ b/src/state_id.rs
@@ -0,0 +1,291 @@
+use core::fmt::Debug;
+use core::hash::Hash;
+use core::mem::size_of;
+
+use byteorder::{ByteOrder, NativeEndian};
+
+#[cfg(feature = "std")]
+pub use self::std::*;
+
+#[cfg(feature = "std")]
+mod std {
+ use byteorder::ByteOrder;
+ use core::mem::size_of;
+ use error::{Error, Result};
+
+ use super::StateID;
+
+ /// Check that the premultiplication of the given state identifier can
+ /// fit into the representation indicated by `S`. If it cannot, or if it
+ /// overflows `usize` itself, then an error is returned.
+ pub fn premultiply_overflow_error<S: StateID>(
+ last_state: S,
+ alphabet_len: usize,
+ ) -> Result<()> {
+ let requested = match last_state.to_usize().checked_mul(alphabet_len) {
+ Some(requested) => requested,
+ None => return Err(Error::premultiply_overflow(0, 0)),
+ };
+ if requested > S::max_id() {
+ return Err(Error::premultiply_overflow(S::max_id(), requested));
+ }
+ Ok(())
+ }
+
+ /// Allocate the next sequential identifier for a fresh state given
+ /// the previously constructed state identified by `current`. If the
+ /// next sequential identifier would overflow `usize` or the chosen
+ /// representation indicated by `S`, then an error is returned.
+ pub fn next_state_id<S: StateID>(current: S) -> Result<S> {
+ let next = match current.to_usize().checked_add(1) {
+ Some(next) => next,
+ None => return Err(Error::state_id_overflow(::std::usize::MAX)),
+ };
+ if next > S::max_id() {
+ return Err(Error::state_id_overflow(S::max_id()));
+ }
+ Ok(S::from_usize(next))
+ }
+
+ /// Convert the given `usize` to the chosen state identifier
+ /// representation. If the given value cannot fit in the chosen
+ /// representation, then an error is returned.
+ pub fn usize_to_state_id<S: StateID>(value: usize) -> Result<S> {
+ if value > S::max_id() {
+ Err(Error::state_id_overflow(S::max_id()))
+ } else {
+ Ok(S::from_usize(value))
+ }
+ }
+
+ /// Write the given identifier to the given slice of bytes using the
+ /// specified endianness. The given slice must have length at least
+ /// `size_of::<S>()`.
+ ///
+ /// The given state identifier representation must have size 1, 2, 4 or 8.
+ pub fn write_state_id_bytes<E: ByteOrder, S: StateID>(
+ slice: &mut [u8],
+ id: S,
+ ) {
+ assert!(
+ 1 == size_of::<S>()
+ || 2 == size_of::<S>()
+ || 4 == size_of::<S>()
+ || 8 == size_of::<S>()
+ );
+
+ match size_of::<S>() {
+ 1 => slice[0] = id.to_usize() as u8,
+ 2 => E::write_u16(slice, id.to_usize() as u16),
+ 4 => E::write_u32(slice, id.to_usize() as u32),
+ 8 => E::write_u64(slice, id.to_usize() as u64),
+ _ => unreachable!(),
+ }
+ }
+}
+
+/// Return the unique identifier for a DFA's dead state in the chosen
+/// representation indicated by `S`.
+pub fn dead_id<S: StateID>() -> S {
+ S::from_usize(0)
+}
+
+/// A trait describing the representation of a DFA's state identifier.
+///
+/// The purpose of this trait is to safely express both the possible state
+/// identifier representations that can be used in a DFA and to convert between
+/// state identifier representations and types that can be used to efficiently
+/// index memory (such as `usize`).
+///
+/// In general, one should not need to implement this trait explicitly. In
+/// particular, this crate provides implementations for `u8`, `u16`, `u32`,
+/// `u64` and `usize`. (`u32` and `u64` are only provided for targets that can
+/// represent all corresponding values in a `usize`.)
+///
+/// # Safety
+///
+/// This trait is unsafe because the correctness of its implementations may be
+/// relied upon by other unsafe code. For example, one possible way to
+/// implement this trait incorrectly would be to return a maximum identifier
+/// in `max_id` that is greater than the real maximum identifier. This will
+/// likely result in wrap-on-overflow semantics in release mode, which can in
+/// turn produce incorrect state identifiers. Those state identifiers may then
+/// in turn access out-of-bounds memory in a DFA's search routine, where bounds
+/// checks are explicitly elided for performance reasons.
+pub unsafe trait StateID:
+ Clone + Copy + Debug + Eq + Hash + PartialEq + PartialOrd + Ord
+{
+ /// Convert from a `usize` to this implementation's representation.
+ ///
+ /// Implementors may assume that `n <= Self::max_id`. That is, implementors
+ /// do not need to check whether `n` can fit inside this implementation's
+ /// representation.
+ fn from_usize(n: usize) -> Self;
+
+ /// Convert this implementation's representation to a `usize`.
+ ///
+ /// Implementors must not return a `usize` value greater than
+ /// `Self::max_id` and must not permit overflow when converting between the
+ /// implementor's representation and `usize`. In general, the preferred
+ /// way for implementors to achieve this is to simply not provide
+ /// implementations of `StateID` that cannot fit into the target platform's
+ /// `usize`.
+ fn to_usize(self) -> usize;
+
+ /// Return the maximum state identifier supported by this representation.
+ ///
+ /// Implementors must return a correct bound. Doing otherwise may result
+ /// in memory unsafety.
+ fn max_id() -> usize;
+
+ /// Read a single state identifier from the given slice of bytes in native
+ /// endian format.
+ ///
+ /// Implementors may assume that the given slice has length at least
+ /// `size_of::<Self>()`.
+ fn read_bytes(slice: &[u8]) -> Self;
+
+ /// Write this state identifier to the given slice of bytes in native
+ /// endian format.
+ ///
+ /// Implementors may assume that the given slice has length at least
+ /// `size_of::<Self>()`.
+ fn write_bytes(self, slice: &mut [u8]);
+}
+
+unsafe impl StateID for usize {
+ #[inline]
+ fn from_usize(n: usize) -> usize {
+ n
+ }
+
+ #[inline]
+ fn to_usize(self) -> usize {
+ self
+ }
+
+ #[inline]
+ fn max_id() -> usize {
+ ::core::usize::MAX
+ }
+
+ #[inline]
+ fn read_bytes(slice: &[u8]) -> Self {
+ NativeEndian::read_uint(slice, size_of::<usize>()) as usize
+ }
+
+ #[inline]
+ fn write_bytes(self, slice: &mut [u8]) {
+ NativeEndian::write_uint(slice, self as u64, size_of::<usize>())
+ }
+}
+
+unsafe impl StateID for u8 {
+ #[inline]
+ fn from_usize(n: usize) -> u8 {
+ n as u8
+ }
+
+ #[inline]
+ fn to_usize(self) -> usize {
+ self as usize
+ }
+
+ #[inline]
+ fn max_id() -> usize {
+ ::core::u8::MAX as usize
+ }
+
+ #[inline]
+ fn read_bytes(slice: &[u8]) -> Self {
+ slice[0]
+ }
+
+ #[inline]
+ fn write_bytes(self, slice: &mut [u8]) {
+ slice[0] = self;
+ }
+}
+
+unsafe impl StateID for u16 {
+ #[inline]
+ fn from_usize(n: usize) -> u16 {
+ n as u16
+ }
+
+ #[inline]
+ fn to_usize(self) -> usize {
+ self as usize
+ }
+
+ #[inline]
+ fn max_id() -> usize {
+ ::core::u16::MAX as usize
+ }
+
+ #[inline]
+ fn read_bytes(slice: &[u8]) -> Self {
+ NativeEndian::read_u16(slice)
+ }
+
+ #[inline]
+ fn write_bytes(self, slice: &mut [u8]) {
+ NativeEndian::write_u16(slice, self)
+ }
+}
+
+#[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
+unsafe impl StateID for u32 {
+ #[inline]
+ fn from_usize(n: usize) -> u32 {
+ n as u32
+ }
+
+ #[inline]
+ fn to_usize(self) -> usize {
+ self as usize
+ }
+
+ #[inline]
+ fn max_id() -> usize {
+ ::core::u32::MAX as usize
+ }
+
+ #[inline]
+ fn read_bytes(slice: &[u8]) -> Self {
+ NativeEndian::read_u32(slice)
+ }
+
+ #[inline]
+ fn write_bytes(self, slice: &mut [u8]) {
+ NativeEndian::write_u32(slice, self)
+ }
+}
+
+#[cfg(target_pointer_width = "64")]
+unsafe impl StateID for u64 {
+ #[inline]
+ fn from_usize(n: usize) -> u64 {
+ n as u64
+ }
+
+ #[inline]
+ fn to_usize(self) -> usize {
+ self as usize
+ }
+
+ #[inline]
+ fn max_id() -> usize {
+ ::core::u64::MAX as usize
+ }
+
+ #[inline]
+ fn read_bytes(slice: &[u8]) -> Self {
+ NativeEndian::read_u64(slice)
+ }
+
+ #[inline]
+ fn write_bytes(self, slice: &mut [u8]) {
+ NativeEndian::write_u64(slice, self)
+ }
+}
diff --git a/src/transducer.rs b/src/transducer.rs
new file mode 100644
index 0000000..679c757
--- /dev/null
+++ b/src/transducer.rs
@@ -0,0 +1,107 @@
+use fst::Automaton;
+
+use crate::{StateID, DFA};
+
+macro_rules! imp {
+ ($ty:ty, $id:ty) => {
+ impl<T: AsRef<[$id]>, S: StateID> Automaton for $ty {
+ type State = S;
+
+ #[inline]
+ fn start(&self) -> S {
+ self.start_state()
+ }
+
+ #[inline]
+ fn is_match(&self, state: &S) -> bool {
+ self.is_match_state(*state)
+ }
+
+ #[inline]
+ fn accept(&self, state: &S, byte: u8) -> S {
+ self.next_state(*state, byte)
+ }
+
+ #[inline]
+ fn can_match(&self, state: &S) -> bool {
+ !self.is_dead_state(*state)
+ }
+ }
+ };
+}
+
+imp!(crate::dense::DenseDFA<T, S>, S);
+imp!(crate::dense::Standard<T, S>, S);
+imp!(crate::dense::ByteClass<T, S>, S);
+imp!(crate::dense::Premultiplied<T, S>, S);
+imp!(crate::dense::PremultipliedByteClass<T, S>, S);
+imp!(crate::sparse::SparseDFA<T, S>, u8);
+imp!(crate::sparse::Standard<T, S>, u8);
+imp!(crate::sparse::ByteClass<T, S>, u8);
+
+#[cfg(test)]
+mod tests {
+ use bstr::BString;
+ use fst::{Automaton, IntoStreamer, Set, Streamer};
+
+ use crate::dense::{self, DenseDFA};
+ use crate::sparse::SparseDFA;
+
+ fn search<A: Automaton, D: AsRef<[u8]>>(
+ set: &Set<D>,
+ aut: A,
+ ) -> Vec<BString> {
+ let mut stream = set.search(aut).into_stream();
+
+ let mut results = vec![];
+ while let Some(key) = stream.next() {
+ results.push(BString::from(key));
+ }
+ results
+ }
+
+ #[test]
+ fn dense_anywhere() {
+ let set =
+ Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
+ .unwrap();
+ let dfa = DenseDFA::new("ba.*").unwrap();
+ let got = search(&set, &dfa);
+ assert_eq!(got, vec!["bar", "baz", "xba", "xbax"]);
+ }
+
+ #[test]
+ fn dense_anchored() {
+ let set =
+ Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
+ .unwrap();
+ let dfa = dense::Builder::new().anchored(true).build("ba.*").unwrap();
+ let got = search(&set, &dfa);
+ assert_eq!(got, vec!["bar", "baz"]);
+ }
+
+ #[test]
+ fn sparse_anywhere() {
+ let set =
+ Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
+ .unwrap();
+ let dfa = SparseDFA::new("ba.*").unwrap();
+ let got = search(&set, &dfa);
+ assert_eq!(got, vec!["bar", "baz", "xba", "xbax"]);
+ }
+
+ #[test]
+ fn sparse_anchored() {
+ let set =
+ Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
+ .unwrap();
+ let dfa = dense::Builder::new()
+ .anchored(true)
+ .build("ba.*")
+ .unwrap()
+ .to_sparse()
+ .unwrap();
+ let got = search(&set, &dfa);
+ assert_eq!(got, vec!["bar", "baz"]);
+ }
+}
diff --git a/tests/collection.rs b/tests/collection.rs
new file mode 100644
index 0000000..68b0322
--- /dev/null
+++ b/tests/collection.rs
@@ -0,0 +1,461 @@
+use std::collections::BTreeMap;
+use std::env;
+use std::fmt::{self, Write};
+use std::thread;
+
+use regex;
+use regex_automata::{DenseDFA, ErrorKind, Regex, RegexBuilder, StateID, DFA};
+use serde_bytes;
+use toml;
+
+macro_rules! load {
+ ($col:ident, $path:expr) => {
+ $col.extend(RegexTests::load(
+ concat!("../data/tests/", $path),
+ include_bytes!(concat!("../data/tests/", $path)),
+ ));
+ };
+}
+
+lazy_static! {
+ pub static ref SUITE: RegexTestCollection = {
+ let mut col = RegexTestCollection::new();
+ load!(col, "fowler/basic.toml");
+ load!(col, "fowler/nullsubexpr.toml");
+ load!(col, "fowler/repetition.toml");
+ load!(col, "fowler/repetition-long.toml");
+ load!(col, "crazy.toml");
+ load!(col, "flags.toml");
+ load!(col, "iter.toml");
+ load!(col, "no-unicode.toml");
+ load!(col, "unicode.toml");
+ col
+ };
+}
+
+#[derive(Clone, Debug)]
+pub struct RegexTestCollection {
+ pub by_name: BTreeMap<String, RegexTest>,
+}
+
+#[derive(Clone, Debug, Deserialize)]
+pub struct RegexTests {
+ pub tests: Vec<RegexTest>,
+}
+
+#[derive(Clone, Debug, Deserialize)]
+pub struct RegexTest {
+ pub name: String,
+ #[serde(default)]
+ pub options: Vec<RegexTestOption>,
+ pub pattern: String,
+ #[serde(with = "serde_bytes")]
+ pub input: Vec<u8>,
+ #[serde(rename = "matches")]
+ pub matches: Vec<Match>,
+ #[serde(default)]
+ pub captures: Vec<Option<Match>>,
+ #[serde(default)]
+ pub fowler_line_number: Option<u64>,
+}
+
+#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq)]
+#[serde(rename_all = "kebab-case")]
+pub enum RegexTestOption {
+ Anchored,
+ CaseInsensitive,
+ NoUnicode,
+ Escaped,
+ #[serde(rename = "invalid-utf8")]
+ InvalidUTF8,
+}
+
+#[derive(Clone, Copy, Deserialize, Eq, PartialEq)]
+pub struct Match {
+ pub start: usize,
+ pub end: usize,
+}
+
+impl RegexTestCollection {
+ fn new() -> RegexTestCollection {
+ RegexTestCollection { by_name: BTreeMap::new() }
+ }
+
+ fn extend(&mut self, tests: RegexTests) {
+ for test in tests.tests {
+ let name = test.name.clone();
+ if self.by_name.contains_key(&name) {
+ panic!("found duplicate test {}", name);
+ }
+ self.by_name.insert(name, test);
+ }
+ }
+
+ pub fn tests(&self) -> Vec<&RegexTest> {
+ self.by_name.values().collect()
+ }
+}
+
+impl RegexTests {
+ fn load(path: &str, slice: &[u8]) -> RegexTests {
+ let mut data: RegexTests = toml::from_slice(slice)
+ .expect(&format!("failed to load {}", path));
+ for test in &mut data.tests {
+ if test.options.contains(&RegexTestOption::Escaped) {
+ test.input = unescape_bytes(&test.input);
+ }
+ }
+ data
+ }
+}
+
+#[derive(Debug)]
+pub struct RegexTester {
+ asserted: bool,
+ results: RegexTestResults,
+ skip_expensive: bool,
+ whitelist: Vec<regex::Regex>,
+ blacklist: Vec<regex::Regex>,
+}
+
+impl Drop for RegexTester {
+ fn drop(&mut self) {
+ // If we haven't asserted yet, then the test is probably buggy, so
+ // fail it. But if we're already panicking (e.g., a bug in the regex
+ // engine), then don't double-panic, which causes an immediate abort.
+ if !thread::panicking() && !self.asserted {
+ panic!("must call RegexTester::assert at end of test");
+ }
+ }
+}
+
+impl RegexTester {
+ pub fn new() -> RegexTester {
+ let mut tester = RegexTester {
+ asserted: false,
+ results: RegexTestResults::default(),
+ skip_expensive: false,
+ whitelist: vec![],
+ blacklist: vec![],
+ };
+ for x in env::var("REGEX_TEST").unwrap_or("".to_string()).split(",") {
+ let x = x.trim();
+ if x.is_empty() {
+ continue;
+ }
+ if x.starts_with("-") {
+ tester = tester.blacklist(&x[1..]);
+ } else {
+ tester = tester.whitelist(x);
+ }
+ }
+ tester
+ }
+
+ pub fn skip_expensive(mut self) -> RegexTester {
+ self.skip_expensive = true;
+ self
+ }
+
+ pub fn whitelist(mut self, name: &str) -> RegexTester {
+ self.whitelist.push(regex::Regex::new(name).unwrap());
+ self
+ }
+
+ pub fn blacklist(mut self, name: &str) -> RegexTester {
+ self.blacklist.push(regex::Regex::new(name).unwrap());
+ self
+ }
+
+ pub fn assert(&mut self) {
+ self.asserted = true;
+ self.results.assert();
+ }
+
+ pub fn build_regex<S: StateID>(
+ &self,
+ mut builder: RegexBuilder,
+ test: &RegexTest,
+ ) -> Option<Regex<DenseDFA<Vec<S>, S>>> {
+ if self.skip(test) {
+ return None;
+ }
+ self.apply_options(test, &mut builder);
+
+ match builder.build_with_size::<S>(&test.pattern) {
+ Ok(re) => Some(re),
+ Err(err) => {
+ if let ErrorKind::Unsupported(_) = *err.kind() {
+ None
+ } else {
+ panic!(
+ "failed to build {:?} with pattern '{:?}': {}",
+ test.name, test.pattern, err
+ );
+ }
+ }
+ }
+ }
+
+ pub fn test_all<'a, I, T>(&mut self, builder: RegexBuilder, tests: I)
+ where
+ I: IntoIterator<IntoIter = T, Item = &'a RegexTest>,
+ T: Iterator<Item = &'a RegexTest>,
+ {
+ for test in tests {
+ let builder = builder.clone();
+ let re: Regex = match self.build_regex(builder, test) {
+ None => continue,
+ Some(re) => re,
+ };
+ self.test(test, &re);
+ }
+ }
+
+ pub fn test<'a, D: DFA>(&mut self, test: &RegexTest, re: &Regex<D>) {
+ self.test_is_match(test, re);
+ self.test_find(test, re);
+ // Some tests (namely, fowler) are designed only to detect the
+ // first match even if there are more subsequent matches. To that
+ // end, we only test match iteration when the number of matches
+ // expected is not 1, or if the test name has 'iter' in it.
+ if test.name.contains("iter") || test.matches.len() != 1 {
+ self.test_find_iter(test, re);
+ }
+ }
+
+ pub fn test_is_match<'a, D: DFA>(
+ &mut self,
+ test: &RegexTest,
+ re: &Regex<D>,
+ ) {
+ self.asserted = false;
+
+ let got = re.is_match(&test.input);
+ let expected = test.matches.len() >= 1;
+ if got == expected {
+ self.results.succeeded.push(test.clone());
+ return;
+ }
+ self.results.failed.push(RegexTestFailure {
+ test: test.clone(),
+ kind: RegexTestFailureKind::IsMatch,
+ });
+ }
+
+ pub fn test_find<'a, D: DFA>(&mut self, test: &RegexTest, re: &Regex<D>) {
+ self.asserted = false;
+
+ let got =
+ re.find(&test.input).map(|(start, end)| Match { start, end });
+ if got == test.matches.get(0).map(|&m| m) {
+ self.results.succeeded.push(test.clone());
+ return;
+ }
+ self.results.failed.push(RegexTestFailure {
+ test: test.clone(),
+ kind: RegexTestFailureKind::Find { got },
+ });
+ }
+
+ pub fn test_find_iter<'a, D: DFA>(
+ &mut self,
+ test: &RegexTest,
+ re: &Regex<D>,
+ ) {
+ self.asserted = false;
+
+ let got: Vec<Match> = re
+ .find_iter(&test.input)
+ .map(|(start, end)| Match { start, end })
+ .collect();
+ if got == test.matches {
+ self.results.succeeded.push(test.clone());
+ return;
+ }
+ self.results.failed.push(RegexTestFailure {
+ test: test.clone(),
+ kind: RegexTestFailureKind::FindIter { got },
+ });
+ }
+
+ fn skip(&self, test: &RegexTest) -> bool {
+ if self.skip_expensive {
+ if test.name.starts_with("repetition-long") {
+ return true;
+ }
+ }
+ if !self.blacklist.is_empty() {
+ if self.blacklist.iter().any(|re| re.is_match(&test.name)) {
+ return true;
+ }
+ }
+ if !self.whitelist.is_empty() {
+ if !self.whitelist.iter().any(|re| re.is_match(&test.name)) {
+ return true;
+ }
+ }
+ false
+ }
+
+ fn apply_options(&self, test: &RegexTest, builder: &mut RegexBuilder) {
+ for opt in &test.options {
+ match *opt {
+ RegexTestOption::Anchored => {
+ builder.anchored(true);
+ }
+ RegexTestOption::CaseInsensitive => {
+ builder.case_insensitive(true);
+ }
+ RegexTestOption::NoUnicode => {
+ builder.unicode(false);
+ }
+ RegexTestOption::Escaped => {}
+ RegexTestOption::InvalidUTF8 => {
+ builder.allow_invalid_utf8(true);
+ }
+ }
+ }
+ }
+}
+
+#[derive(Clone, Debug, Default)]
+pub struct RegexTestResults {
+ /// Tests that succeeded.
+ pub succeeded: Vec<RegexTest>,
+ /// Failed tests, indexed by group name.
+ pub failed: Vec<RegexTestFailure>,
+}
+
+#[derive(Clone, Debug)]
+pub struct RegexTestFailure {
+ test: RegexTest,
+ kind: RegexTestFailureKind,
+}
+
+#[derive(Clone, Debug)]
+pub enum RegexTestFailureKind {
+ IsMatch,
+ Find { got: Option<Match> },
+ FindIter { got: Vec<Match> },
+}
+
+impl RegexTestResults {
+ pub fn assert(&self) {
+ if self.failed.is_empty() {
+ return;
+ }
+ let failures = self
+ .failed
+ .iter()
+ .map(|f| f.to_string())
+ .collect::<Vec<String>>()
+ .join("\n\n");
+ panic!(
+ "found {} failures:\n{}\n{}\n{}\n\n\
+ Set the REGEX_TEST environment variable to filter tests, \n\
+ e.g., REGEX_TEST=crazy-misc,-crazy-misc2 runs every test \n\
+ whose name contains crazy-misc but not crazy-misc2\n\n",
+ self.failed.len(),
+ "~".repeat(79),
+ failures.trim(),
+ "~".repeat(79)
+ )
+ }
+}
+
+impl fmt::Display for RegexTestFailure {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ write!(
+ f,
+ "{}: {}\n \
+ options: {:?}\n \
+ pattern: {}\n \
+ pattern (escape): {}\n \
+ input: {}\n \
+ input (escape): {}\n \
+ input (hex): {}",
+ self.test.name,
+ self.kind.fmt(&self.test)?,
+ self.test.options,
+ self.test.pattern,
+ escape_default(&self.test.pattern),
+ nice_raw_bytes(&self.test.input),
+ escape_bytes(&self.test.input),
+ hex_bytes(&self.test.input)
+ )
+ }
+}
+
+impl RegexTestFailureKind {
+ fn fmt(&self, test: &RegexTest) -> Result<String, fmt::Error> {
+ let mut buf = String::new();
+ match *self {
+ RegexTestFailureKind::IsMatch => {
+ if let Some(&m) = test.matches.get(0) {
+ write!(buf, "expected match (at {}), but none found", m)?
+ } else {
+ write!(buf, "expected no match, but found a match")?
+ }
+ }
+ RegexTestFailureKind::Find { got } => write!(
+ buf,
+ "expected {:?}, but found {:?}",
+ test.matches.get(0),
+ got
+ )?,
+ RegexTestFailureKind::FindIter { ref got } => write!(
+ buf,
+ "expected {:?}, but found {:?}",
+ test.matches, got
+ )?,
+ }
+ Ok(buf)
+ }
+}
+
+impl fmt::Display for Match {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ write!(f, "({}, {})", self.start, self.end)
+ }
+}
+
+impl fmt::Debug for Match {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ write!(f, "({}, {})", self.start, self.end)
+ }
+}
+
+fn nice_raw_bytes(bytes: &[u8]) -> String {
+ use std::str;
+
+ match str::from_utf8(bytes) {
+ Ok(s) => s.to_string(),
+ Err(_) => escape_bytes(bytes),
+ }
+}
+
+fn escape_bytes(bytes: &[u8]) -> String {
+ use std::ascii;
+
+ let escaped = bytes
+ .iter()
+ .flat_map(|&b| ascii::escape_default(b))
+ .collect::<Vec<u8>>();
+ String::from_utf8(escaped).unwrap()
+}
+
+fn hex_bytes(bytes: &[u8]) -> String {
+ bytes.iter().map(|&b| format!(r"\x{:02X}", b)).collect()
+}
+
+fn escape_default(s: &str) -> String {
+ s.chars().flat_map(|c| c.escape_default()).collect()
+}
+
+fn unescape_bytes(bytes: &[u8]) -> Vec<u8> {
+ use std::str;
+ use unescape::unescape;
+
+ unescape(&str::from_utf8(bytes).expect("all input must be valid UTF-8"))
+}
diff --git a/tests/regression.rs b/tests/regression.rs
new file mode 100644
index 0000000..c2d2c12
--- /dev/null
+++ b/tests/regression.rs
@@ -0,0 +1,42 @@
+use regex_automata::{dense, DFA};
+
// A regression test for checking that minimization correctly translates
// whether a state is a match state or not. Previously, it was possible for
// minimization to mark a non-matching state as matching.
#[test]
fn minimize_sets_correct_match_states() {
    let pattern =
        // This is a subset of the grapheme matching regex. I couldn't seem
        // to get a repro any smaller than this unfortunately.
        //
        // NOTE(review): the exact pattern and builder configuration below
        // constitute the reproduction; do not simplify them.
        r"(?x)
        (?:
            \p{gcb=Prepend}*
            (?:
                (?:
                    (?:
                        \p{gcb=L}*
                        (?:\p{gcb=V}+|\p{gcb=LV}\p{gcb=V}*|\p{gcb=LVT})
                        \p{gcb=T}*
                    )
                    |
                    \p{gcb=L}+
                    |
                    \p{gcb=T}+
                )
                |
                \p{Extended_Pictographic}
                (?:\p{gcb=Extend}*\p{gcb=ZWJ}\p{Extended_Pictographic})*
                |
                [^\p{gcb=Control}\p{gcb=CR}\p{gcb=LF}]
            )
            [\p{gcb=Extend}\p{gcb=ZWJ}\p{gcb=SpacingMark}]*
        )
        ";

    // Minimization must be enabled to trigger the bug described above;
    // anchoring matches how the grapheme regex is used.
    let dfa = dense::Builder::new()
        .minimize(true)
        .anchored(true)
        .build(pattern)
        .unwrap();
    // A lone \xE2 byte should not match; with the bug, minimization could
    // mark the corresponding non-matching state as a match state.
    assert_eq!(None, dfa.find(b"\xE2"));
}
diff --git a/tests/suite.rs b/tests/suite.rs
new file mode 100644
index 0000000..8397194
--- /dev/null
+++ b/tests/suite.rs
@@ -0,0 +1,250 @@
+use regex_automata::{DenseDFA, Regex, RegexBuilder, SparseDFA};
+
+use collection::{RegexTester, SUITE};
+
#[test]
fn unminimized_standard() {
    // Baseline configuration: no minimization, no premultiplication, no
    // byte classes.
    let mut builder = RegexBuilder::new();
    builder.minimize(false);
    builder.premultiply(false);
    builder.byte_classes(false);

    let mut tester = RegexTester::new().skip_expensive();
    tester.test_all(builder, SUITE.tests());
    tester.assert();
}
+
#[test]
fn unminimized_premultiply() {
    // Unminimized DFA with premultiplied state identifiers only.
    let mut builder = RegexBuilder::new();
    builder.minimize(false);
    builder.premultiply(true);
    builder.byte_classes(false);

    let mut tester = RegexTester::new().skip_expensive();
    tester.test_all(builder, SUITE.tests());
    tester.assert();
}
+
#[test]
fn unminimized_byte_class() {
    // Unminimized DFA with byte classes only.
    let mut builder = RegexBuilder::new();
    builder.minimize(false);
    builder.premultiply(false);
    builder.byte_classes(true);

    let mut tester = RegexTester::new();
    tester.test_all(builder, SUITE.tests());
    tester.assert();
}
+
#[test]
fn unminimized_premultiply_byte_class() {
    // Unminimized DFA with both premultiplication and byte classes.
    let mut builder = RegexBuilder::new();
    builder.minimize(false);
    builder.premultiply(true);
    builder.byte_classes(true);

    let mut tester = RegexTester::new();
    tester.test_all(builder, SUITE.tests());
    tester.assert();
}
+
#[test]
fn unminimized_standard_no_nfa_shrink() {
    // Baseline configuration, but with NFA shrinking disabled as well.
    let mut builder = RegexBuilder::new();
    builder.minimize(false);
    builder.premultiply(false);
    builder.byte_classes(false);
    builder.shrink(false);

    let mut tester = RegexTester::new().skip_expensive();
    tester.test_all(builder, SUITE.tests());
    tester.assert();
}
+
#[test]
fn minimized_standard() {
    // Minimized DFA with no other transformations.
    let mut builder = RegexBuilder::new();
    builder.minimize(true);
    builder.premultiply(false);
    builder.byte_classes(false);

    let mut tester = RegexTester::new().skip_expensive();
    tester.test_all(builder, SUITE.tests());
    tester.assert();
}
+
#[test]
fn minimized_premultiply() {
    // Minimized DFA with premultiplied state identifiers.
    let mut builder = RegexBuilder::new();
    builder.minimize(true);
    builder.premultiply(true);
    builder.byte_classes(false);

    let mut tester = RegexTester::new().skip_expensive();
    tester.test_all(builder, SUITE.tests());
    tester.assert();
}
+
#[test]
fn minimized_byte_class() {
    // Minimized DFA with byte classes.
    let mut builder = RegexBuilder::new();
    builder.minimize(true);
    builder.premultiply(false);
    builder.byte_classes(true);

    let mut tester = RegexTester::new();
    tester.test_all(builder, SUITE.tests());
    tester.assert();
}
+
#[test]
fn minimized_premultiply_byte_class() {
    // Minimized DFA with both premultiplication and byte classes.
    let mut builder = RegexBuilder::new();
    builder.minimize(true);
    builder.premultiply(true);
    builder.byte_classes(true);

    let mut tester = RegexTester::new();
    tester.test_all(builder, SUITE.tests());
    tester.assert();
}
+
#[test]
fn minimized_standard_no_nfa_shrink() {
    // Minimized DFA with NFA shrinking disabled.
    let mut builder = RegexBuilder::new();
    builder.minimize(true);
    builder.premultiply(false);
    builder.byte_classes(false);
    builder.shrink(false);

    let mut tester = RegexTester::new().skip_expensive();
    tester.test_all(builder, SUITE.tests());
    tester.assert();
}
+
// A basic sanity check that a regex can be converted to a smaller (16-bit)
// state ID representation and that the converted regex still passes our
// tests.
//
// If tests grow minimal regexes that cannot be represented in 16 bits, then
// we'll either want to skip those or increase the size to test to u32.
#[test]
fn u16() {
    let mut builder = RegexBuilder::new();
    builder.minimize(true).premultiply(false).byte_classes(true);

    let mut tester = RegexTester::new().skip_expensive();
    for test in SUITE.tests() {
        let re: Regex = match tester.build_regex(builder.clone(), test) {
            Some(re) => re,
            None => continue,
        };
        // Shrink both underlying DFAs to u16 state IDs and re-run the test
        // against the reassembled regex.
        let fwd = re.forward().to_u16().unwrap();
        let rev = re.reverse().to_u16().unwrap();
        tester.test(test, &Regex::from_dfas(fwd, rev));
    }
    tester.assert();
}
+
// Test that sparse DFAs work using the standard configuration.
#[test]
fn sparse_unminimized_standard() {
    let mut builder = RegexBuilder::new();
    builder.minimize(false);
    builder.premultiply(false);
    builder.byte_classes(false);

    let mut tester = RegexTester::new().skip_expensive();
    for test in SUITE.tests() {
        let re: Regex = match tester.build_regex(builder.clone(), test) {
            Some(re) => re,
            None => continue,
        };
        // Convert each half of the regex to its sparse representation and
        // re-run the test against the reassembled regex.
        let sparse_re = Regex::from_dfas(
            re.forward().to_sparse().unwrap(),
            re.reverse().to_sparse().unwrap(),
        );
        tester.test(test, &sparse_re);
    }
    tester.assert();
}
+
// Test that sparse DFAs work after converting them to a different state ID
// representation.
#[test]
fn sparse_u16() {
    let mut builder = RegexBuilder::new();
    builder.minimize(true);
    builder.premultiply(false);
    builder.byte_classes(false);

    let mut tester = RegexTester::new().skip_expensive();
    for test in SUITE.tests() {
        let re: Regex = match tester.build_regex(builder.clone(), test) {
            Some(re) => re,
            None => continue,
        };
        // Sparse conversion first, then shrink state IDs to u16.
        let fwd = re.forward().to_sparse().unwrap().to_u16().unwrap();
        let rev = re.reverse().to_sparse().unwrap().to_u16().unwrap();
        tester.test(test, &Regex::from_dfas(fwd, rev));
    }
    tester.assert();
}
+
// Another basic sanity test that checks we can serialize and then
// deserialize a regex, and that the resulting regex can be used for
// searching correctly.
#[test]
fn serialization_roundtrip() {
    let mut builder = RegexBuilder::new();
    builder.premultiply(false).byte_classes(true);

    let mut tester = RegexTester::new().skip_expensive();
    for test in SUITE.tests() {
        let re: Regex = match tester.build_regex(builder.clone(), test) {
            Some(re) => re,
            None => continue,
        };

        // Serialize both halves to raw bytes...
        let fwd_bytes = re.forward().to_bytes_native_endian().unwrap();
        let rev_bytes = re.reverse().to_bytes_native_endian().unwrap();
        // SAFETY: the bytes were produced by the serializer directly above,
        // with matching (native) endianness and state ID representation.
        let fwd: DenseDFA<&[usize], usize> =
            unsafe { DenseDFA::from_bytes(&fwd_bytes) };
        let rev: DenseDFA<&[usize], usize> =
            unsafe { DenseDFA::from_bytes(&rev_bytes) };
        // ...and confirm the round-tripped regex still searches correctly.
        tester.test(test, &Regex::from_dfas(fwd, rev));
    }
    tester.assert();
}
+
// A basic sanity test that checks we can serialize and then deserialize a
// regex using sparse DFAs, and that the resulting regex can be used for
// searching correctly.
#[test]
fn sparse_serialization_roundtrip() {
    let mut builder = RegexBuilder::new();
    builder.byte_classes(true);

    let mut tester = RegexTester::new().skip_expensive();
    for test in SUITE.tests() {
        let re: Regex = match tester.build_regex(builder.clone(), test) {
            Some(re) => re,
            None => continue,
        };

        // Convert both halves to sparse DFAs and serialize them to bytes.
        let fwd_bytes = re
            .forward()
            .to_sparse()
            .unwrap()
            .to_bytes_native_endian()
            .unwrap();
        let rev_bytes = re
            .reverse()
            .to_sparse()
            .unwrap()
            .to_bytes_native_endian()
            .unwrap();
        // SAFETY: the bytes were produced by the serializer directly above,
        // with matching (native) endianness and state ID representation.
        let fwd: SparseDFA<&[u8], usize> =
            unsafe { SparseDFA::from_bytes(&fwd_bytes) };
        let rev: SparseDFA<&[u8], usize> =
            unsafe { SparseDFA::from_bytes(&rev_bytes) };
        // Confirm the round-tripped regex still searches correctly.
        tester.test(test, &Regex::from_dfas(fwd, rev));
    }
    tester.assert();
}
diff --git a/tests/tests.rs b/tests/tests.rs
new file mode 100644
index 0000000..fb4cd77
--- /dev/null
+++ b/tests/tests.rs
@@ -0,0 +1,25 @@
+#[cfg(feature = "std")]
+#[macro_use]
+extern crate lazy_static;
+#[cfg(feature = "std")]
+extern crate regex;
+#[cfg(feature = "std")]
+extern crate regex_automata;
+#[cfg(feature = "std")]
+extern crate serde;
+#[cfg(feature = "std")]
+extern crate serde_bytes;
+#[cfg(feature = "std")]
+#[macro_use]
+extern crate serde_derive;
+#[cfg(feature = "std")]
+extern crate toml;
+
+#[cfg(feature = "std")]
+mod collection;
+#[cfg(feature = "std")]
+mod regression;
+#[cfg(feature = "std")]
+mod suite;
+#[cfg(feature = "std")]
+mod unescape;
diff --git a/tests/unescape.rs b/tests/unescape.rs
new file mode 100644
index 0000000..43fe04e
--- /dev/null
+++ b/tests/unescape.rs
@@ -0,0 +1,84 @@
/// The states of the escape-sequence scanner in `unescape` below.
#[derive(Clone, Copy, Eq, PartialEq)]
enum State {
    /// The state after seeing a `\`.
    Escape,
    /// The state after seeing a `\x`.
    HexFirst,
    /// The state after seeing a `\x[0-9A-Fa-f]`.
    HexSecond(char),
    /// Default state.
    Literal,
}

/// Expands the escape sequences `\\`, `\n`, `\r`, `\t` and `\xHH` in `s`
/// into the bytes they denote. Anything else — including malformed or
/// truncated escapes — is passed through verbatim.
pub fn unescape(s: &str) -> Vec<u8> {
    use self::State::*;

    let mut out: Vec<u8> = Vec::new();
    let mut state = Literal;
    for ch in s.chars() {
        state = match (state, ch) {
            // A backslash begins an escape sequence.
            (Literal, '\\') => Escape,
            // Any other literal character is emitted as its UTF-8 bytes.
            (Literal, c) => {
                let mut buf = [0u8; 4];
                out.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());
                Literal
            }
            // The recognized single-character escapes.
            (Escape, '\\') => {
                out.push(b'\\');
                Literal
            }
            (Escape, 'n') => {
                out.push(b'\n');
                Literal
            }
            (Escape, 'r') => {
                out.push(b'\r');
                Literal
            }
            (Escape, 't') => {
                out.push(b'\t');
                Literal
            }
            // `\x` starts a two-digit hex escape.
            (Escape, 'x') => HexFirst,
            // Unknown escape: emit the `\` and the character verbatim.
            (Escape, c) => {
                out.extend(format!(r"\{}", c).into_bytes());
                Literal
            }
            (HexFirst, c) if c.is_ascii_hexdigit() => HexSecond(c),
            // `\x` not followed by a hex digit: emit verbatim.
            (HexFirst, c) => {
                out.extend(format!(r"\x{}", c).into_bytes());
                Literal
            }
            // Second hex digit: decode the pair into a single byte.
            (HexSecond(first), c) if c.is_ascii_hexdigit() => {
                let byte =
                    u8::from_str_radix(&format!("{}{}", first, c), 16)
                        .unwrap();
                out.push(byte);
                Literal
            }
            // `\xH` followed by a non-hex character: emit verbatim.
            (HexSecond(first), c) => {
                out.extend(format!(r"\x{}{}", first, c).into_bytes());
                Literal
            }
        };
    }
    // Flush a partially-consumed escape at end of input verbatim.
    match state {
        Escape => out.push(b'\\'),
        HexFirst => out.extend_from_slice(b"\\x"),
        HexSecond(c) => out.extend(format!("\\x{}", c).into_bytes()),
        Literal => {}
    }
    out
}