author     Chih-Hung Hsieh <chh@google.com>  2020-04-17 01:35:30 +0000
committer  Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>  2020-04-17 01:35:30 +0000
commit     7b456f1d9de795231795b2541692205e00e93133 (patch)
tree       7820a857da0555fc089addcce3710b33363592ea
parent     15f90c2d64d844022e06bbb33cafdb56fae4989c (diff)
parent     70d61f6af2e52b51edfff374d4f8dfd4940018c9 (diff)
download   aho-corasick-7b456f1d9de795231795b2541692205e00e93133.tar.gz
Import 'aho-corasick' package version 0.7.10 am: 0a0edd505c am: 92b76d957f am: 70d61f6af2
Change-Id: I3485dbbb61c0610a1cd6ee32c9557f04c5b66c5d
-rw-r--r--  .cargo_vcs_info.json        |    5
-rw-r--r--  .github/workflows/ci.yml    |  111
-rw-r--r--  .gitignore                  |   12
-rw-r--r--  COPYING                     |    3
-rw-r--r--  Cargo.toml                  |   42
-rw-r--r--  Cargo.toml.orig             |   44
-rw-r--r--  DESIGN.md                   |  483
l---------  LICENSE                     |    1
-rw-r--r--  LICENSE-MIT                 |   21
-rw-r--r--  METADATA                    |   19
-rw-r--r--  MODULE_LICENSE_MIT          |    0
-rw-r--r--  OWNERS                      |    1
-rw-r--r--  README.md                   |  186
-rw-r--r--  UNLICENSE                   |   24
-rw-r--r--  rustfmt.toml                |    2
-rw-r--r--  src/ahocorasick.rs          | 2087
-rw-r--r--  src/automaton.rs            |  573
-rw-r--r--  src/buffer.rs               |  130
-rw-r--r--  src/byte_frequencies.rs     |  258
-rw-r--r--  src/classes.rs              |  238
-rw-r--r--  src/dfa.rs                  |  709
-rw-r--r--  src/error.rs                |  101
-rw-r--r--  src/lib.rs                  |  297
-rw-r--r--  src/nfa.rs                  | 1363
-rw-r--r--  src/packed/api.rs           |  632
-rw-r--r--  src/packed/mod.rs           |  117
-rw-r--r--  src/packed/pattern.rs       |  318
-rw-r--r--  src/packed/rabinkarp.rs     |  185
-rw-r--r--  src/packed/teddy/README.md  |  386
-rw-r--r--  src/packed/teddy/compile.rs |  414
-rw-r--r--  src/packed/teddy/mod.rs     |   62
-rw-r--r--  src/packed/teddy/runtime.rs | 1204
-rw-r--r--  src/packed/tests.rs         |  568
-rw-r--r--  src/packed/vector.rs        |  181
-rw-r--r--  src/prefilter.rs            |  997
-rw-r--r--  src/state_id.rs             |  192
-rw-r--r--  src/tests.rs                | 1152
37 files changed, 13118 insertions(+), 0 deletions(-)
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json
new file mode 100644
index 0000000..06dfa3d
--- /dev/null
+++ b/.cargo_vcs_info.json
@@ -0,0 +1,5 @@
+{
+ "git": {
+ "sha1": "36de9d383aeaf925c7425ed53eee91e61cb9b61c"
+ }
+}
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..06dcdd5
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,111 @@
+name: ci
+on:
+ pull_request:
+ push:
+ branches:
+ - master
+ schedule:
+ - cron: '00 01 * * *'
+jobs:
+ test:
+ name: test
+ env:
+ # For some builds, we use cross to test on 32-bit and big-endian
+ # systems.
+ CARGO: cargo
+ # When CARGO is set to CROSS, TARGET is set to `--target matrix.target`.
+ TARGET:
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ build:
+ - pinned
+ - stable
+ - stable-32
+ - stable-mips
+ - beta
+ - nightly
+ - macos
+ - win-msvc
+ - win-gnu
+ include:
+ - build: pinned
+ os: ubuntu-18.04
+ rust: 1.28.0
+ - build: stable
+ os: ubuntu-18.04
+ rust: stable
+ - build: stable-32
+ os: ubuntu-18.04
+ rust: stable
+ target: i686-unknown-linux-gnu
+ - build: stable-mips
+ os: ubuntu-18.04
+ rust: stable
+ target: mips64-unknown-linux-gnuabi64
+ - build: beta
+ os: ubuntu-18.04
+ rust: beta
+ - build: nightly
+ os: ubuntu-18.04
+ rust: nightly
+ - build: macos
+ os: macos-latest
+ rust: stable
+ - build: win-msvc
+ os: windows-2019
+ rust: stable
+ - build: win-gnu
+ os: windows-2019
+ rust: stable-x86_64-gnu
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v1
+ with:
+ fetch-depth: 1
+ - name: Install Rust
+ uses: actions-rs/toolchain@v1
+ with:
+ toolchain: ${{ matrix.rust }}
+ profile: minimal
+ override: true
+ - name: Use Cross
+ if: matrix.target != ''
+ run: |
+ # FIXME: to work around bugs in latest cross release, install master.
+ # See: https://github.com/rust-embedded/cross/issues/357
+ cargo install --git https://github.com/rust-embedded/cross
+ echo "::set-env name=CARGO::cross"
+ echo "::set-env name=TARGET::--target ${{ matrix.target }}"
+ - name: Show command used for Cargo
+ run: |
+ echo "cargo command is: ${{ env.CARGO }}"
+ echo "target flag is: ${{ env.TARGET }}"
+ - name: Show CPU info for debugging
+ if: matrix.os == 'ubuntu-18.04'
+ run: lscpu
+ - run: ${{ env.CARGO }} build --verbose
+ - run: ${{ env.CARGO }} doc --verbose
+ - run: ${{ env.CARGO }} test --verbose
+ - if: matrix.build == 'nightly'
+ run: ${{ env.CARGO }} build --manifest-path aho-corasick-debug/Cargo.toml
+ - if: matrix.build == 'nightly'
+ run: ${{ env.CARGO }} bench --verbose --manifest-path bench/Cargo.toml -- --test
+
+ rustfmt:
+ name: rustfmt
+ runs-on: ubuntu-18.04
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v1
+ with:
+ fetch-depth: 1
+ - name: Install Rust
+ uses: actions-rs/toolchain@v1
+ with:
+ toolchain: stable
+ profile: minimal
+ components: rustfmt
+ - name: Check formatting
+ run: |
+ cargo fmt --all -- --check
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f1a4d65
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,12 @@
+.*.swp
+doc
+tags
+examples/ss10pusa.csv
+build
+target
+/Cargo.lock
+scratch*
+bench_large/huge
+BREADCRUMBS
+/tmp
+/aho-corasick-debug/Cargo.lock
diff --git a/COPYING b/COPYING
new file mode 100644
index 0000000..bb9c20a
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,3 @@
+This project is dual-licensed under the Unlicense and MIT licenses.
+
+You may use this code under the terms of either license.
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..b240ec3
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,42 @@
+# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
+#
+# When uploading crates to the registry Cargo will automatically
+# "normalize" Cargo.toml files for maximal compatibility
+# with all versions of Cargo and also rewrite `path` dependencies
+# to registry (e.g., crates.io) dependencies
+#
+# If you believe there's an error in this file please file an
+# issue against the rust-lang/cargo repository. If you're
+# editing this file be aware that the upstream Cargo.toml
+# will likely look very different (and much more reasonable)
+
+[package]
+name = "aho-corasick"
+version = "0.7.10"
+authors = ["Andrew Gallant <jamslam@gmail.com>"]
+exclude = ["/aho-corasick-debug", "/ci/*", "/.travis.yml", "/appveyor.yml"]
+autotests = false
+description = "Fast multiple substring searching."
+homepage = "https://github.com/BurntSushi/aho-corasick"
+readme = "README.md"
+keywords = ["string", "search", "text", "aho", "multi"]
+categories = ["text-processing"]
+license = "Unlicense/MIT"
+repository = "https://github.com/BurntSushi/aho-corasick"
+[profile.bench]
+debug = true
+
+[profile.release]
+debug = true
+
+[lib]
+name = "aho_corasick"
+[dependencies.memchr]
+version = "2.2.0"
+default-features = false
+[dev-dependencies.doc-comment]
+version = "0.3.1"
+
+[features]
+default = ["std"]
+std = ["memchr/use_std"]
diff --git a/Cargo.toml.orig b/Cargo.toml.orig
new file mode 100644
index 0000000..3166f9b
--- /dev/null
+++ b/Cargo.toml.orig
@@ -0,0 +1,44 @@
+[package]
+name = "aho-corasick"
+version = "0.7.10" #:version
+authors = ["Andrew Gallant <jamslam@gmail.com>"]
+description = "Fast multiple substring searching."
+homepage = "https://github.com/BurntSushi/aho-corasick"
+repository = "https://github.com/BurntSushi/aho-corasick"
+readme = "README.md"
+keywords = ["string", "search", "text", "aho", "multi"]
+license = "Unlicense/MIT"
+categories = ["text-processing"]
+autotests = false
+exclude = [
+ "/aho-corasick-debug", "/ci/*", "/.travis.yml", "/appveyor.yml",
+]
+
+[workspace]
+members = ["bench"]
+# We'd ideally not do this, but since the debug tool uses Rust 2018, older
+# versions of Rust (such as 1.28) fail to parse the manifest because it treats
+# `edition = "2018"` as an unstable feature.
+#
+# When we move our MSRV to Rust 2018, then we should be able to add this back
+# to the workspace.
+exclude = ["aho-corasick-debug"]
+
+[lib]
+name = "aho_corasick"
+
+[features]
+default = ["std"]
+std = ["memchr/use_std"]
+
+[dependencies]
+memchr = { version = "2.2.0", default-features = false }
+
+[dev-dependencies]
+doc-comment = "0.3.1"
+
+[profile.release]
+debug = true
+
+[profile.bench]
+debug = true
diff --git a/DESIGN.md b/DESIGN.md
new file mode 100644
index 0000000..367e203
--- /dev/null
+++ b/DESIGN.md
@@ -0,0 +1,483 @@
+This document describes the internal design of this crate, which is an object
+lesson in what happens when you take a fairly simple old algorithm like
+Aho-Corasick and make it fast and production ready.
+
+The target audience of this document is Rust programmers who have some
+familiarity with string searching; however, one does not need to know the
+Aho-Corasick algorithm in order to read this (it is explained below). One
+should, however, know what a trie is. (If you don't, go read its Wikipedia
+article.)
+
+The centerpiece of this crate is an implementation of Aho-Corasick. On its
+own, Aho-Corasick isn't that complicated. The complex pieces come from the
+different variants of Aho-Corasick implemented in this crate. Specifically,
+they are:
+
+* Aho-Corasick as an NFA, using dense transitions near the root with sparse
+ transitions elsewhere.
+* Aho-Corasick as a DFA. (An NFA is slower to search, but cheaper to construct
+ and uses less memory.)
+ * A DFA with pre-multiplied state identifiers. This saves a multiplication
+ instruction in the core search loop.
+ * A DFA with equivalence classes of bytes as the alphabet, instead of the
+ traditional 256-byte alphabet. This shrinks the size of the DFA in memory,
+ but adds an extra lookup in the core search loop to map the input byte to
+  an equivalence class.
+* The option to choose how state identifiers are represented, via one of
+ u8, u16, u32, u64 or usize. This permits creating compact automatons when
+ matching a small number of patterns.
+* Supporting "standard" match semantics, along with its overlapping variant,
+ in addition to leftmost-first and leftmost-longest semantics. The "standard"
+ semantics are typically what you see in a textbook description of
+ Aho-Corasick. However, Aho-Corasick is also useful as an optimization in
+ regex engines, which often use leftmost-first or leftmost-longest semantics.
+ Thus, it is useful to implement those semantics here. The "standard" and
+ "leftmost" search algorithms are subtly different, and also require slightly
+ different construction algorithms.
+* Support for ASCII case insensitive matching.
+* Support for accelerating searches when the patterns all start with a small
+ number of fixed bytes. Or alternatively, when the patterns all contain a
+ small number of rare bytes. (Searching for these bytes uses SIMD vectorized
+ code courtesy of `memchr`.)
+* Transparent support for alternative SIMD vectorized search routines for
+  a smaller number of literals, such as the Teddy algorithm. We call these
+ "packed" search routines because they use SIMD. They can often be an order of
+ magnitude faster than just Aho-Corasick, but don't scale as well.
+* Support for searching streams. This can reuse most of the underlying code,
+ but does require careful buffering support.
+* Support for anchored searches, which permit efficient `is_prefix` checks for
+ a large number of patterns.
+
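+Most of these variants and features are surfaced through the crate's
+`AhoCorasickBuilder`. As a rough (and deliberately non-exhaustive) sketch of
+what that looks like from the caller's perspective:
+
+    use aho_corasick::{AhoCorasickBuilder, MatchKind};
+
+    let ac = AhoCorasickBuilder::new()
+        .match_kind(MatchKind::LeftmostFirst) // leftmost-first semantics
+        .ascii_case_insensitive(true) // ASCII case insensitivity
+        .dfa(true) // build a DFA instead of an NFA
+        .build(&["Sherlock", "Moriarty", "Watson"]);
+    assert!(ac.is_match("dr. watson, i presume"));
+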
+When you combine all of this together along with trying to make everything as
+fast as possible, what you end up with is entirely too much code with too much
+`unsafe`. Alas, I was not smart enough to figure out how to reduce it. Instead,
+we will explain it.
+
+
+# Basics
+
+The fundamental problem this crate is trying to solve is to determine the
+occurrences of possibly many patterns in a haystack. The naive way to solve
+this is to look for a match for each pattern at each position in the haystack:
+
+ for i in 0..haystack.len():
+ for p in patterns.iter():
+ if haystack[i..].starts_with(p.bytes()):
+ return Match(p.id(), i, i + p.bytes().len())
+
+Those four lines are effectively all this crate does. The problem with those
+four lines is that they are very slow, especially when you're searching for a
+large number of patterns.
+
+While there are many different algorithms available to solve this, a popular
+one is Aho-Corasick. It's a common solution because it's not too hard to
+implement, scales quite well even when searching for thousands of patterns and
+is generally pretty fast. Aho-Corasick does well here because, regardless of
+the number of patterns you're searching for, it always visits each byte in the
+haystack exactly once. This means, generally speaking, adding more patterns to
+an Aho-Corasick automaton does not make it slower. (Strictly speaking, however,
+this is not true, since a larger automaton will make less effective use of the
+CPU's cache.)
+
+Aho-Corasick can be succinctly described as a trie with state transitions
+between some of the nodes that efficiently instruct the search algorithm to
+try matching alternative keys in the automaton. The trick is that these state
+transitions are arranged such that each byte of input needs to be inspected
+only once. These state transitions are typically called "failure transitions,"
+because they instruct the searcher (the thing traversing the automaton while
+reading from the haystack) what to do when a byte in the haystack does not
+correspond to a valid transition in the current state of the trie.
+
+More formally, a failure transition points to a state in the automaton that may
+lead to a match whose prefix is a proper suffix of the path traversed through
+the trie so far. (If no such proper suffix exists, then the failure transition
+points back to the start state of the trie, effectively restarting the search.)
+This is perhaps simpler to explain pictorially. For example, let's say we built
+an Aho-Corasick automaton with the following patterns: 'abcd' and 'cef'. The
+trie looks like this:
+
+ a - S1 - b - S2 - c - S3 - d - S4*
+ /
+ S0 - c - S5 - e - S6 - f - S7*
+
+where states marked with a `*` are match states (meaning, the search algorithm
+should stop and report a match to the caller).
+
+So given this trie, it should be somewhat straightforward to see how it can
+be used to determine whether any particular haystack *starts* with either
+`abcd` or `cef`. It's easy to express this in code:
+
+ fn has_prefix(trie: &Trie, haystack: &[u8]) -> bool {
+ let mut state_id = trie.start();
+ // If the empty pattern is in trie, then state_id is a match state.
+ if trie.is_match(state_id) {
+ return true;
+ }
+ for (i, &b) in haystack.iter().enumerate() {
+ state_id = match trie.next_state(state_id, b) {
+ Some(id) => id,
+ // If there was no transition for this state and byte, then we know
+ // the haystack does not start with one of the patterns in our trie.
+ None => return false,
+ };
+ if trie.is_match(state_id) {
+ return true;
+ }
+ }
+ false
+ }
+
+And that's pretty much it. All we do is move through the trie starting with the
+bytes at the beginning of the haystack. If we find ourselves in a position
+where we can't move, or if we've looked through the entire haystack without
+seeing a match state, then we know the haystack does not start with any of the
+patterns in the trie.
+
+The meat of the Aho-Corasick algorithm is in how we add failure transitions to
+our trie to keep searching efficient. Specifically, it permits us to not only
+check whether a haystack *starts* with any one of a number of patterns, but
+rather, whether the haystack contains any of a number of patterns *anywhere* in
+the haystack.
+
+As mentioned before, failure transitions connect a proper suffix of the path
+traversed through the trie before, with a path that leads to a match that has a
+prefix corresponding to that proper suffix. So in our case, for patterns `abcd`
+and `cef`, with a haystack `abcef`, we want to transition to state `S5` (from
+the diagram above) from `S3` upon seeing that the byte following `c` is not
+`d`. Namely, the proper suffix in this example is `c`, which is a prefix of
+`cef`. So the modified diagram looks like this:
+
+
+ a - S1 - b - S2 - c - S3 - d - S4*
+ / /
+ / ----------------
+ / /
+ S0 - c - S5 - e - S6 - f - S7*
+
+One thing that isn't shown in this diagram is that *all* states have a failure
+transition, but only `S3` has a *non-trivial* failure transition. That is, all
+other states have a failure transition back to the start state. So if our
+haystack was `abzabcd`, then the searcher would transition back to `S0` after
+seeing `z`, which effectively restarts the search. (Because there is no pattern
+in our trie that has a prefix of `bz` or `z`.)
+
+The code for traversing this *automaton* or *finite state machine* (it is no
+longer just a trie) is not that much different from the `has_prefix` code
+above:
+
+ fn contains(fsm: &FiniteStateMachine, haystack: &[u8]) -> bool {
+ let mut state_id = fsm.start();
+ // If the empty pattern is in fsm, then state_id is a match state.
+ if fsm.is_match(state_id) {
+ return true;
+ }
+ for (i, &b) in haystack.iter().enumerate() {
+ // While the diagram above doesn't show this, we may wind up needing
+ // to follow multiple failure transitions before we land on a state
+ // in which we can advance. Therefore, when searching for the next
+ // state, we need to loop until we don't see a failure transition.
+ //
+ // This loop terminates because the start state has no empty
+ // transitions. Every transition from the start state either points to
+ // another state, or loops back to the start state.
+ loop {
+ match fsm.next_state(state_id, b) {
+ Some(id) => {
+ state_id = id;
+ break;
+ }
+ // Unlike our code above, if there was no transition for this
+ // state, then we don't quit. Instead, we look for this state's
+ // failure transition and follow that instead.
+ None => {
+ state_id = fsm.next_fail_state(state_id);
+ }
+ };
+ }
+ if fsm.is_match(state_id) {
+ return true;
+ }
+ }
+ false
+ }
+
+Other than the complication around traversing failure transitions, this code
+is still roughly "traverse the automaton with bytes from the haystack, and quit
+when a match is seen."
+
+And that concludes our section on the basics. While we didn't go deep into
+how the automaton is built (see `src/nfa.rs`, which has detailed comments about
+that), the basic structure of Aho-Corasick should be reasonably clear.
+
+
+# NFAs and DFAs
+
+There are generally two types of finite automata: non-deterministic finite
+automata (NFA) and deterministic finite automata (DFA). The difference between
+them is, principally, that an NFA can be in multiple states at once. This is
+typically accomplished by things called _epsilon_ transitions, where one could
+move to a new state without consuming any bytes from the input. (The other
+mechanism by which NFAs can be in more than one state is where the same byte in
+a particular state transitions to multiple distinct states.) In contrast, a DFA
+can only ever be in one state at a time. A DFA has no epsilon transitions, and
+for any given state, a byte transitions to at most one other state.
+
+By this formulation, the Aho-Corasick automaton described in the previous
+section is an NFA. This is because failure transitions are, effectively,
+epsilon transitions. That is, whenever the automaton is in state `S`, it is
+actually in the set of states that are reachable by recursively following
+failure transitions from `S`. (This means that, for example, the start state
+is always active since the start state is reachable via failure transitions
+from any state in the automaton.)
+
+NFAs have a lot of nice properties. They tend to be easier to construct, and
+also tend to use less memory. However, their primary downside is that they are
+typically slower to execute. For example, the code above showing how to search
+with an Aho-Corasick automaton needs to potentially iterate through many
+failure transitions for every byte of input. While this is a fairly small
+amount of overhead, this can add up, especially if the automaton has a lot of
+overlapping patterns with a lot of failure transitions.
+
+A DFA's search code, by contrast, looks like this:
+
+ fn contains(dfa: &DFA, haystack: &[u8]) -> bool {
+ let mut state_id = dfa.start();
+ // If the empty pattern is in dfa, then state_id is a match state.
+ if dfa.is_match(state_id) {
+ return true;
+ }
+ for (i, &b) in haystack.iter().enumerate() {
+ // An Aho-Corasick DFA *never* has a missing state that requires
+ // failure transitions to be followed. One byte of input advances the
+ // automaton by one state. Always.
+        state_id = dfa.next_state(state_id, b);
+        if dfa.is_match(state_id) {
+ return true;
+ }
+ }
+ false
+ }
+
+The search logic here is much simpler than for the NFA, and this tends to
+translate into significant performance benefits as well, since there's a lot
+less work being done for each byte in the haystack. How is this accomplished?
+It's done by pre-following all failure transitions for all states for all bytes
+in the alphabet, and then building a single state transition table. Building
+this DFA can be much more costly than building the NFA, and use much more
+memory, but the better performance can be worth it.
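+
+In pseudo-code, that pre-following step looks roughly like this (the names
+here are illustrative, not the crate's actual internal API):
+
+    for state in all_states:
+        for byte in 0..256:
+            # Walk failure transitions from `state` until a state with a
+            # transition on `byte` is found. The start state always has such
+            # a transition, so this walk terminates.
+            dfa.transitions[state][byte] = follow_failures(nfa, state, byte)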
+
+Users of this crate can actually choose between using an NFA or a DFA. By
+default, an NFA is used, because it typically strikes the best balance between
+space usage and search performance. But the DFA option is available for cases
+where a little extra memory and upfront time building the automaton is okay.
+For example, the `AhoCorasick::auto_configure` and
+`AhoCorasickBuilder::auto_configure` methods will enable the DFA setting if
+there are a small number of patterns.
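+
+For example, opting into the DFA explicitly looks like this (a minimal sketch
+using the public builder API):
+
+    use aho_corasick::AhoCorasickBuilder;
+
+    let ac = AhoCorasickBuilder::new()
+        .dfa(true) // trade more memory and build time for faster searches
+        .build(&["foo", "bar", "baz"]);
+    assert!(ac.is_match("xxx bar xxx"));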
+
+
+# More DFA tricks
+
+As described in the previous section, one of the downsides of using a DFA
+is that it uses more memory and can take longer to build. One small way of
+mitigating these concerns is to map the alphabet used by the automaton into
+a smaller space. Typically, the alphabet of a DFA has 256 elements in it:
+one element for each possible value that fits into a byte. However, in many
+cases, one does not need the full alphabet. For example, if all patterns in an
+Aho-Corasick automaton are ASCII letters, then this only uses up 52 distinct
+bytes. As far as the automaton is concerned, the remaining 204 bytes are
+indistinguishable from one another: they will never discriminate between a
+match and a non-match. Therefore, in cases like that, the alphabet can be
+shrunk
+to just 53 elements. One for each ASCII letter, and then another to serve as a
+placeholder for every other unused byte.
+
+In practice, this library doesn't quite compute the optimal set of equivalence
+classes, but it's close enough in most cases. The key idea is that this then
+allows the transition table for the DFA to be potentially much smaller. The
+downside of doing this, however, is that since the transition table is defined
+in terms of this smaller alphabet space, every byte in the haystack must be
+re-mapped to this smaller space. This requires an additional 256-byte table.
+In practice, this can lead to a small search time hit, but it can be difficult
+to measure. Moreover, it can sometimes lead to faster search times for bigger
+automata, since it could be the difference between more parts of the automaton
+staying in the CPU cache or not.
+
+One other trick for DFAs employed by this crate is the notion of premultiplying
+state identifiers. Specifically, the normal way to compute the next transition
+in a DFA is via the following (assuming that the transition table is laid out
+sequentially in memory, in row-major order, where the rows are states):
+
+ next_state_id = dfa.transitions[current_state_id * 256 + current_byte]
+
+However, since the value `256` is a fixed constant, we can actually premultiply
+the state identifiers in the table when we build the table initially. Then, the
+next transition computation simply becomes:
+
+ next_state_id = dfa.transitions[current_state_id + current_byte]
+
+This doesn't seem like much, but when this is being executed for every byte of
+input that you're searching, saving that extra multiplication instruction can
+add up.
+
+The same optimization works even when equivalence classes are enabled, as
+described above. The only difference is that the premultiplication is by the
+total number of equivalence classes instead of 256.
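+
+Concretely, when both byte classes and premultiplication are enabled, the
+transition lookup sketched above becomes (with `byte_to_class` standing in for
+the 256-byte mapping table described earlier):
+
+    class = byte_to_class[current_byte]
+    next_state_id = dfa.transitions[current_state_id + class]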
+
+There isn't much downside to premultiplying state identifiers, other than the
+fact that you may need to choose a bigger integer representation than you would
+otherwise. For example, if you don't premultiply state identifiers, then an
+automaton that uses `u8` as a state identifier can hold up to 256 states.
+However, if they are premultiplied, then it can only hold up to
+`floor(256 / len(alphabet))` states. Thus premultiplication impacts how compact
+your DFA can be. In practice, it's pretty rare to use `u8` as a state
+identifier, so premultiplication is usually a good thing to do.
+
+Both equivalence classes and premultiplication are tuneable parameters via the
+`AhoCorasickBuilder` type, and both are enabled by default.
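+
+For example, disabling both looks like this (a minimal sketch; both options
+only have an effect when the DFA is enabled):
+
+    use aho_corasick::AhoCorasickBuilder;
+
+    let ac = AhoCorasickBuilder::new()
+        .dfa(true)
+        .byte_classes(false) // use the full 256-byte alphabet
+        .premultiply(false) // keep unmultiplied state identifiers
+        .build(&["foo", "bar"]);
+    assert!(ac.is_match("xxx bar xxx"));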
+
+
+# Match semantics
+
+One of the more interesting things about this implementation of Aho-Corasick
+that (as far as this author knows) separates it from other implementations is
+that it natively supports leftmost-first and leftmost-longest match semantics.
+Briefly, match semantics refer to the decision procedure by which searching
+will disambiguate matches when there are multiple to choose from:
+
+* **standard** match semantics emits matches as soon as they are detected by
+ the automaton. This is typically equivalent to the textbook non-overlapping
+ formulation of Aho-Corasick.
+* **leftmost-first** match semantics means that 1) the next match is the match
+ starting at the leftmost position and 2) among multiple matches starting at
+ the same leftmost position, the match corresponding to the pattern provided
+ first by the caller is reported.
+* **leftmost-longest** is like leftmost-first, except when there are multiple
+ matches starting at the same leftmost position, the pattern corresponding to
+ the longest match is returned.
+
+(The crate API documentation discusses these differences, with examples, in
+more depth on the `MatchKind` type.)
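+
+As a quick illustration of the difference (mirroring the examples in the
+README):
+
+    use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
+
+    let patterns = &["Samwise", "Sam"];
+    let haystack = "Samwise";
+
+    // Standard semantics report a match as soon as one is detected.
+    let standard = AhoCorasick::new(patterns);
+    let mat = standard.find(haystack).unwrap();
+    assert_eq!("Sam", &haystack[mat.start()..mat.end()]);
+
+    // Leftmost-first semantics prefer the pattern provided first.
+    let leftmost = AhoCorasickBuilder::new()
+        .match_kind(MatchKind::LeftmostFirst)
+        .build(patterns);
+    let mat = leftmost.find(haystack).unwrap();
+    assert_eq!("Samwise", &haystack[mat.start()..mat.end()]);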
+
+The reason why supporting these match semantics is important is because it
+gives the user more control over the match procedure. For example,
+leftmost-first permits users to implement match priority by simply putting the
+higher priority patterns first. Leftmost-longest, on the other hand, permits
+finding the longest possible match, which might be useful when trying to find
+words matching a dictionary. Additionally, regex engines often want to use
+Aho-Corasick as an optimization when searching for an alternation of literals.
+In order to preserve correct match semantics, regex engines typically can't use
+the standard textbook definition directly, since regex engines will implement
+either leftmost-first (Perl-like) or leftmost-longest (POSIX) match semantics.
+
+Supporting leftmost semantics requires a couple key changes:
+
+* Constructing the Aho-Corasick automaton changes a bit in both how the trie is
+ constructed and how failure transitions are found. Namely, only a subset of
+ the failure transitions are added. Specifically, only the failure transitions
+ that either do not occur after a match or do occur after a match but preserve
+ that match are kept. (More details on this can be found in `src/nfa.rs`.)
+* The search algorithm changes slightly. Since we are looking for the leftmost
+ match, we cannot quit as soon as a match is detected. Instead, after a match
+ is detected, we must keep searching until either the end of the input or
+ until a dead state is seen. (Dead states are not used for standard match
+ semantics. Dead states mean that searching should stop after a match has been
+ found.)
+
+Other implementations of Aho-Corasick do support leftmost match semantics, but
+they do it with more overhead at search time, or even worse, with a queue of
+matches and sophisticated hijinks to disambiguate the matches. While our
+construction algorithm becomes a bit more complicated, the correct match
+semantics fall out from the structure of the automaton itself.
+
+
+# Overlapping matches
+
+One of the nice properties of an Aho-Corasick automaton is that it can report
+all possible matches, even when they overlap with one another. In this mode,
+the match semantics don't matter, since all possible matches are reported.
+Overlapping searches work just like regular searches, except the state
+identifier at which the previous search left off is carried over to the next
+search, so that it can pick up where it left off. If there are additional
+matches at that state, then they are reported before resuming the search.
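+
+Through the public API, this looks like the following (taken from the crate
+documentation):
+
+    use aho_corasick::AhoCorasick;
+
+    let patterns = &["append", "appendage", "app"];
+    let haystack = "append the app to the appendage";
+
+    let ac = AhoCorasick::new(patterns);
+    let matches: Vec<usize> = ac
+        .find_overlapping_iter(haystack)
+        .map(|mat| mat.pattern())
+        .collect();
+    assert_eq!(vec![2, 0, 2, 2, 0, 1], matches);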
+
+Enabling leftmost-first or leftmost-longest match semantics causes the
+automaton to use a subset of all failure transitions, which means that
+overlapping searches cannot be used. Therefore, if leftmost match semantics are
+used, attempting to do an overlapping search will panic. Thus, to get
+overlapping searches, the caller must use the default standard match semantics.
+This behavior was chosen because there are only two alternatives, which were
+deemed worse:
+
+* Compile two automatons internally, one for standard semantics and one for
+ the semantics requested by the caller (if not standard).
+* Create a new type, distinct from the `AhoCorasick` type, which has different
+ capabilities based on the configuration options.
+
+The first is untenable because of the amount of memory used by the automaton.
+The second increases the complexity of the API too much by adding too many
+types that do similar things. It is conceptually much simpler to keep all
+searching isolated to a single type. Callers may query whether the automaton
+supports overlapping searches via the `AhoCorasick::supports_overlapping`
+method.
+
+
+# Stream searching
+
+Since Aho-Corasick is an automaton, it is possible to run a partial search on
+one piece of the haystack and then resume that search on subsequent pieces.
+This is useful when the haystack you're trying to search is not stored
+contiguously in memory, or if one does not want to read the entire haystack
+into memory at once.
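+
+The public entry points for this are the `stream_*` methods. For example,
+search and replace over any `io::Read`/`io::Write` pair (mirroring the README
+example, with a byte slice and a `Vec` standing in for files):
+
+    use aho_corasick::AhoCorasick;
+
+    let patterns = &["fox", "brown", "quick"];
+    let replace_with = &["sloth", "grey", "slow"];
+
+    let rdr = "The quick brown fox.";
+    let mut wtr = vec![];
+
+    let ac = AhoCorasick::new(patterns);
+    ac.stream_replace_all(rdr.as_bytes(), &mut wtr, replace_with).unwrap();
+    assert_eq!(b"The slow grey sloth.".to_vec(), wtr);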
+
+Currently, only standard semantics are supported for stream searching. This is
+some of the more complicated code in this crate, and is something I would very
+much like to improve. In particular, it currently has the restriction that it
+must buffer at least enough of the haystack in memory in order to fit the
+longest possible match. The difficulty in getting stream searching right is
+that the implementation choices (such as the buffer size) often impact what the
+API looks like and what it's allowed to do.
+
+
+# Prefilters
+
+In some cases, Aho-Corasick is not the fastest way to find occurrences of
+multiple patterns. Sometimes, the search can be accelerated using highly
+optimized SIMD routines. For example, consider searching the following
+patterns:
+
+ Sherlock
+ Moriarty
+ Watson
+
+It is plausible that it would be much faster to quickly look for occurrences of
+the leading bytes, `S`, `M` or `W`, before trying to start searching via the
+automaton. Indeed, this is exactly what this crate will do.
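+
+A minimal sketch of the idea (not the crate's actual prefilter code) using
+`memchr` to skip ahead to candidate starting positions:
+
+    use memchr::memchr3;
+
+    // Hypothetical helper: every pattern starts with 'S', 'M' or 'W', so any
+    // match must begin at one of these positions.
+    fn next_candidate(haystack: &[u8], at: usize) -> Option<usize> {
+        memchr3(b'S', b'M', b'W', &haystack[at..]).map(|i| at + i)
+    }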
+
+When there are more than three distinct starting bytes, this crate will
+look for three distinct bytes occurring at any position in the patterns, while
+preferring bytes that are heuristically determined to be rare over others. For
+example:
+
+ Abuzz
+ Sanchez
+ Vasquez
+ Topaz
+ Waltz
+
+Here, we have more than 3 distinct starting bytes, but all of the patterns
+contain `z`, which is typically a rare byte. In this case, the prefilter will
+scan for `z`, back up a bit, and then execute the Aho-Corasick automaton.
+
+If all of that fails, then a packed multiple substring algorithm will be
+attempted. Currently, the only algorithm available for this is Teddy, but more
+may be added in the future. Teddy is unlike the above prefilters in that it
+confirms its own matches, so when Teddy is active, it might not be necessary
+for Aho-Corasick to run at all. (See `Automaton::leftmost_find_at_no_state_imp`
+in `src/automaton.rs`.) However, the current Teddy implementation only works
+on `x86_64` and when SSSE3 or AVX2 are available, and moreover, only works
+_well_ when there are a small number of patterns (say, less than 100). Teddy
+also requires the haystack to be of a certain length (more than 16-34 bytes).
+When the haystack is shorter than that, Rabin-Karp is used instead. (See
+`src/packed/rabinkarp.rs`.)
+
+There is a more thorough description of Teddy at
+[`src/packed/teddy/README.md`](src/packed/teddy/README.md).
diff --git a/LICENSE b/LICENSE
new file mode 120000
index 0000000..7f9a88e
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1 @@
+LICENSE-MIT
\ No newline at end of file
diff --git a/LICENSE-MIT b/LICENSE-MIT
new file mode 100644
index 0000000..3b0a5dc
--- /dev/null
+++ b/LICENSE-MIT
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2015 Andrew Gallant
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/METADATA b/METADATA
new file mode 100644
index 0000000..a6b685d
--- /dev/null
+++ b/METADATA
@@ -0,0 +1,19 @@
+name: "aho-corasick"
+description: "A library for finding occurrences of many patterns at once with SIMD acceleration in some cases. This library provides multiple pattern search principally through an implementation of the Aho-Corasick algorithm, which builds a finite state machine for executing searches in linear time. Features include case insensitive matching, overlapping matches and search & replace in streams."
+third_party {
+ url {
+ type: HOMEPAGE
+ value: "https://crates.io/crates/aho-corasick"
+ }
+ url {
+ type: GIT
+ value: "https://github.com/BurntSushi/aho-corasick"
+ }
+ version: "0.7.10"
+ license_type: NOTICE
+ last_upgrade_date {
+ year: 2020
+ month: 3
+ day: 31
+ }
+}
diff --git a/MODULE_LICENSE_MIT b/MODULE_LICENSE_MIT
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/MODULE_LICENSE_MIT
diff --git a/OWNERS b/OWNERS
new file mode 100644
index 0000000..46fc303
--- /dev/null
+++ b/OWNERS
@@ -0,0 +1 @@
+include platform/prebuilts/rust:/OWNERS
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..9ae3427
--- /dev/null
+++ b/README.md
@@ -0,0 +1,186 @@
+aho-corasick
+============
+A library for finding occurrences of many patterns at once with SIMD
+acceleration in some cases. This library provides multiple pattern
+search principally through an implementation of the
+[Aho-Corasick algorithm](https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm),
+which builds a finite state machine for executing searches in linear time.
+Features include case insensitive matching, overlapping matches and search &
+replace in streams.
+
+[![Build status](https://github.com/BurntSushi/aho-corasick/workflows/ci/badge.svg)](https://github.com/BurntSushi/aho-corasick/actions)
+[![](http://meritbadge.herokuapp.com/aho-corasick)](https://crates.io/crates/aho-corasick)
+
+Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org).
+
+
+### Documentation
+
+https://docs.rs/aho-corasick
+
+
+### Usage
+
+Add this to your `Cargo.toml`:
+
+```toml
+[dependencies]
+aho-corasick = "0.7"
+```
+
+and this to your crate root (if you're using Rust 2015):
+
+```rust
+extern crate aho_corasick;
+```
+
+
+### Example: basic searching
+
+This example shows how to search for occurrences of multiple patterns
+simultaneously. Each match includes the pattern that matched along with the
+byte offsets of the match.
+
+```rust
+use aho_corasick::AhoCorasick;
+
+let patterns = &["apple", "maple", "Snapple"];
+let haystack = "Nobody likes maple in their apple flavored Snapple.";
+
+let ac = AhoCorasick::new(patterns);
+let mut matches = vec![];
+for mat in ac.find_iter(haystack) {
+ matches.push((mat.pattern(), mat.start(), mat.end()));
+}
+assert_eq!(matches, vec![
+ (1, 13, 18),
+ (0, 28, 33),
+ (2, 43, 50),
+]);
+```
+
+
+### Example: case insensitivity
+
+This is like the previous example, but matches `Snapple` case insensitively
+using `AhoCorasickBuilder`:
+
+```rust
+use aho_corasick::AhoCorasickBuilder;
+
+let patterns = &["apple", "maple", "snapple"];
+let haystack = "Nobody likes maple in their apple flavored Snapple.";
+
+let ac = AhoCorasickBuilder::new()
+ .ascii_case_insensitive(true)
+ .build(patterns);
+let mut matches = vec![];
+for mat in ac.find_iter(haystack) {
+ matches.push((mat.pattern(), mat.start(), mat.end()));
+}
+assert_eq!(matches, vec![
+ (1, 13, 18),
+ (0, 28, 33),
+ (2, 43, 50),
+]);
+```
+
+
+### Example: replacing matches in a stream
+
+This example shows how to execute a search and replace on a stream without
+loading the entire stream into memory first.
+
+```rust
+use aho_corasick::AhoCorasick;
+
+let patterns = &["fox", "brown", "quick"];
+let replace_with = &["sloth", "grey", "slow"];
+
+// In a real example, these might be `std::fs::File`s instead. All you need to
+// do is supply a pair of `std::io::Read` and `std::io::Write` implementations.
+let rdr = "The quick brown fox.";
+let mut wtr = vec![];
+
+let ac = AhoCorasick::new(patterns);
+ac.stream_replace_all(rdr.as_bytes(), &mut wtr, replace_with)?;
+assert_eq!(b"The slow grey sloth.".to_vec(), wtr);
+```
+
+
+### Example: finding the leftmost first match
+
+In the textbook description of Aho-Corasick, its formulation is typically
+structured such that it reports all possible matches, even when they overlap
+with one another. In many cases, overlapping matches may not be desired; for
+example, you may instead want all successive non-overlapping matches, as you
+would with a standard regular expression.
+
+Unfortunately the "obvious" way to modify the Aho-Corasick algorithm to do
+this doesn't always work in the expected way, since it will report matches as
+soon as they are seen. For example, consider matching the regex `Samwise|Sam`
+against the text `Samwise`. Most regex engines (that are Perl-like, or
+non-POSIX) will report `Samwise` as a match, but the standard Aho-Corasick
+algorithm modified for reporting non-overlapping matches will report `Sam`.
+
+A novel contribution of this library is the ability to change the match
+semantics of Aho-Corasick (without additional search time overhead) such that
+`Samwise` is reported instead. For example, here's the standard approach:
+
+```rust
+use aho_corasick::AhoCorasick;
+
+let patterns = &["Samwise", "Sam"];
+let haystack = "Samwise";
+
+let ac = AhoCorasick::new(patterns);
+let mat = ac.find(haystack).expect("should have a match");
+assert_eq!("Sam", &haystack[mat.start()..mat.end()]);
+```
+
+And now here's the leftmost-first version, which matches how a Perl-like
+regex will work:
+
+```rust
+use aho_corasick::{AhoCorasickBuilder, MatchKind};
+
+let patterns = &["Samwise", "Sam"];
+let haystack = "Samwise";
+
+let ac = AhoCorasickBuilder::new()
+ .match_kind(MatchKind::LeftmostFirst)
+ .build(patterns);
+let mat = ac.find(haystack).expect("should have a match");
+assert_eq!("Samwise", &haystack[mat.start()..mat.end()]);
+```
+
+In addition to leftmost-first semantics, this library also supports
+leftmost-longest semantics, which match the POSIX behavior of a regular
+expression alternation. See `MatchKind` in the docs for more details.
+
+
+### Minimum Rust version policy
+
+This crate's minimum supported `rustc` version is `1.28.0`.
+
+The current policy is that the minimum Rust version required to use this crate
+can be increased in minor version updates. For example, if `crate 1.0` requires
+Rust 1.20.0, then `crate 1.0.z` for all values of `z` will also require Rust
+1.20.0 or newer. However, `crate 1.y` for `y > 0` may require a newer minimum
+version of Rust.
+
+In general, this crate will be conservative with respect to the minimum
+supported version of Rust.
+
+
+### Future work
+
+Here are some plans for the future:
+
+* Assuming the current API is sufficient, I'd like to commit to it and release
+ a `1.0` version of this crate some time in the next 6-12 months.
+* Support stream searching with leftmost match semantics. Currently, only
+ standard match semantics are supported. Getting this right seems possible,
+ but is tricky since the match state needs to be propagated through multiple
+ searches. (With standard semantics, as soon as a match is seen the search
+ ends.)
diff --git a/UNLICENSE b/UNLICENSE
new file mode 100644
index 0000000..68a49da
--- /dev/null
+++ b/UNLICENSE
@@ -0,0 +1,24 @@
+This is free and unencumbered software released into the public domain.
+
+Anyone is free to copy, modify, publish, use, compile, sell, or
+distribute this software, either in source code form or as a compiled
+binary, for any purpose, commercial or non-commercial, and by any
+means.
+
+In jurisdictions that recognize copyright laws, the author or authors
+of this software dedicate any and all copyright interest in the
+software to the public domain. We make this dedication for the benefit
+of the public at large and to the detriment of our heirs and
+successors. We intend this dedication to be an overt act of
+relinquishment in perpetuity of all present and future rights to this
+software under copyright law.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
+For more information, please refer to <http://unlicense.org/>
diff --git a/rustfmt.toml b/rustfmt.toml
new file mode 100644
index 0000000..aa37a21
--- /dev/null
+++ b/rustfmt.toml
@@ -0,0 +1,2 @@
+max_width = 79
+use_small_heuristics = "max"
diff --git a/src/ahocorasick.rs b/src/ahocorasick.rs
new file mode 100644
index 0000000..9b7d9e7
--- /dev/null
+++ b/src/ahocorasick.rs
@@ -0,0 +1,2087 @@
+use std::io;
+
+use automaton::Automaton;
+use buffer::Buffer;
+use dfa::{self, DFA};
+use error::Result;
+use nfa::{self, NFA};
+use packed;
+use prefilter::PrefilterState;
+use state_id::StateID;
+use Match;
+
+/// An automaton for searching multiple strings in linear time.
+///
+/// The `AhoCorasick` type supports a few basic ways of constructing an
+/// automaton, including
+/// [`AhoCorasick::new`](struct.AhoCorasick.html#method.new)
+/// and
+/// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured).
+/// However, there are a fair number of configurable options that can be set
+/// by using
+/// [`AhoCorasickBuilder`](struct.AhoCorasickBuilder.html)
+/// instead. Such options include, but are not limited to, how matches are
+/// determined, simple case insensitivity, whether to use a DFA or not, and
+/// various knobs for controlling the space-vs-time trade-offs taken when
+/// building the automaton.
+///
+/// If you aren't sure where to start, try beginning with
+/// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured).
+///
+/// # Resource usage
+///
+/// Aho-Corasick automatons are always constructed in `O(p)` time, where `p`
+/// is the combined length of all patterns being searched. With that said,
+/// building an automaton can be fairly costly because of high constant
+/// factors, particularly when enabling the
+/// [DFA](struct.AhoCorasickBuilder.html#method.dfa)
+/// option (which is disabled by default). For this reason, it's generally a
+/// good idea to build an automaton once and reuse it as much as possible.
+///
+/// Aho-Corasick automatons can also use a fair bit of memory. To get a
+/// concrete idea of how much memory is being used, try using the
+/// [`AhoCorasick::heap_bytes`](struct.AhoCorasick.html#method.heap_bytes)
+/// method.
+///
+/// # Examples
+///
+/// This example shows how to search for occurrences of multiple patterns
+/// simultaneously in a case insensitive fashion. Each match includes the
+/// pattern that matched along with the byte offsets of the match.
+///
+/// ```
+/// use aho_corasick::AhoCorasickBuilder;
+///
+/// let patterns = &["apple", "maple", "snapple"];
+/// let haystack = "Nobody likes maple in their apple flavored Snapple.";
+///
+/// let ac = AhoCorasickBuilder::new()
+/// .ascii_case_insensitive(true)
+/// .build(patterns);
+/// let mut matches = vec![];
+/// for mat in ac.find_iter(haystack) {
+/// matches.push((mat.pattern(), mat.start(), mat.end()));
+/// }
+/// assert_eq!(matches, vec![
+/// (1, 13, 18),
+/// (0, 28, 33),
+/// (2, 43, 50),
+/// ]);
+/// ```
+///
+/// This example shows how to replace matches with some other string:
+///
+/// ```
+/// use aho_corasick::AhoCorasick;
+///
+/// let patterns = &["fox", "brown", "quick"];
+/// let haystack = "The quick brown fox.";
+/// let replace_with = &["sloth", "grey", "slow"];
+///
+/// let ac = AhoCorasick::new(patterns);
+/// let result = ac.replace_all(haystack, replace_with);
+/// assert_eq!(result, "The slow grey sloth.");
+/// ```
+#[derive(Clone, Debug)]
+pub struct AhoCorasick<S: StateID = usize> {
+ imp: Imp<S>,
+ match_kind: MatchKind,
+}
+
+impl AhoCorasick {
+ /// Create a new Aho-Corasick automaton using the default configuration.
+ ///
+ /// The default configuration optimizes for less space usage, but at the
+ /// expense of longer search times. To change the configuration, use
+ /// [`AhoCorasickBuilder`](struct.AhoCorasickBuilder.html)
+ /// for fine-grained control, or
+ /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured)
+ /// for automatic configuration if you aren't sure which settings to pick.
+ ///
+ /// This uses the default
+ /// [`MatchKind::Standard`](enum.MatchKind.html#variant.Standard)
+ /// match semantics, which reports a match as soon as it is found. This
+ /// corresponds to the standard match semantics supported by textbook
+ /// descriptions of the Aho-Corasick algorithm.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasick;
+ ///
+ /// let ac = AhoCorasick::new(&[
+ /// "foo", "bar", "baz",
+ /// ]);
+ /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
+ /// ```
+ pub fn new<I, P>(patterns: I) -> AhoCorasick
+ where
+ I: IntoIterator<Item = P>,
+ P: AsRef<[u8]>,
+ {
+ AhoCorasickBuilder::new().build(patterns)
+ }
+
+ /// Build an Aho-Corasick automaton with an automatically determined
+ /// configuration.
+ ///
+ /// Specifically, this requires a slice of patterns instead of an iterator
+ /// since the configuration is determined by looking at the patterns before
+ /// constructing the automaton. The idea here is to balance space and time
+ /// automatically. That is, when searching a small number of patterns, this
+ /// will attempt to use the fastest possible configuration since the total
+ /// space required will be small anyway. As the number of patterns grows,
+ /// this will fall back to slower configurations that use less space.
+ ///
+ /// If you want auto configuration but with match semantics different from
+ /// the default `MatchKind::Standard`, then use
+ /// [`AhoCorasickBuilder::auto_configure`](struct.AhoCorasickBuilder.html#method.auto_configure).
+ ///
+ /// # Examples
+ ///
+ /// Basic usage is just like `new`, except you must provide a slice:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasick;
+ ///
+ /// let ac = AhoCorasick::new_auto_configured(&[
+ /// "foo", "bar", "baz",
+ /// ]);
+ /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
+ /// ```
+ pub fn new_auto_configured<B>(patterns: &[B]) -> AhoCorasick
+ where
+ B: AsRef<[u8]>,
+ {
+ AhoCorasickBuilder::new().auto_configure(patterns).build(patterns)
+ }
+}
+
+impl<S: StateID> AhoCorasick<S> {
+ /// Returns true if and only if this automaton matches the haystack at any
+ /// position.
+ ///
+ /// `haystack` may be any type that is cheaply convertible to a `&[u8]`.
+ /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and
+ /// `&[u8]` itself.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasick;
+ ///
+ /// let ac = AhoCorasick::new(&[
+ /// "foo", "bar", "quux", "baz",
+ /// ]);
+ /// assert!(ac.is_match("xxx bar xxx"));
+ /// assert!(!ac.is_match("xxx qux xxx"));
+ /// ```
+ pub fn is_match<B: AsRef<[u8]>>(&self, haystack: B) -> bool {
+ self.earliest_find(haystack).is_some()
+ }
+
+ /// Returns the location of the first detected match in `haystack`.
+ ///
+ /// This method has the same behavior regardless of the
+ /// [`MatchKind`](enum.MatchKind.html)
+ /// of this automaton.
+ ///
+ /// `haystack` may be any type that is cheaply convertible to a `&[u8]`.
+ /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and
+ /// `&[u8]` itself.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasick;
+ ///
+ /// let ac = AhoCorasick::new(&[
+ /// "abc", "b",
+ /// ]);
+ /// let mat = ac.earliest_find("abcd").expect("should have match");
+ /// assert_eq!(1, mat.pattern());
+ /// assert_eq!((1, 2), (mat.start(), mat.end()));
+ /// ```
+ pub fn earliest_find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match> {
+ let mut prestate = PrefilterState::new(self.max_pattern_len());
+ let mut start = self.imp.start_state();
+ self.imp.earliest_find_at(
+ &mut prestate,
+ haystack.as_ref(),
+ 0,
+ &mut start,
+ )
+ }
+
+ /// Returns the location of the first match according to the match
+ /// semantics that this automaton was constructed with.
+ ///
+ /// When using `MatchKind::Standard`, this corresponds precisely to the
+ /// same behavior as
+ /// [`earliest_find`](struct.AhoCorasick.html#method.earliest_find).
+ /// Otherwise, match semantics correspond to either
+ /// [leftmost-first](enum.MatchKind.html#variant.LeftmostFirst)
+ /// or
+ /// [leftmost-longest](enum.MatchKind.html#variant.LeftmostLongest).
+ ///
+ /// `haystack` may be any type that is cheaply convertible to a `&[u8]`.
+ /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and
+ /// `&[u8]` itself.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage, with standard semantics:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
+ ///
+ /// let patterns = &["b", "abc", "abcd"];
+ /// let haystack = "abcd";
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .match_kind(MatchKind::Standard) // default, not necessary
+ /// .build(patterns);
+ /// let mat = ac.find(haystack).expect("should have a match");
+ /// assert_eq!("b", &haystack[mat.start()..mat.end()]);
+ /// ```
+ ///
+ /// Now with leftmost-first semantics:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
+ ///
+ /// let patterns = &["b", "abc", "abcd"];
+ /// let haystack = "abcd";
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(patterns);
+ /// let mat = ac.find(haystack).expect("should have a match");
+ /// assert_eq!("abc", &haystack[mat.start()..mat.end()]);
+ /// ```
+ ///
+ /// And finally, leftmost-longest semantics:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
+ ///
+ /// let patterns = &["b", "abc", "abcd"];
+ /// let haystack = "abcd";
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .match_kind(MatchKind::LeftmostLongest)
+ /// .build(patterns);
+ /// let mat = ac.find(haystack).expect("should have a match");
+ /// assert_eq!("abcd", &haystack[mat.start()..mat.end()]);
+ /// ```
+ pub fn find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match> {
+ let mut prestate = PrefilterState::new(self.max_pattern_len());
+ self.imp.find_at_no_state(&mut prestate, haystack.as_ref(), 0)
+ }
+
+ /// Returns an iterator of non-overlapping matches, using the match
+ /// semantics that this automaton was constructed with.
+ ///
+ /// `haystack` may be any type that is cheaply convertible to a `&[u8]`.
+ /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and
+ /// `&[u8]` itself.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage, with standard semantics:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
+ ///
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = "append the app to the appendage";
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .match_kind(MatchKind::Standard) // default, not necessary
+ /// .build(patterns);
+ /// let matches: Vec<usize> = ac
+ /// .find_iter(haystack)
+ /// .map(|mat| mat.pattern())
+ /// .collect();
+ /// assert_eq!(vec![2, 2, 2], matches);
+ /// ```
+ ///
+ /// Now with leftmost-first semantics:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
+ ///
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = "append the app to the appendage";
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(patterns);
+ /// let matches: Vec<usize> = ac
+ /// .find_iter(haystack)
+ /// .map(|mat| mat.pattern())
+ /// .collect();
+ /// assert_eq!(vec![0, 2, 0], matches);
+ /// ```
+ ///
+ /// And finally, leftmost-longest semantics:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
+ ///
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = "append the app to the appendage";
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .match_kind(MatchKind::LeftmostLongest)
+ /// .build(patterns);
+ /// let matches: Vec<usize> = ac
+ /// .find_iter(haystack)
+ /// .map(|mat| mat.pattern())
+ /// .collect();
+ /// assert_eq!(vec![0, 2, 1], matches);
+ /// ```
+ pub fn find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>(
+ &'a self,
+ haystack: &'b B,
+ ) -> FindIter<'a, 'b, S> {
+ FindIter::new(self, haystack.as_ref())
+ }
+
+ /// Returns an iterator of overlapping matches in the given `haystack`.
+ ///
+ /// Overlapping matches can _only_ be detected using
+ /// `MatchKind::Standard` semantics. If this automaton was constructed with
+ /// leftmost semantics, then this method will panic. To determine whether
+ /// this will panic at runtime, use the
+ /// [`AhoCorasick::supports_overlapping`](struct.AhoCorasick.html#method.supports_overlapping)
+ /// method.
+ ///
+ /// `haystack` may be any type that is cheaply convertible to a `&[u8]`.
+ /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and
+ /// `&[u8]` itself.
+ ///
+ /// # Panics
+ ///
+ /// This panics when `AhoCorasick::supports_overlapping` returns `false`.
+ /// That is, this panics when this automaton's match semantics are not
+ /// `MatchKind::Standard`.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage, with standard semantics:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasick;
+ ///
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = "append the app to the appendage";
+ ///
+ /// let ac = AhoCorasick::new(patterns);
+ /// let matches: Vec<usize> = ac
+ /// .find_overlapping_iter(haystack)
+ /// .map(|mat| mat.pattern())
+ /// .collect();
+ /// assert_eq!(vec![2, 0, 2, 2, 0, 1], matches);
+ /// ```
+ pub fn find_overlapping_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>(
+ &'a self,
+ haystack: &'b B,
+ ) -> FindOverlappingIter<'a, 'b, S> {
+ FindOverlappingIter::new(self, haystack.as_ref())
+ }
+
+ /// Replace all matches with a corresponding value in the `replace_with`
+ /// slice given. Matches correspond to the same matches as reported by
+ /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
+ ///
+ /// Replacements are determined by the index of the matching pattern.
+ /// For example, if the pattern with index `2` is found, then it is
+ /// replaced by `replace_with[2]`.
+ ///
+ /// # Panics
+ ///
+ /// This panics when `replace_with.len()` does not equal the total number
+ /// of patterns that are matched by this automaton.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
+ ///
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = "append the app to the appendage";
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(patterns);
+ /// let result = ac.replace_all(haystack, &["x", "y", "z"]);
+ /// assert_eq!("x the z to the xage", result);
+ /// ```
+ pub fn replace_all<B>(&self, haystack: &str, replace_with: &[B]) -> String
+ where
+ B: AsRef<str>,
+ {
+ assert_eq!(
+ replace_with.len(),
+ self.pattern_count(),
+ "replace_all requires a replacement for every pattern \
+ in the automaton"
+ );
+ let mut dst = String::with_capacity(haystack.len());
+ self.replace_all_with(haystack, &mut dst, |mat, _, dst| {
+ dst.push_str(replace_with[mat.pattern()].as_ref());
+ true
+ });
+ dst
+ }
+
+ /// Replace all matches using raw bytes with a corresponding value in the
+ /// `replace_with` slice given. Matches correspond to the same matches as
+ /// reported by [`find_iter`](struct.AhoCorasick.html#method.find_iter).
+ ///
+ /// Replacements are determined by the index of the matching pattern.
+ /// For example, if the pattern with index `2` is found, then it is
+ /// replaced by `replace_with[2]`.
+ ///
+ /// # Panics
+ ///
+ /// This panics when `replace_with.len()` does not equal the total number
+ /// of patterns that are matched by this automaton.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
+ ///
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = b"append the app to the appendage";
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(patterns);
+ /// let result = ac.replace_all_bytes(haystack, &["x", "y", "z"]);
+ /// assert_eq!(b"x the z to the xage".to_vec(), result);
+ /// ```
+ pub fn replace_all_bytes<B>(
+ &self,
+ haystack: &[u8],
+ replace_with: &[B],
+ ) -> Vec<u8>
+ where
+ B: AsRef<[u8]>,
+ {
+ assert_eq!(
+ replace_with.len(),
+ self.pattern_count(),
+ "replace_all_bytes requires a replacement for every pattern \
+ in the automaton"
+ );
+ let mut dst = Vec::with_capacity(haystack.len());
+ self.replace_all_with_bytes(haystack, &mut dst, |mat, _, dst| {
+ dst.extend(replace_with[mat.pattern()].as_ref());
+ true
+ });
+ dst
+ }
+
+ /// Replace all matches using a closure called on each match.
+ /// Matches correspond to the same matches as reported by
+ /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
+ ///
+ /// The closure accepts three parameters: the match found, the text of
+ /// the match and a string buffer with which to write the replaced text
+ /// (if any). If the closure returns `true`, then it continues to the next
+ /// match. If the closure returns `false`, then searching is stopped.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
+ ///
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = "append the app to the appendage";
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(patterns);
+ /// let mut result = String::new();
+ /// ac.replace_all_with(haystack, &mut result, |mat, _, dst| {
+ /// dst.push_str(&mat.pattern().to_string());
+ /// true
+ /// });
+ /// assert_eq!("0 the 2 to the 0age", result);
+ /// ```
+ pub fn replace_all_with<F>(
+ &self,
+ haystack: &str,
+ dst: &mut String,
+ mut replace_with: F,
+ ) where
+ F: FnMut(&Match, &str, &mut String) -> bool,
+ {
+ let mut last_match = 0;
+ for mat in self.find_iter(haystack) {
+ dst.push_str(&haystack[last_match..mat.start()]);
+ last_match = mat.end();
+ // Honor the documented contract: if the closure returns `false`,
+ // stop searching and copy the rest of the haystack unchanged.
+ if !replace_with(&mat, &haystack[mat.start()..mat.end()], dst) {
+ break;
+ }
+ }
+ dst.push_str(&haystack[last_match..]);
+ }
+
+ /// Replace all matches using raw bytes with a closure called on each
+ /// match. Matches correspond to the same matches as reported by
+ /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
+ ///
+ /// The closure accepts three parameters: the match found, the text of
+ /// the match and a byte buffer with which to write the replaced text
+ /// (if any). If the closure returns `true`, then it continues to the next
+ /// match. If the closure returns `false`, then searching is stopped.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
+ ///
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = b"append the app to the appendage";
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(patterns);
+ /// let mut result = vec![];
+ /// ac.replace_all_with_bytes(haystack, &mut result, |mat, _, dst| {
+ /// dst.extend(mat.pattern().to_string().bytes());
+ /// true
+ /// });
+ /// assert_eq!(b"0 the 2 to the 0age".to_vec(), result);
+ /// ```
+ pub fn replace_all_with_bytes<F>(
+ &self,
+ haystack: &[u8],
+ dst: &mut Vec<u8>,
+ mut replace_with: F,
+ ) where
+ F: FnMut(&Match, &[u8], &mut Vec<u8>) -> bool,
+ {
+ let mut last_match = 0;
+ for mat in self.find_iter(haystack) {
+ dst.extend(&haystack[last_match..mat.start()]);
+ last_match = mat.end();
+ // Honor the documented contract: if the closure returns `false`,
+ // stop searching and copy the rest of the haystack unchanged.
+ if !replace_with(&mat, &haystack[mat.start()..mat.end()], dst) {
+ break;
+ }
+ }
+ dst.extend(&haystack[last_match..]);
+ }
+
+ /// Returns an iterator of non-overlapping matches in the given
+ /// stream. Matches correspond to the same matches as reported by
+ /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
+ ///
+ /// The matches yielded by this iterator use absolute position offsets in
+ /// the stream given, where the first byte has index `0`. Matches are
+ /// yielded until the stream is exhausted.
+ ///
+ /// Each item yielded by the iterator is an `io::Result<Match>`, where an
+ /// error is yielded if there was a problem reading from the reader given.
+ ///
+ /// When searching a stream, an internal buffer is used. Therefore, callers
+ /// should avoid providing a buffered reader, if possible.
+ ///
+ /// Searching a stream requires that the automaton was built with
+ /// `MatchKind::Standard` semantics. If this automaton was constructed
+ /// with leftmost semantics, then this method will panic. To determine
+ /// whether this will panic at runtime, use the
+ /// [`AhoCorasick::supports_stream`](struct.AhoCorasick.html#method.supports_stream)
+ /// method.
+ ///
+ /// # Memory usage
+ ///
+ /// In general, searching streams will use a constant amount of memory for
+ /// its internal buffer. The one requirement is that the internal buffer
+ /// must be at least the size of the longest possible match. In most use
+ /// cases, the default buffer size will be much larger than any individual
+ /// match.
+ ///
+ /// # Panics
+ ///
+ /// This panics when `AhoCorasick::supports_stream` returns `false`.
+ /// That is, this panics when this automaton's match semantics are not
+ /// `MatchKind::Standard`. This restriction may be lifted in the future.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasick;
+ ///
+ /// # fn example() -> Result<(), ::std::io::Error> {
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = "append the app to the appendage";
+ ///
+ /// let ac = AhoCorasick::new(patterns);
+ /// let mut matches = vec![];
+ /// for result in ac.stream_find_iter(haystack.as_bytes()) {
+ /// let mat = result?;
+ /// matches.push(mat.pattern());
+ /// }
+ /// assert_eq!(vec![2, 2, 2], matches);
+ /// # Ok(()) }; example().unwrap()
+ /// ```
+ pub fn stream_find_iter<'a, R: io::Read>(
+ &'a self,
+ rdr: R,
+ ) -> StreamFindIter<'a, R, S> {
+ StreamFindIter::new(self, rdr)
+ }
+
+ /// Search for and replace all matches of this automaton in
+ /// the given reader, and write the replacements to the given
+ /// writer. Matches correspond to the same matches as reported by
+ /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
+ ///
+ /// Replacements are determined by the index of the matching pattern.
+ /// For example, if the pattern with index `2` is found, then it is
+ /// replaced by `replace_with[2]`.
+ ///
+ /// After all matches are replaced, the writer is _not_ flushed.
+ ///
+ /// If there was a problem reading from the given reader or writing to the
+ /// given writer, then the corresponding `io::Error` is returned and all
+ /// replacement is stopped.
+ ///
+ /// When searching a stream, an internal buffer is used. Therefore, callers
+ /// should avoid providing a buffered reader, if possible. However,
+ /// callers may want to provide a buffered writer.
+ ///
+ /// Searching a stream requires that the automaton was built with
+ /// `MatchKind::Standard` semantics. If this automaton was constructed
+ /// with leftmost semantics, then this method will panic. To determine
+ /// whether this will panic at runtime, use the
+ /// [`AhoCorasick::supports_stream`](struct.AhoCorasick.html#method.supports_stream)
+ /// method.
+ ///
+ /// # Memory usage
+ ///
+ /// In general, searching streams will use a constant amount of memory for
+ /// its internal buffer. The one requirement is that the internal buffer
+ /// must be at least the size of the longest possible match. In most use
+ /// cases, the default buffer size will be much larger than any individual
+ /// match.
+ ///
+ /// # Panics
+ ///
+ /// This panics when `AhoCorasick::supports_stream` returns `false`.
+ /// That is, this panics when this automaton's match semantics are not
+ /// `MatchKind::Standard`. This restriction may be lifted in the future.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasick;
+ ///
+ /// # fn example() -> Result<(), ::std::io::Error> {
+ /// let patterns = &["fox", "brown", "quick"];
+ /// let haystack = "The quick brown fox.";
+ /// let replace_with = &["sloth", "grey", "slow"];
+ ///
+ /// let ac = AhoCorasick::new(patterns);
+ /// let mut result = vec![];
+ /// ac.stream_replace_all(haystack.as_bytes(), &mut result, replace_with)?;
+ /// assert_eq!(b"The slow grey sloth.".to_vec(), result);
+ /// # Ok(()) }; example().unwrap()
+ /// ```
+ pub fn stream_replace_all<R, W, B>(
+ &self,
+ rdr: R,
+ wtr: W,
+ replace_with: &[B],
+ ) -> io::Result<()>
+ where
+ R: io::Read,
+ W: io::Write,
+ B: AsRef<[u8]>,
+ {
+ assert_eq!(
+ replace_with.len(),
+ self.pattern_count(),
+ "stream_replace_all requires a replacement for every pattern \
+ in the automaton"
+ );
+ self.stream_replace_all_with(rdr, wtr, |mat, _, wtr| {
+ wtr.write_all(replace_with[mat.pattern()].as_ref())
+ })
+ }
+
+ /// Search the given reader and replace all matches of this automaton
+ /// using the given closure. The result is written to the given
+ /// writer. Matches correspond to the same matches as reported by
+ /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
+ ///
+ /// The closure accepts three parameters: the match found, the text of
+ /// the match and the writer with which to write the replaced text
+ /// (if any). If the closure returns `true`, then it continues to the next
+ /// match. If the closure returns `false`, then searching is stopped.
+ ///
+ /// After all matches are replaced, the writer is _not_ flushed.
+ ///
+ /// If there was a problem reading from the given reader or writing to the
+ /// given writer, then the corresponding `io::Error` is returned and all
+ /// replacement is stopped.
+ ///
+ /// When searching a stream, an internal buffer is used. Therefore, callers
+ /// should avoid providing a buffered reader, if possible. However,
+ /// callers may want to provide a buffered writer.
+ ///
+ /// Searching a stream requires that the automaton was built with
+ /// `MatchKind::Standard` semantics. If this automaton was constructed
+ /// with leftmost semantics, then this method will panic. To determine
+ /// whether this will panic at runtime, use the
+ /// [`AhoCorasick::supports_stream`](struct.AhoCorasick.html#method.supports_stream)
+ /// method.
+ ///
+ /// # Memory usage
+ ///
+ /// In general, searching streams will use a constant amount of memory for
+ /// its internal buffer. The one requirement is that the internal buffer
+ /// must be at least the size of the longest possible match. In most use
+ /// cases, the default buffer size will be much larger than any individual
+ /// match.
+ ///
+ /// # Panics
+ ///
+ /// This panics when `AhoCorasick::supports_stream` returns `false`.
+ /// That is, this panics when this automaton's match semantics are not
+ /// `MatchKind::Standard`. This restriction may be lifted in the future.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use std::io::Write;
+ /// use aho_corasick::AhoCorasick;
+ ///
+ /// # fn example() -> Result<(), ::std::io::Error> {
+ /// let patterns = &["fox", "brown", "quick"];
+ /// let haystack = "The quick brown fox.";
+ ///
+ /// let ac = AhoCorasick::new(patterns);
+ /// let mut result = vec![];
+ /// ac.stream_replace_all_with(
+ /// haystack.as_bytes(),
+ /// &mut result,
+ /// |mat, _, wtr| {
+ /// wtr.write_all(mat.pattern().to_string().as_bytes())
+ /// },
+ /// )?;
+ /// assert_eq!(b"The 2 1 0.".to_vec(), result);
+ /// # Ok(()) }; example().unwrap()
+ /// ```
+ pub fn stream_replace_all_with<R, W, F>(
+ &self,
+ rdr: R,
+ mut wtr: W,
+ mut replace_with: F,
+ ) -> io::Result<()>
+ where
+ R: io::Read,
+ W: io::Write,
+ F: FnMut(&Match, &[u8], &mut W) -> io::Result<()>,
+ {
+ let mut it = StreamChunkIter::new(self, rdr);
+ while let Some(result) = it.next() {
+ let chunk = result?;
+ match chunk {
+ StreamChunk::NonMatch { bytes, .. } => {
+ wtr.write_all(bytes)?;
+ }
+ StreamChunk::Match { bytes, mat } => {
+ replace_with(&mat, bytes, &mut wtr)?;
+ }
+ }
+ }
+ Ok(())
+ }
+
+ /// Returns the match kind used by this automaton.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, MatchKind};
+ ///
+ /// let ac = AhoCorasick::new(&[
+ /// "foo", "bar", "quux", "baz",
+ /// ]);
+ /// assert_eq!(&MatchKind::Standard, ac.match_kind());
+ /// ```
+ pub fn match_kind(&self) -> &MatchKind {
+ self.imp.match_kind()
+ }
+
+ /// Returns the length of the longest pattern matched by this automaton.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasick;
+ ///
+ /// let ac = AhoCorasick::new(&[
+ /// "foo", "bar", "quux", "baz",
+ /// ]);
+ /// assert_eq!(4, ac.max_pattern_len());
+ /// ```
+ pub fn max_pattern_len(&self) -> usize {
+ self.imp.max_pattern_len()
+ }
+
+ /// Return the total number of patterns matched by this automaton.
+ ///
+ /// This includes patterns that may never participate in a match. For
+ /// example, if
+ /// [`MatchKind::LeftmostFirst`](enum.MatchKind.html#variant.LeftmostFirst)
+ /// match semantics are used, and the patterns `Sam` and `Samwise` were
+ /// used to build the automaton, then `Samwise` can never participate in a
+ /// match because `Sam` will always take priority.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasick;
+ ///
+ /// let ac = AhoCorasick::new(&[
+ /// "foo", "bar", "baz",
+ /// ]);
+ /// assert_eq!(3, ac.pattern_count());
+ /// ```
+ pub fn pattern_count(&self) -> usize {
+ self.imp.pattern_count()
+ }
+
+ /// Returns true if and only if this automaton supports reporting
+ /// overlapping matches.
+ ///
+ /// If this returns false and overlapping matches are requested, then it
+ /// will result in a panic.
+ ///
+ /// Since leftmost matching is inherently incompatible with overlapping
+ /// matches, only
+ /// [`MatchKind::Standard`](enum.MatchKind.html#variant.Standard)
+ /// supports overlapping matches. This is unlikely to change in the future.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .match_kind(MatchKind::Standard)
+ /// .build(&["foo", "bar", "baz"]);
+ /// assert!(ac.supports_overlapping());
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(&["foo", "bar", "baz"]);
+ /// assert!(!ac.supports_overlapping());
+ /// ```
+ pub fn supports_overlapping(&self) -> bool {
+ self.match_kind.supports_overlapping()
+ }
+
+ /// Returns true if and only if this automaton supports stream searching.
+ ///
+ /// If this returns false and stream searching (or replacing) is attempted,
+ /// then it will result in a panic.
+ ///
+ /// Currently, only
+ /// [`MatchKind::Standard`](enum.MatchKind.html#variant.Standard)
+ /// supports streaming. This may be expanded in the future.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .match_kind(MatchKind::Standard)
+ /// .build(&["foo", "bar", "baz"]);
+ /// assert!(ac.supports_stream());
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(&["foo", "bar", "baz"]);
+ /// assert!(!ac.supports_stream());
+ /// ```
+ pub fn supports_stream(&self) -> bool {
+ self.match_kind.supports_stream()
+ }
+
+ /// Returns the approximate total amount of heap used by this automaton, in
+ /// units of bytes.
+ ///
+ /// # Examples
+ ///
+ /// This example shows the difference in heap usage between a few
+ /// configurations:
+ ///
+ /// ```ignore
+ /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .dfa(false) // default
+ /// .build(&["foo", "bar", "baz"]);
+ /// assert_eq!(10_336, ac.heap_bytes());
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .dfa(false) // default
+ /// .ascii_case_insensitive(true)
+ /// .build(&["foo", "bar", "baz"]);
+ /// assert_eq!(10_384, ac.heap_bytes());
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .dfa(true)
+ /// .byte_classes(false)
+ /// .build(&["foo", "bar", "baz"]);
+ /// assert_eq!(20_768, ac.heap_bytes());
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .dfa(true)
+ /// .byte_classes(true) // default
+ /// .build(&["foo", "bar", "baz"]);
+ /// assert_eq!(1_248, ac.heap_bytes());
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .dfa(true)
+ /// .ascii_case_insensitive(true)
+ /// .build(&["foo", "bar", "baz"]);
+ /// assert_eq!(1_248, ac.heap_bytes());
+ /// ```
+ pub fn heap_bytes(&self) -> usize {
+ match self.imp {
+ Imp::NFA(ref nfa) => nfa.heap_bytes(),
+ Imp::DFA(ref dfa) => dfa.heap_bytes(),
+ }
+ }
+}
+
+/// The internal implementation of Aho-Corasick, which is either an NFA or
+/// a DFA. The NFA is slower but uses less memory. The DFA is faster but uses
+/// more memory.
+#[derive(Clone, Debug)]
+enum Imp<S: StateID> {
+ NFA(NFA<S>),
+ DFA(DFA<S>),
+}
+
+impl<S: StateID> Imp<S> {
+ /// Returns the type of match semantics implemented by this automaton.
+ fn match_kind(&self) -> &MatchKind {
+ match *self {
+ Imp::NFA(ref nfa) => nfa.match_kind(),
+ Imp::DFA(ref dfa) => dfa.match_kind(),
+ }
+ }
+
+ /// Returns the identifier of the start state.
+ fn start_state(&self) -> S {
+ match *self {
+ Imp::NFA(ref nfa) => nfa.start_state(),
+ Imp::DFA(ref dfa) => dfa.start_state(),
+ }
+ }
+
+ /// The length, in bytes, of the longest pattern in this automaton. This
+ /// information is useful for maintaining correct buffer sizes when
+ /// searching on streams.
+ fn max_pattern_len(&self) -> usize {
+ match *self {
+ Imp::NFA(ref nfa) => nfa.max_pattern_len(),
+ Imp::DFA(ref dfa) => dfa.max_pattern_len(),
+ }
+ }
+
+ /// The total number of patterns added to this automaton. This includes
+ /// patterns that may never match. The maximum pattern identifier that can
+ /// be reported in a match is exactly one less than this number.
+ fn pattern_count(&self) -> usize {
+ match *self {
+ Imp::NFA(ref nfa) => nfa.pattern_count(),
+ Imp::DFA(ref dfa) => dfa.pattern_count(),
+ }
+ }
+
+ #[inline(always)]
+ fn overlapping_find_at(
+ &self,
+ prestate: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ state_id: &mut S,
+ match_index: &mut usize,
+ ) -> Option<Match> {
+ match *self {
+ Imp::NFA(ref nfa) => nfa.overlapping_find_at(
+ prestate,
+ haystack,
+ at,
+ state_id,
+ match_index,
+ ),
+ Imp::DFA(ref dfa) => dfa.overlapping_find_at(
+ prestate,
+ haystack,
+ at,
+ state_id,
+ match_index,
+ ),
+ }
+ }
+
+ #[inline(always)]
+ fn earliest_find_at(
+ &self,
+ prestate: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ state_id: &mut S,
+ ) -> Option<Match> {
+ match *self {
+ Imp::NFA(ref nfa) => {
+ nfa.earliest_find_at(prestate, haystack, at, state_id)
+ }
+ Imp::DFA(ref dfa) => {
+ dfa.earliest_find_at(prestate, haystack, at, state_id)
+ }
+ }
+ }
+
+ #[inline(always)]
+ fn find_at_no_state(
+ &self,
+ prestate: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ ) -> Option<Match> {
+ match *self {
+ Imp::NFA(ref nfa) => nfa.find_at_no_state(prestate, haystack, at),
+ Imp::DFA(ref dfa) => dfa.find_at_no_state(prestate, haystack, at),
+ }
+ }
+}
+
+/// An iterator of non-overlapping matches in a particular haystack.
+///
+/// This iterator yields matches according to the
+/// [`MatchKind`](enum.MatchKind.html)
+/// used by this automaton.
+///
+/// This iterator is constructed via the
+/// [`AhoCorasick::find_iter`](struct.AhoCorasick.html#method.find_iter)
+/// method.
+///
+/// The type variable `S` refers to the representation used for state
+/// identifiers. (By default, this is `usize`.)
+///
+/// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton.
+///
+/// The lifetime `'b` refers to the lifetime of the haystack being searched.
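+///
+/// # Examples
+///
+/// An illustrative sketch (the patterns and haystack below are arbitrary)
+/// showing how match offsets can be collected from this iterator:
+///
+/// ```
+/// use aho_corasick::AhoCorasick;
+///
+/// let ac = AhoCorasick::new(&["b", "cd"]);
+/// let spans: Vec<(usize, usize)> = ac
+/// .find_iter("abcde")
+/// .map(|mat| (mat.start(), mat.end()))
+/// .collect();
+/// assert_eq!(vec![(1, 2), (2, 4)], spans);
+/// ```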
+#[derive(Debug)]
+pub struct FindIter<'a, 'b, S: 'a + StateID> {
+ fsm: &'a Imp<S>,
+ prestate: PrefilterState,
+ haystack: &'b [u8],
+ pos: usize,
+}
+
+impl<'a, 'b, S: StateID> FindIter<'a, 'b, S> {
+ fn new(ac: &'a AhoCorasick<S>, haystack: &'b [u8]) -> FindIter<'a, 'b, S> {
+ let prestate = PrefilterState::new(ac.max_pattern_len());
+ FindIter { fsm: &ac.imp, prestate, haystack, pos: 0 }
+ }
+}
+
+impl<'a, 'b, S: StateID> Iterator for FindIter<'a, 'b, S> {
+ type Item = Match;
+
+ fn next(&mut self) -> Option<Match> {
+ if self.pos > self.haystack.len() {
+ return None;
+ }
+ let result = self.fsm.find_at_no_state(
+ &mut self.prestate,
+ self.haystack,
+ self.pos,
+ );
+ let mat = match result {
+ None => return None,
+ Some(mat) => mat,
+ };
+ if mat.end() == self.pos {
+ // If the automaton can match the empty string and if we found an
+ // empty match, then we need to forcefully move the position.
+ self.pos += 1;
+ } else {
+ self.pos = mat.end();
+ }
+ Some(mat)
+ }
+}
+
+/// An iterator of overlapping matches in a particular haystack.
+///
+/// This iterator will report all possible matches in a particular haystack,
+/// even when the matches overlap.
+///
+/// This iterator is constructed via the
+/// [`AhoCorasick::find_overlapping_iter`](struct.AhoCorasick.html#method.find_overlapping_iter)
+/// method.
+///
+/// The type variable `S` refers to the representation used for state
+/// identifiers. (By default, this is `usize`.)
+///
+/// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton.
+///
+/// The lifetime `'b` refers to the lifetime of the haystack being searched.
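+///
+/// # Examples
+///
+/// An illustrative sketch (patterns and haystack chosen arbitrarily) showing
+/// that overlapping matches over the same region are all reported:
+///
+/// ```
+/// use aho_corasick::AhoCorasick;
+///
+/// let ac = AhoCorasick::new(&["app", "append"]);
+/// let spans: Vec<(usize, usize)> = ac
+/// .find_overlapping_iter("append")
+/// .map(|mat| (mat.start(), mat.end()))
+/// .collect();
+/// assert_eq!(vec![(0, 3), (0, 6)], spans);
+/// ```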
+#[derive(Debug)]
+pub struct FindOverlappingIter<'a, 'b, S: 'a + StateID> {
+ fsm: &'a Imp<S>,
+ prestate: PrefilterState,
+ haystack: &'b [u8],
+ pos: usize,
+ last_match_end: usize,
+ state_id: S,
+ match_index: usize,
+}
+
+impl<'a, 'b, S: StateID> FindOverlappingIter<'a, 'b, S> {
+ fn new(
+ ac: &'a AhoCorasick<S>,
+ haystack: &'b [u8],
+ ) -> FindOverlappingIter<'a, 'b, S> {
+ assert!(
+ ac.supports_overlapping(),
+ "automaton does not support overlapping searches"
+ );
+ let prestate = PrefilterState::new(ac.max_pattern_len());
+ FindOverlappingIter {
+ fsm: &ac.imp,
+ prestate,
+ haystack,
+ pos: 0,
+ last_match_end: 0,
+ state_id: ac.imp.start_state(),
+ match_index: 0,
+ }
+ }
+}
+
+impl<'a, 'b, S: StateID> Iterator for FindOverlappingIter<'a, 'b, S> {
+ type Item = Match;
+
+ fn next(&mut self) -> Option<Match> {
+ let result = self.fsm.overlapping_find_at(
+ &mut self.prestate,
+ self.haystack,
+ self.pos,
+ &mut self.state_id,
+ &mut self.match_index,
+ );
+ match result {
+ None => return None,
+ Some(m) => {
+ self.pos = m.end();
+ Some(m)
+ }
+ }
+ }
+}
+
+/// An iterator that reports Aho-Corasick matches in a stream.
+///
+/// This iterator yields elements of type `io::Result<Match>`, where an error
+/// is reported if there was a problem reading from the underlying stream.
+/// The iterator terminates only when the underlying stream reaches `EOF`.
+///
+/// This iterator is constructed via the
+/// [`AhoCorasick::stream_find_iter`](struct.AhoCorasick.html#method.stream_find_iter)
+/// method.
+///
+/// The type variable `R` refers to the `io::Read` stream that is being read
+/// from.
+///
+/// The type variable `S` refers to the representation used for state
+/// identifiers. (By default, this is `usize`.)
+///
+/// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton.
+#[derive(Debug)]
+pub struct StreamFindIter<'a, R, S: 'a + StateID> {
+ it: StreamChunkIter<'a, R, S>,
+}
+
+impl<'a, R: io::Read, S: StateID> StreamFindIter<'a, R, S> {
+ fn new(ac: &'a AhoCorasick<S>, rdr: R) -> StreamFindIter<'a, R, S> {
+ StreamFindIter { it: StreamChunkIter::new(ac, rdr) }
+ }
+}
+
+impl<'a, R: io::Read, S: StateID> Iterator for StreamFindIter<'a, R, S> {
+ type Item = io::Result<Match>;
+
+ fn next(&mut self) -> Option<io::Result<Match>> {
+ loop {
+ match self.it.next() {
+ None => return None,
+ Some(Err(err)) => return Some(Err(err)),
+ Some(Ok(StreamChunk::NonMatch { .. })) => {}
+ Some(Ok(StreamChunk::Match { mat, .. })) => {
+ return Some(Ok(mat));
+ }
+ }
+ }
+ }
+}
+
+/// An iterator over chunks in an underlying reader. Each chunk either
+/// corresponds to non-matching bytes or matching bytes, but all bytes from
+/// the underlying reader are reported in sequence. There may be an arbitrary
+/// number of non-matching chunks before seeing a matching chunk.
+///
+/// N.B. This does not actually implement Iterator because we need to borrow
+/// from the underlying reader. But conceptually, it's still an iterator.
+#[derive(Debug)]
+struct StreamChunkIter<'a, R, S: 'a + StateID> {
+ /// The AC automaton.
+ fsm: &'a Imp<S>,
+ /// State associated with this automaton's prefilter. It is a heuristic
+ /// for stopping the prefilter if it's deemed ineffective.
+ prestate: PrefilterState,
+ /// The source of bytes we read from.
+ rdr: R,
+ /// A fixed size buffer. This is what we actually search. There are some
+ /// invariants around the buffer's size, namely, it must be big enough to
+ /// contain the longest possible match.
+ buf: Buffer,
+ /// The ID of the FSM state we're currently in.
+ state_id: S,
+ /// The current position at which to start the next search in `buf`.
+ search_pos: usize,
+ /// The absolute position of `search_pos`, where `0` corresponds to the
+ /// position of the first byte read from `rdr`.
+ absolute_pos: usize,
+ /// The ending position of the last StreamChunk that was returned to the
+ /// caller. This position is used to determine whether we need to emit
+ /// non-matching bytes before emitting a match.
+ report_pos: usize,
+ /// A match that should be reported on the next call.
+ pending_match: Option<Match>,
+ /// Enabled only when the automaton can match the empty string. When
+ /// enabled, we need to execute one final search after consuming the
+ /// reader to find the trailing empty match.
+ has_empty_match_at_end: bool,
+}
+
+/// A single chunk yielded by the stream chunk iterator.
+///
+/// The `'r` lifetime refers to the lifetime of the stream chunk iterator.
+#[derive(Debug)]
+enum StreamChunk<'r> {
+ /// A chunk that does not contain any matches.
+ NonMatch { bytes: &'r [u8], start: usize },
+ /// A chunk that precisely contains a match.
+ Match { bytes: &'r [u8], mat: Match },
+}
+
+impl<'a, R: io::Read, S: StateID> StreamChunkIter<'a, R, S> {
+ fn new(ac: &'a AhoCorasick<S>, rdr: R) -> StreamChunkIter<'a, R, S> {
+ assert!(
+ ac.supports_stream(),
+ "stream searching is only supported for Standard match semantics"
+ );
+
+ let prestate = PrefilterState::new(ac.max_pattern_len());
+ let buf = Buffer::new(ac.imp.max_pattern_len());
+ let state_id = ac.imp.start_state();
+ StreamChunkIter {
+ fsm: &ac.imp,
+ prestate,
+ rdr,
+ buf,
+ state_id,
+ absolute_pos: 0,
+ report_pos: 0,
+ search_pos: 0,
+ pending_match: None,
+ has_empty_match_at_end: ac.is_match(""),
+ }
+ }
+
+ fn next<'r>(&'r mut self) -> Option<io::Result<StreamChunk<'r>>> {
+ loop {
+ if let Some(mut mat) = self.pending_match.take() {
+ let bytes = &self.buf.buffer()[mat.start()..mat.end()];
+ self.report_pos = mat.end();
+ mat = mat.increment(self.absolute_pos);
+ return Some(Ok(StreamChunk::Match { bytes, mat }));
+ }
+ if self.search_pos >= self.buf.len() {
+ if let Some(end) = self.unreported() {
+ let bytes = &self.buf.buffer()[self.report_pos..end];
+ let start = self.absolute_pos + self.report_pos;
+ self.report_pos = end;
+ return Some(Ok(StreamChunk::NonMatch { bytes, start }));
+ }
+ if self.buf.len() >= self.buf.min_buffer_len() {
+ // This is the point at which we roll our buffer, which we
+ // only do if our buffer has at least the minimum amount of
+ // bytes in it. Before rolling, we update our various
+ // positions to be consistent with the buffer after it has
+ // been rolled.
+
+ self.report_pos -=
+ self.buf.len() - self.buf.min_buffer_len();
+ self.absolute_pos +=
+ self.search_pos - self.buf.min_buffer_len();
+ self.search_pos = self.buf.min_buffer_len();
+ self.buf.roll();
+ }
+ match self.buf.fill(&mut self.rdr) {
+ Err(err) => return Some(Err(err)),
+ Ok(false) => {
+ // We've hit EOF, but if there are still some
+ // unreported bytes remaining, return them now.
+ if self.report_pos < self.buf.len() {
+ let bytes = &self.buf.buffer()[self.report_pos..];
+ let start = self.absolute_pos + self.report_pos;
+ self.report_pos = self.buf.len();
+
+ let chunk = StreamChunk::NonMatch { bytes, start };
+ return Some(Ok(chunk));
+ } else {
+ // We've reported everything, but there might still
+ // be a match at the very last position.
+ if !self.has_empty_match_at_end {
+ return None;
+ }
+ // fallthrough for another search to get trailing
+ // empty matches
+ self.has_empty_match_at_end = false;
+ }
+ }
+ Ok(true) => {}
+ }
+ }
+ let result = self.fsm.earliest_find_at(
+ &mut self.prestate,
+ self.buf.buffer(),
+ self.search_pos,
+ &mut self.state_id,
+ );
+ match result {
+ None => {
+ self.search_pos = self.buf.len();
+ }
+ Some(mat) => {
+ self.state_id = self.fsm.start_state();
+ if mat.end() == self.search_pos {
+ // If the automaton can match the empty string and if
+ // we found an empty match, then we need to forcefully
+ // move the position.
+ self.search_pos += 1;
+ } else {
+ self.search_pos = mat.end();
+ }
+ self.pending_match = Some(mat.clone());
+ if self.report_pos < mat.start() {
+ let bytes =
+ &self.buf.buffer()[self.report_pos..mat.start()];
+ let start = self.absolute_pos + self.report_pos;
+ self.report_pos = mat.start();
+
+ let chunk = StreamChunk::NonMatch { bytes, start };
+ return Some(Ok(chunk));
+ }
+ }
+ }
+ }
+ }
+
+ fn unreported(&self) -> Option<usize> {
+ let end = self.search_pos.saturating_sub(self.buf.min_buffer_len());
+ if self.report_pos < end {
+ Some(end)
+ } else {
+ None
+ }
+ }
+}
+
+/// A builder for configuring an Aho-Corasick automaton.
+#[derive(Clone, Debug)]
+pub struct AhoCorasickBuilder {
+ nfa_builder: nfa::Builder,
+ dfa_builder: dfa::Builder,
+ dfa: bool,
+}
+
+impl Default for AhoCorasickBuilder {
+ fn default() -> AhoCorasickBuilder {
+ AhoCorasickBuilder::new()
+ }
+}
+
+impl AhoCorasickBuilder {
+ /// Create a new builder for configuring an Aho-Corasick automaton.
+ ///
+ /// If you don't need fine grained configuration or aren't sure which knobs
+ /// to set, try using
+ /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured)
+ /// instead.
+ pub fn new() -> AhoCorasickBuilder {
+ AhoCorasickBuilder {
+ nfa_builder: nfa::Builder::new(),
+ dfa_builder: dfa::Builder::new(),
+ dfa: false,
+ }
+ }
+
+ /// Build an Aho-Corasick automaton using the configuration set on this
+ /// builder.
+ ///
+ /// A builder may be reused to create more automatons.
+ ///
+ /// This method will use the default for representing internal state
+ /// identifiers, which is `usize`. This guarantees that building the
+ /// automaton will succeed and is generally a good default, but can make
+ /// the size of the automaton 2-8 times bigger than it needs to be,
+ /// depending on your target platform.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasickBuilder;
+ ///
+ /// let patterns = &["foo", "bar", "baz"];
+ /// let ac = AhoCorasickBuilder::new()
+ /// .build(patterns);
+ /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
+ /// ```
+ pub fn build<I, P>(&self, patterns: I) -> AhoCorasick
+ where
+ I: IntoIterator<Item = P>,
+ P: AsRef<[u8]>,
+ {
+ // The builder only returns an error if the chosen state ID
+ // representation is too small to fit all of the given patterns. In
+ // this case, since we fix the representation to usize, it will always
+ // work because it's impossible to overflow usize since the underlying
+ // storage would OOM long before that happens.
+ self.build_with_size::<usize, I, P>(patterns)
+ .expect("usize state ID type should always work")
+ }
+
+ /// Build an Aho-Corasick automaton using the configuration set on this
+ /// builder with a specific state identifier representation. This only has
+ /// an effect when the `dfa` option is enabled.
+ ///
+ /// Generally, the choices for a state identifier representation are
+ /// `u8`, `u16`, `u32`, `u64` or `usize`, with `usize` being the default.
+ /// The advantage of choosing a smaller state identifier representation
+ /// is that the automaton produced will be smaller. This might be
+ /// beneficial for just generally using less space, or might even allow it
+ /// to fit more of the automaton in your CPU's cache, leading to overall
+ /// better search performance.
+ ///
+ /// Unlike the standard `build` method, this can report an error if the
+ /// state identifier representation cannot support the size of the
+ /// automaton.
+ ///
+ /// Note that the state identifier representation is determined by the
+ /// `S` type variable. This requires a type hint of some sort, either
+ /// by specifying the return type or using the turbofish, e.g.,
+ /// `build_with_size::<u16, _, _>(...)`.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, AhoCorasickBuilder};
+ ///
+ /// # fn example() -> Result<(), ::aho_corasick::Error> {
+ /// let patterns = &["foo", "bar", "baz"];
+ /// let ac: AhoCorasick<u8> = AhoCorasickBuilder::new()
+ /// .build_with_size(patterns)?;
+ /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
+ /// # Ok(()) }; example().unwrap()
+ /// ```
+ ///
+ /// Or alternatively, with turbofish:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasickBuilder;
+ ///
+ /// # fn example() -> Result<(), ::aho_corasick::Error> {
+ /// let patterns = &["foo", "bar", "baz"];
+ /// let ac = AhoCorasickBuilder::new()
+ /// .build_with_size::<u8, _, _>(patterns)?;
+ /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
+ /// # Ok(()) }; example().unwrap()
+ /// ```
+ pub fn build_with_size<S, I, P>(
+ &self,
+ patterns: I,
+ ) -> Result<AhoCorasick<S>>
+ where
+ S: StateID,
+ I: IntoIterator<Item = P>,
+ P: AsRef<[u8]>,
+ {
+ let nfa = self.nfa_builder.build(patterns)?;
+ let match_kind = nfa.match_kind().clone();
+ let imp = if self.dfa {
+ let dfa = self.dfa_builder.build(&nfa)?;
+ Imp::DFA(dfa)
+ } else {
+ Imp::NFA(nfa)
+ };
+ Ok(AhoCorasick { imp, match_kind })
+ }
+
+ /// Automatically configure the settings on this builder according to the
+ /// patterns that will be used to construct the automaton.
+ ///
+ /// The idea here is to balance space and time automatically. That is, when
+ /// searching a small number of patterns, this will attempt to use the
+ /// fastest possible configuration since the total space required will be
+ /// small anyway. As the number of patterns grows, this will fall back to
+ /// slower configurations that use less space.
+ ///
+ /// This is guaranteed to never set `match_kind`, but any other option may
+ /// be overridden.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasickBuilder;
+ ///
+ /// let patterns = &["foo", "bar", "baz"];
+ /// let ac = AhoCorasickBuilder::new()
+ /// .auto_configure(patterns)
+ /// .build(patterns);
+ /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
+ /// ```
+ pub fn auto_configure<B: AsRef<[u8]>>(
+ &mut self,
+ patterns: &[B],
+ ) -> &mut AhoCorasickBuilder {
+ // N.B. Currently we only use the length of `patterns` to make a
+ // decision here, and could therefore ask for an `ExactSizeIterator`
+ // instead. But it's conceivable that we might adapt this to look at
+ // the total number of bytes, which would require a second pass.
+ //
+ // The logic here is fairly rudimentary at the moment, but probably
+ // OK. The idea here is to use the fastest thing possible for a small
+ // number of patterns. That is, a DFA with no byte classes, since byte
+ // classes require an extra indirection for every byte searched. With a
+ // moderate number of patterns, we still want a DFA, but save on both
+ // space and compilation time by enabling byte classes. Finally, fall
+ // back to the slower but smaller NFA.
+ if patterns.len() <= 100 {
+ // N.B. Using byte classes can actually be faster by improving
+ // locality, but this only really applies for multi-megabyte
+ // automata (i.e., automata that don't fit in your CPU's cache).
+ self.dfa(true).byte_classes(false);
+ } else if patterns.len() <= 5000 {
+ self.dfa(true);
+ }
+ self
+ }
+
+ /// Set the desired match semantics.
+ ///
+ /// The default is `MatchKind::Standard`, which corresponds to the match
+ /// semantics supported by the standard textbook description of the
+ /// Aho-Corasick algorithm. Namely, matches are reported as soon as they
+ /// are found. Moreover, this is the only way to get overlapping matches
+ /// or do stream searching.
+ ///
+ /// The other kinds of match semantics that are supported are
+ /// `MatchKind::LeftmostFirst` and `MatchKind::LeftmostLongest`. The former
+ /// corresponds to the match you would get if you were to try to match
+ /// each pattern at each position in the haystack in the same order that
+ /// you give to the automaton. That is, it returns the leftmost match
+ /// corresponding to the earliest pattern given to the automaton. The latter
+ /// corresponds to finding the longest possible match among all leftmost
+ /// matches.
+ ///
+ /// For more details on match semantics, see the
+ /// [documentation for `MatchKind`](enum.MatchKind.html).
+ ///
+ /// # Examples
+ ///
+ /// In these examples, we demonstrate the differences between match
+ /// semantics for a particular set of patterns in a specific order:
+ /// `b`, `abc`, `abcd`.
+ ///
+ /// Standard semantics:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
+ ///
+ /// let patterns = &["b", "abc", "abcd"];
+ /// let haystack = "abcd";
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .match_kind(MatchKind::Standard) // default, not necessary
+ /// .build(patterns);
+ /// let mat = ac.find(haystack).expect("should have a match");
+ /// assert_eq!("b", &haystack[mat.start()..mat.end()]);
+ /// ```
+ ///
+ /// Leftmost-first semantics:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
+ ///
+ /// let patterns = &["b", "abc", "abcd"];
+ /// let haystack = "abcd";
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(patterns);
+ /// let mat = ac.find(haystack).expect("should have a match");
+ /// assert_eq!("abc", &haystack[mat.start()..mat.end()]);
+ /// ```
+ ///
+ /// Leftmost-longest semantics:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
+ ///
+ /// let patterns = &["b", "abc", "abcd"];
+ /// let haystack = "abcd";
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .match_kind(MatchKind::LeftmostLongest)
+ /// .build(patterns);
+ /// let mat = ac.find(haystack).expect("should have a match");
+ /// assert_eq!("abcd", &haystack[mat.start()..mat.end()]);
+ /// ```
+ pub fn match_kind(&mut self, kind: MatchKind) -> &mut AhoCorasickBuilder {
+ self.nfa_builder.match_kind(kind);
+ self
+ }
+
+ /// Enable anchored mode, which requires all matches to start at the
+ /// first position in a haystack.
+ ///
+ /// This option is disabled by default.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasickBuilder;
+ ///
+ /// let patterns = &["foo", "bar"];
+ /// let haystack = "foobar";
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .anchored(true)
+ /// .build(patterns);
+ /// assert_eq!(1, ac.find_iter(haystack).count());
+ /// ```
+ ///
+ /// When searching for overlapping matches, all matches that start at
+ /// the beginning of a haystack will be reported:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasickBuilder;
+ ///
+ /// let patterns = &["foo", "foofoo"];
+ /// let haystack = "foofoo";
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .anchored(true)
+ /// .build(patterns);
+ /// assert_eq!(2, ac.find_overlapping_iter(haystack).count());
+ /// // A non-anchored search would return 3 matches.
+ /// ```
+ pub fn anchored(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
+ self.nfa_builder.anchored(yes);
+ self
+ }
+
+ /// Enable ASCII-aware case insensitive matching.
+ ///
+ /// When this option is enabled, searching will be performed without
+ /// respect to case for ASCII letters (`a-z` and `A-Z`) only.
+ ///
+ /// Enabling this option does not change the search algorithm, but it may
+ /// increase the size of the automaton.
+ ///
+ /// **NOTE:** In the future, support for full Unicode case insensitivity
+ /// may be added, but ASCII case insensitivity is comparatively much
+ /// simpler to add.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasickBuilder;
+ ///
+ /// let patterns = &["FOO", "bAr", "BaZ"];
+ /// let haystack = "foo bar baz";
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .ascii_case_insensitive(true)
+ /// .build(patterns);
+ /// assert_eq!(3, ac.find_iter(haystack).count());
+ /// ```
+ pub fn ascii_case_insensitive(
+ &mut self,
+ yes: bool,
+ ) -> &mut AhoCorasickBuilder {
+ self.nfa_builder.ascii_case_insensitive(yes);
+ self
+ }
+
+ /// Set the limit on how many NFA states use a dense representation for
+ /// their transitions.
+ ///
+ /// A dense representation uses more space, but supports faster access to
+ /// transitions at search time. Thus, this setting permits control over a
+ /// space versus time trade-off when using the NFA variant of Aho-Corasick.
+ ///
+ /// This limit is expressed in terms of the depth of a state, i.e., the
+ /// number of transitions from the starting state of the NFA. The idea is
+ /// that most of the time searching will be spent near the starting state
+ /// of the automaton, so states near the start state should use a dense
+ /// representation. States further away from the start state would then use
+ /// a sparse representation, which uses less space but is slower to access
+ /// transitions at search time.
+ ///
+ /// By default, this is set to a low but non-zero number.
+ ///
+ /// This setting has no effect if the `dfa` option is enabled.
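+ ///
+ /// # Examples
+ ///
+ /// An illustrative sketch; the depth value below is arbitrary rather than
+ /// a tuned recommendation:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasickBuilder;
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .dense_depth(2) // arbitrary illustrative value
+ /// .build(&["foo", "bar", "baz"]);
+ /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
+ /// ```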
+ pub fn dense_depth(&mut self, depth: usize) -> &mut AhoCorasickBuilder {
+ self.nfa_builder.dense_depth(depth);
+ self
+ }
+
+ /// Compile the standard Aho-Corasick automaton into a deterministic finite
+ /// automaton (DFA).
+ ///
+ /// When this is disabled (which is the default), then a non-deterministic
+ /// finite automaton (NFA) is used instead.
+ ///
+ /// The main benefit to a DFA is that it can execute searches more quickly
+ /// than an NFA (perhaps 2-4 times as fast). The main drawback is that the
+ /// DFA uses more space and can take much longer to build.
+ ///
+ /// Enabling this option does not change the time complexity for
+ /// constructing the Aho-Corasick automaton (which is `O(p)` where
+ /// `p` is the total number of patterns being compiled). Enabling this
+ /// option does however reduce the time complexity of non-overlapping
+ /// searches from `O(n + p)` to `O(n)`, where `n` is the length of the
+ /// haystack.
+ ///
+ /// In general, it's a good idea to enable this if you're searching a
+ /// small number of fairly short patterns (~1000), or if you want the
+ /// fastest possible search without regard to compilation time or space
+ /// usage.
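+ ///
+ /// # Examples
+ ///
+ /// An illustrative sketch; the DFA-backed automaton reports the same
+ /// matches as the default NFA, only the internal representation differs:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasickBuilder;
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .dfa(true)
+ /// .build(&["foo", "bar", "baz"]);
+ /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
+ /// ```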
+ pub fn dfa(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
+ self.dfa = yes;
+ self
+ }
+
+ /// Enable heuristic prefilter optimizations.
+ ///
+ /// When enabled, searching will attempt to quickly skip to match
+ /// candidates using specialized literal search routines. A prefilter
+ /// cannot always be used, and is generally treated as a heuristic. It
+ /// can be useful to disable this if the prefilter is observed to be
+ /// sub-optimal for a particular workload.
+ ///
+ /// This is enabled by default.
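+ ///
+ /// # Examples
+ ///
+ /// An illustrative sketch; disabling the prefilter does not change which
+ /// matches are reported, only how candidates are located:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasickBuilder;
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .prefilter(false)
+ /// .build(&["foo", "bar", "baz"]);
+ /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
+ /// ```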
+ pub fn prefilter(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
+ self.nfa_builder.prefilter(yes);
+ self
+ }
+
+ /// Shrink the size of the transition alphabet by mapping bytes to their
+ /// equivalence classes. This only has an effect when the `dfa` option is
+ /// enabled.
+ ///
+ /// When enabled, the DFA will use a map from all possible bytes
+ /// to their corresponding equivalence class. Each equivalence class
+ /// represents a set of bytes that does not discriminate between a match
+ /// and a non-match in the DFA. For example, the patterns `bar` and `baz`
+ /// have at least five equivalence classes: singleton sets of `b`, `a`, `r`
+ /// and `z`, and a final set that contains every other byte.
+ ///
+ /// The advantage of this map is that the size of the transition table can
+ /// be reduced drastically from `#states * 256 * sizeof(id)` to
+ /// `#states * k * sizeof(id)` where `k` is the number of equivalence
+ /// classes. As a result, total space usage can decrease substantially.
+ /// Moreover, since a smaller alphabet is used, compilation becomes faster
+ /// as well.
+ ///
+ /// The disadvantage of this map is that every byte searched must be
+ /// passed through this map before it can be used to determine the next
+ /// transition. This has a small match time performance cost. However, if
+ /// the DFA is otherwise very large without byte classes, then using byte
+ /// classes can greatly improve memory locality and thus lead to better
+ /// overall performance.
+ ///
+ /// This option is enabled by default.
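+ ///
+ /// # Examples
+ ///
+ /// An illustrative sketch; byte classes only affect the DFA's internal
+ /// representation, not the matches reported:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasickBuilder;
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .dfa(true)
+ /// .byte_classes(false)
+ /// .build(&["foo", "bar", "baz"]);
+ /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
+ /// ```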
+ pub fn byte_classes(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
+ self.dfa_builder.byte_classes(yes);
+ self
+ }
+
+ /// Premultiply state identifiers in the transition table. This only has
+ /// an effect when the `dfa` option is enabled.
+ ///
+ /// When enabled, state identifiers are premultiplied to point to their
+ /// corresponding row in the transition table. That is, given the `i`th
+ /// state, its corresponding premultiplied identifier is `i * k` where `k`
+ /// is the alphabet size of the automaton. (The alphabet size is at most
+ /// 256, but is in practice smaller if byte classes is enabled.)
+ ///
+ /// When state identifiers are not premultiplied, then the identifier of
+ /// the `i`th state is `i`.
+ ///
+ /// The advantage of premultiplying state identifiers is that it saves a
+ /// multiplication instruction per byte when searching with a DFA. This has
+ /// been observed to lead to a 20% performance benefit in micro-benchmarks.
+ ///
+ /// The primary disadvantage of premultiplying state identifiers is
+ /// that they require a larger integer size to represent. For example,
+ /// if the DFA has 200 states, then its premultiplied form requires 16
+ /// bits to represent every possible state identifier, whereas its
+ /// non-premultiplied form only requires 8 bits.
+ ///
+ /// This option is enabled by default.
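+ ///
+ /// # Examples
+ ///
+ /// An illustrative sketch; premultiplication is an internal representation
+ /// detail and does not change the matches reported:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasickBuilder;
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .dfa(true)
+ /// .premultiply(false)
+ /// .build(&["foo", "bar", "baz"]);
+ /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
+ /// ```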
+ pub fn premultiply(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
+ self.dfa_builder.premultiply(yes);
+ self
+ }
+}
+
+/// A knob for controlling the match semantics of an Aho-Corasick automaton.
+///
+/// There are two generally different ways that Aho-Corasick automatons can
+/// report matches. The first way is the "standard" approach that results from
+/// implementing most textbook explanations of Aho-Corasick. The second way is
+/// to report only the leftmost non-overlapping matches. The leftmost approach
+/// is in turn split into two different ways of resolving ambiguous matches:
+/// leftmost-first and leftmost-longest.
+///
+/// The `Standard` match kind is the default and is the only one that supports
+/// overlapping matches and stream searching. (Trying to find overlapping
+/// or streaming matches using leftmost match semantics will result in a
+/// panic.) The `Standard` match kind will report matches as they are seen.
+/// When searching for overlapping matches, then all possible matches are
+/// reported. When searching for non-overlapping matches, the first match seen
+/// is reported. For example, for non-overlapping matches, given the patterns
+/// `abcd` and `b` and the subject string `abcdef`, only a match for `b` is
+/// reported since it is detected first. The `abcd` match is never reported
+/// since it overlaps with the `b` match.
+///
+/// In contrast, the leftmost match kind always prefers the leftmost match
+/// among all possible matches. Given the same example as above with `abcd` and
+/// `b` as patterns and `abcdef` as the subject string, the leftmost match is
+/// `abcd` since it begins before the `b` match, even though the `b` match is
+/// detected before the `abcd` match. In this case, the `b` match is not
+/// reported at all since it overlaps with the `abcd` match.
+///
+/// The difference between leftmost-first and leftmost-longest is in how they
+/// resolve ambiguous matches when there are multiple leftmost matches to
+/// choose from. Leftmost-first always chooses the pattern that was provided
+/// earliest, whereas leftmost-longest always chooses the longest matching
+/// pattern. For example, given the patterns `a` and `ab` and the subject
+/// string `ab`, the leftmost-first match is `a` but the leftmost-longest match
+/// is `ab`. Conversely, if the patterns were given in reverse order, i.e.,
+/// `ab` and `a`, then both the leftmost-first and leftmost-longest matches
+/// would be `ab`. Stated differently, the leftmost-first match depends on the
+/// order in which the patterns were given to the Aho-Corasick automaton.
+/// Because of that, when leftmost-first matching is used, if a pattern `A`
+/// that appears before a pattern `B` is a prefix of `B`, then it is impossible
+/// to ever observe a match of `B`.
+///
+/// If you're not sure which match kind to pick, then stick with the standard
+/// kind, which is the default. In particular, if you need overlapping or
+/// streaming matches, then you _must_ use the standard kind. The leftmost
+/// kinds are useful in specific circumstances. For example, leftmost-first can
+/// be very useful as a way to implement match priority based on the order of
+/// patterns given and leftmost-longest can be useful for dictionary searching
+/// such that only the longest matching words are reported.
+///
+/// # Relationship with regular expression alternations
+///
+/// Understanding match semantics can be a little tricky, and one easy way
+/// to conceptualize non-overlapping matches from an Aho-Corasick automaton
+/// is to think about them as a simple alternation of literals in a regular
+/// expression. For example, let's say we wanted to match the strings
+/// `Sam` and `Samwise`, which would turn into the regex `Sam|Samwise`. It
+/// turns out that regular expression engines have two different ways of
+/// matching this alternation. The first way, leftmost-longest, is commonly
+/// found in POSIX compatible implementations of regular expressions (such as
+/// `grep`). The second way, leftmost-first, is commonly found in backtracking
+/// implementations such as Perl. (Some regex engines, such as RE2 and Rust's
+/// regex engine, do not use backtracking, but still implement leftmost-first
+/// semantics in an effort to match the behavior of dominant backtracking
+/// regex engines such as those found in Perl, Ruby, Python, Javascript and
+/// PHP.)
+///
+/// That is, when matching `Sam|Samwise` against `Samwise`, a POSIX regex
+/// will match `Samwise` because it is the longest possible match, but a
+/// Perl-like regex will match `Sam` since it appears earlier in the
+/// alternation. Indeed, the regex `Sam|Samwise` in a Perl-like regex engine
+/// will never match `Samwise` since `Sam` will always have higher priority.
+/// Conversely, matching the regex `Samwise|Sam` against `Samwise` will lead to
+/// a match of `Samwise` in both POSIX and Perl-like regexes since `Samwise` is
+/// still the longest match, but it also appears earlier than `Sam`.
+///
+/// The "standard" match semantics of Aho-Corasick generally don't correspond
+/// to the match semantics of any large group of regex implementations, so
+/// there's no direct analogy that can be made here. Standard match semantics
+/// are generally useful for overlapping matches, or if you just want to see
+/// matches as they are detected.
+///
+/// The main conclusion to draw from this section is that the match semantics
+/// can be tweaked to precisely match either Perl-like regex alternations or
+/// POSIX regex alternations.
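+///
+/// # Examples
+///
+/// An illustrative sketch of the `Sam`/`Samwise` example above, contrasting
+/// leftmost-first and leftmost-longest semantics:
+///
+/// ```
+/// use aho_corasick::{AhoCorasickBuilder, MatchKind};
+///
+/// let haystack = "Samwise";
+///
+/// let ac = AhoCorasickBuilder::new()
+/// .match_kind(MatchKind::LeftmostFirst)
+/// .build(&["Sam", "Samwise"]);
+/// let mat = ac.find(haystack).expect("should have a match");
+/// assert_eq!("Sam", &haystack[mat.start()..mat.end()]);
+///
+/// let ac = AhoCorasickBuilder::new()
+/// .match_kind(MatchKind::LeftmostLongest)
+/// .build(&["Sam", "Samwise"]);
+/// let mat = ac.find(haystack).expect("should have a match");
+/// assert_eq!("Samwise", &haystack[mat.start()..mat.end()]);
+/// ```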
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum MatchKind {
+ /// Use standard match semantics, which support overlapping matches. When
+ /// used with non-overlapping matches, matches are reported as they are
+ /// seen.
+ Standard,
+ /// Use leftmost-first match semantics, which reports leftmost matches.
+ /// When there are multiple possible leftmost matches, the match
+ /// corresponding to the pattern that appeared earlier when constructing
+ /// the automaton is reported.
+ ///
+ /// This does **not** support overlapping matches or stream searching. If
+ /// this match kind is used, attempting to find overlapping matches or
+ /// stream matches will panic.
+ LeftmostFirst,
+ /// Use leftmost-longest match semantics, which reports leftmost matches.
+ /// When there are multiple possible leftmost matches, the longest match
+ /// is chosen.
+ ///
+ /// This does **not** support overlapping matches or stream searching. If
+ /// this match kind is used, attempting to find overlapping matches or
+ /// stream matches will panic.
+ LeftmostLongest,
+ /// Hints that destructuring should not be exhaustive.
+ ///
+ /// This enum may grow additional variants, so this makes sure clients
+ /// don't count on exhaustive matching. (Otherwise, adding a new variant
+ /// could break existing code.)
+ #[doc(hidden)]
+ __Nonexhaustive,
+}
+
+/// The default match kind is `MatchKind::Standard`.
+impl Default for MatchKind {
+ fn default() -> MatchKind {
+ MatchKind::Standard
+ }
+}
+
+impl MatchKind {
+ fn supports_overlapping(&self) -> bool {
+ self.is_standard()
+ }
+
+ fn supports_stream(&self) -> bool {
+ // TODO: It may be possible to support this. It's hard.
+ //
+ // See: https://github.com/rust-lang/regex/issues/425#issuecomment-471367838
+ self.is_standard()
+ }
+
+ pub(crate) fn is_standard(&self) -> bool {
+ *self == MatchKind::Standard
+ }
+
+ pub(crate) fn is_leftmost(&self) -> bool {
+ *self == MatchKind::LeftmostFirst
+ || *self == MatchKind::LeftmostLongest
+ }
+
+ pub(crate) fn is_leftmost_first(&self) -> bool {
+ *self == MatchKind::LeftmostFirst
+ }
+
+ /// Convert this match kind into a packed match kind. If this match kind
+ /// corresponds to standard semantics, then this returns None, since
+ /// packed searching does not support standard semantics.
+ pub(crate) fn as_packed(&self) -> Option<packed::MatchKind> {
+ match *self {
+ MatchKind::Standard => None,
+ MatchKind::LeftmostFirst => Some(packed::MatchKind::LeftmostFirst),
+ MatchKind::LeftmostLongest => {
+ Some(packed::MatchKind::LeftmostLongest)
+ }
+ MatchKind::__Nonexhaustive => unreachable!(),
+ }
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn oibits() {
+ use std::panic::{RefUnwindSafe, UnwindSafe};
+
+ fn assert_send<T: Send>() {}
+ fn assert_sync<T: Sync>() {}
+ fn assert_unwind_safe<T: RefUnwindSafe + UnwindSafe>() {}
+
+ assert_send::<AhoCorasick>();
+ assert_sync::<AhoCorasick>();
+ assert_unwind_safe::<AhoCorasick>();
+ assert_send::<AhoCorasickBuilder>();
+ assert_sync::<AhoCorasickBuilder>();
+ assert_unwind_safe::<AhoCorasickBuilder>();
+ }
+}
diff --git a/src/automaton.rs b/src/automaton.rs
new file mode 100644
index 0000000..2ada1a0
--- /dev/null
+++ b/src/automaton.rs
@@ -0,0 +1,573 @@
+use ahocorasick::MatchKind;
+use prefilter::{self, Candidate, Prefilter, PrefilterState};
+use state_id::{dead_id, fail_id, StateID};
+use Match;
+
+// NOTE: This trait essentially started as a copy of the same trait from
+// regex-automata, with some wording changed since we use this trait for
+// NFAs in addition to DFAs in this crate. Additionally, we do not export
+// this trait. It's only used internally to reduce code duplication. The
+// regex-automata crate needs to expose it because its Regex type is generic
+// over implementations of this trait. In this crate, we encapsulate everything
+// behind the AhoCorasick type.
+//
+// This trait is a bit of a mess, but it's not quite clear how to fix it.
+// Basically, there are several competing concerns:
+//
+// * We need performance, so everything effectively needs to get monomorphized.
+// * There are several variations on searching Aho-Corasick automatons:
+// overlapping, standard and leftmost. Overlapping and standard are somewhat
+// combined together below, but there is no real way to combine standard with
+// leftmost. Namely, leftmost requires continuing a search even after a match
+// is found, in order to correctly disambiguate a match.
+// * On top of that, *sometimes* callers want to know which state the automaton
+// is in after searching. This is principally useful for overlapping and
+// stream searches. However, when callers don't care about this, we really
+// do not want to be forced to compute it, since it sometimes requires extra
+// work. Thus, there are effectively two copies of leftmost searching: one
+// that tracks the state ID and one that doesn't. We should ideally do the
+// same for standard searching, but my sanity stopped me.
+
+// SAFETY RATIONALE: Previously, the code below went to some length to remove
+// all bounds checks. This generally produced tighter assembly and led to
+// 20-50% improvements in micro-benchmarks on corpora made up of random
+// characters. This somewhat makes sense, since the branch predictor is going
+// to be at its worst on random text.
+//
+// However, using the aho-corasick-debug tool and manually benchmarking
+// different inputs, the code *with* bounds checks actually wound up being
+// slightly faster:
+//
+// $ cat input
+// Sherlock Holmes
+// John Watson
+// Professor Moriarty
+// Irene Adler
+// Mary Watson
+//
+// $ aho-corasick-debug-safe \
+// input OpenSubtitles2018.raw.sample.en --kind leftmost-first --dfa
+// pattern read time: 32.824µs
+// automaton build time: 444.687µs
+// automaton heap usage: 72392 bytes
+// match count: 639
+// count time: 1.809961702s
+//
+// $ aho-corasick-debug-master \
+// input OpenSubtitles2018.raw.sample.en --kind leftmost-first --dfa
+// pattern read time: 31.425µs
+// automaton build time: 317.434µs
+// automaton heap usage: 72392 bytes
+// match count: 639
+// count time: 2.059157705s
+//
+// I was able to reproduce this result on two different machines (an i5 and
+// an i7). Therefore, we go the route of safe code for now.
+
+/// A trait describing the interface of an Aho-Corasick finite state machine.
+///
+/// Every automaton has exactly one fail state, one dead state and exactly one
+/// start state. Generally, these correspond to the first, second and third
+/// states, respectively. The failure state is always treated as a sentinel.
+/// That is, no correct Aho-Corasick automaton will ever transition into the
+/// fail state. The dead state, however, can be transitioned into, but only
+/// when leftmost-first or leftmost-longest match semantics are enabled and
+/// only when at least one match has been observed.
+///
+/// Every automaton also has one or more match states, such that
+/// `Automaton::is_match_state(id)` returns `true` if and only if `id`
+/// corresponds to a match state.
+pub trait Automaton {
+ /// The representation used for state identifiers in this automaton.
+ ///
+ /// Typically, this is one of `u8`, `u16`, `u32`, `u64` or `usize`.
+ type ID: StateID;
+
+ /// The type of matching that should be done.
+ fn match_kind(&self) -> &MatchKind;
+
+ /// Returns true if and only if this automaton uses anchored searches.
+ fn anchored(&self) -> bool;
+
+ /// An optional prefilter for quickly skipping to the next candidate match.
+ /// A prefilter must report at least every match, although it may report
+ /// positions that do not correspond to a match. That is, it must not allow
+ /// false negatives, but can allow false positives.
+ ///
+ /// Currently, a prefilter only runs when the automaton is in the start
+ /// state. That is, the position reported by a prefilter should always
+ /// correspond to the start of a potential match.
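+ ///
+ /// As an illustrative (editorial) example: a prefilter built from the
+ /// leading bytes of the patterns `Sam` and `Samwise` may report every
+ /// position of the byte `S` in the haystack; any position it skips is
+ /// guaranteed not to begin a match.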
+ fn prefilter(&self) -> Option<&dyn Prefilter>;
+
+ /// Return the identifier of this automaton's start state.
+ fn start_state(&self) -> Self::ID;
+
+ /// Returns true if and only if the given state identifier refers to a
+ /// valid state.
+ fn is_valid(&self, id: Self::ID) -> bool;
+
+ /// Returns true if and only if the given identifier corresponds to a match
+ /// state.
+ ///
+ /// The state ID given must be valid, or else implementors may panic.
+ fn is_match_state(&self, id: Self::ID) -> bool;
+
+ /// Returns true if and only if the given identifier corresponds to a state
+ /// that is either the dead state or a match state.
+ ///
+ /// Depending on the implementation of the automaton, this routine can
+ /// be used to save a branch in the core matching loop. Nevertheless,
+ /// `is_match_state(id) || id == dead_id()` is always a valid
+ /// implementation. Indeed, this is the default implementation.
+ ///
+ /// The state ID given must be valid, or else implementors may panic.
+ fn is_match_or_dead_state(&self, id: Self::ID) -> bool {
+ id == dead_id() || self.is_match_state(id)
+ }
+
+ /// If the given state is a match state, return the match corresponding
+ /// to the given match index. `end` must be the ending position of the
+ /// detected match. If no match exists or if `match_index` exceeds the
+ /// number of matches in this state, then `None` is returned.
+ ///
+ /// The state ID given must be valid, or else implementors may panic.
+ ///
+ /// If the given state ID is correct and if the `match_index` is less than
+ /// the number of matches for that state, then this is guaranteed to return
+ /// a match.
+ fn get_match(
+ &self,
+ id: Self::ID,
+ match_index: usize,
+ end: usize,
+ ) -> Option<Match>;
+
+ /// Returns the number of matches for the given state. If the given state
+ /// is not a match state, then this returns 0.
+ ///
+ /// The state ID given must be valid, or else implementors may panic.
+ fn match_count(&self, id: Self::ID) -> usize;
+
+ /// Given the current state that this automaton is in and the next input
+ /// byte, this method returns the identifier of the next state. The
+ /// identifier returned must always be valid and may never correspond to
+ /// the fail state. The returned identifier may, however, point to the
+ /// dead state.
+ ///
+ /// Implementations are permitted to look up the next state without memory
+ /// safety checks such as bounds checks. As such, callers must ensure that
+ /// the given identifier corresponds to a valid automaton state. Implementors
+ /// must, in turn, ensure that this routine is safe for all valid state
+ /// identifiers and for all possible `u8` values.
+ fn next_state(&self, current: Self::ID, input: u8) -> Self::ID;
+
+ /// Like next_state, but debug_asserts that the underlying
+ /// implementation never returns a `fail_id()` for the next state.
+ fn next_state_no_fail(&self, current: Self::ID, input: u8) -> Self::ID {
+ let next = self.next_state(current, input);
+ // We should never see a transition to the failure state.
+ debug_assert!(
+ next != fail_id(),
+ "automaton should never return fail_id for next state"
+ );
+ next
+ }
+
+ /// Execute a search using standard match semantics.
+ ///
+ /// This can be used even when the automaton was constructed with leftmost
+ /// match semantics when you want to find the earliest possible match. This
+ /// can also be used as part of an overlapping search implementation.
+ ///
+ /// N.B. This does not report a match if `state_id` is given as a matching
+ /// state. As such, this should not be used directly.
+ #[inline(always)]
+ fn standard_find_at(
+ &self,
+ prestate: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ state_id: &mut Self::ID,
+ ) -> Option<Match> {
+ if let Some(pre) = self.prefilter() {
+ self.standard_find_at_imp(
+ prestate,
+ Some(pre),
+ haystack,
+ at,
+ state_id,
+ )
+ } else {
+ self.standard_find_at_imp(prestate, None, haystack, at, state_id)
+ }
+ }
+
+ // It's important for this to always be inlined. Namely, its only caller
+ // is standard_find_at, and the inlining should remove the case analysis
+ // for prefilter scanning when there is no prefilter available.
+ #[inline(always)]
+ fn standard_find_at_imp(
+ &self,
+ prestate: &mut PrefilterState,
+ prefilter: Option<&dyn Prefilter>,
+ haystack: &[u8],
+ mut at: usize,
+ state_id: &mut Self::ID,
+ ) -> Option<Match> {
+ while at < haystack.len() {
+ if let Some(pre) = prefilter {
+ if prestate.is_effective(at) && *state_id == self.start_state()
+ {
+ let c = prefilter::next(prestate, pre, haystack, at)
+ .into_option();
+ match c {
+ None => return None,
+ Some(i) => {
+ at = i;
+ }
+ }
+ }
+ }
+ // CORRECTNESS: next_state is correct for all possible u8 values,
+ // so the only thing we're concerned about is the validity of
+ // `state_id`. `state_id` either comes from the caller (in which
+ // case, we assume it is correct), or it comes from the return
+ // value of next_state, which is guaranteed to be correct.
+ *state_id = self.next_state_no_fail(*state_id, haystack[at]);
+ at += 1;
+ // This routine always quits immediately after seeing a
+ // match, and since dead states can only come after seeing
+ // a match, seeing a dead state here is impossible. (Unless
+ // we have an anchored automaton, in which case, dead states
+ // are used to stop a search.)
+ debug_assert!(
+ *state_id != dead_id() || self.anchored(),
+ "standard find should never see a dead state"
+ );
+
+ if self.is_match_or_dead_state(*state_id) {
+ return if *state_id == dead_id() {
+ None
+ } else {
+ self.get_match(*state_id, 0, at)
+ };
+ }
+ }
+ None
+ }
+
+ /// Execute a search using leftmost (either first or longest) match
+ /// semantics.
+ ///
+ /// The principal difference between searching with standard semantics and
+ /// searching with leftmost semantics is that leftmost searching will
+ /// continue searching even after a match has been found. Once a match
+ /// is found, the search does not stop until either the haystack has been
+ /// exhausted or a dead state is observed in the automaton. (Dead states
+ /// only exist in automatons constructed with leftmost semantics.) That is,
+ /// we rely on the construction of the automaton to tell us when to quit.
+ #[inline(never)]
+ fn leftmost_find_at(
+ &self,
+ prestate: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ state_id: &mut Self::ID,
+ ) -> Option<Match> {
+ if let Some(pre) = self.prefilter() {
+ self.leftmost_find_at_imp(
+ prestate,
+ Some(pre),
+ haystack,
+ at,
+ state_id,
+ )
+ } else {
+ self.leftmost_find_at_imp(prestate, None, haystack, at, state_id)
+ }
+ }
+
+ // It's important for this to always be inlined. Namely, its only caller
+ // is leftmost_find_at, and the inlining should remove the case analysis
+ // for prefilter scanning when there is no prefilter available.
+ #[inline(always)]
+ fn leftmost_find_at_imp(
+ &self,
+ prestate: &mut PrefilterState,
+ prefilter: Option<&dyn Prefilter>,
+ haystack: &[u8],
+ mut at: usize,
+ state_id: &mut Self::ID,
+ ) -> Option<Match> {
+ debug_assert!(self.match_kind().is_leftmost());
+ if self.anchored() && at > 0 && *state_id == self.start_state() {
+ return None;
+ }
+ let mut last_match = self.get_match(*state_id, 0, at);
+ while at < haystack.len() {
+ if let Some(pre) = prefilter {
+ if prestate.is_effective(at) && *state_id == self.start_state()
+ {
+ let c = prefilter::next(prestate, pre, haystack, at)
+ .into_option();
+ match c {
+ None => return None,
+ Some(i) => {
+ at = i;
+ }
+ }
+ }
+ }
+ // CORRECTNESS: next_state is correct for all possible u8 values,
+ // so the only thing we're concerned about is the validity of
+ // `state_id`. `state_id` either comes from the caller (in which
+ // case, we assume it is correct), or it comes from the return
+ // value of next_state, which is guaranteed to be correct.
+ *state_id = self.next_state_no_fail(*state_id, haystack[at]);
+ at += 1;
+ if self.is_match_or_dead_state(*state_id) {
+ if *state_id == dead_id() {
+ // The only way to enter into a dead state is if a match
+ // has been found, so we assert as much. This is different
+ // from normal automata, where you might enter a dead state
+ // if you know a subsequent match will never be found
+ // (regardless of whether a match has already been found).
+ // For Aho-Corasick, it is built so that we can match at
+ // any position, so the possibility of a match always
+ // exists.
+ //
+ // (Unless we have an anchored automaton, in which case,
+ // dead states are used to stop a search.)
+ debug_assert!(
+ last_match.is_some() || self.anchored(),
+ "failure state should only be seen after match"
+ );
+ return last_match;
+ }
+ last_match = self.get_match(*state_id, 0, at);
+ }
+ }
+ last_match
+ }
+
+ /// This is like leftmost_find_at, but does not need to track a caller
+ /// provided state id. In other words, the only output of this routine is a
+ /// match, if one exists.
+ ///
+ /// It is regrettable that we need to effectively copy a chunk of
+ /// implementation twice, but when we don't need to track the state ID, we
+ /// can allow the prefilter to report matches immediately without having
+ /// to re-confirm them with the automaton. The re-confirmation step is
+ /// necessary in leftmost_find_at because tracing through the automaton is
+ /// the only way to correctly set the state ID. (Perhaps an alternative
+ /// would be to keep a map from pattern ID to matching state ID, but that
+ /// complicates the code and still doesn't permit us to defer to the
+ /// prefilter entirely when possible.)
+ ///
+ /// I did try a few things to avoid the code duplication here, but nothing
+ /// optimized as well as this approach. (In microbenchmarks, there was
+ /// about a 25% difference.)
+ #[inline(never)]
+ fn leftmost_find_at_no_state(
+ &self,
+ prestate: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ ) -> Option<Match> {
+ if let Some(pre) = self.prefilter() {
+ self.leftmost_find_at_no_state_imp(
+ prestate,
+ Some(pre),
+ haystack,
+ at,
+ )
+ } else {
+ self.leftmost_find_at_no_state_imp(prestate, None, haystack, at)
+ }
+ }
+
+ // It's important for this to always be inlined. Namely, its only caller
+ // is leftmost_find_at_no_state, and the inlining should remove the case
+ // analysis for prefilter scanning when there is no prefilter available.
+ #[inline(always)]
+ fn leftmost_find_at_no_state_imp(
+ &self,
+ prestate: &mut PrefilterState,
+ prefilter: Option<&dyn Prefilter>,
+ haystack: &[u8],
+ mut at: usize,
+ ) -> Option<Match> {
+ debug_assert!(self.match_kind().is_leftmost());
+ if self.anchored() && at > 0 {
+ return None;
+ }
+ // If our prefilter handles confirmation of matches 100% of the
+ // time, then since we don't need to track state IDs, we can avoid
+ // running the Aho-Corasick automaton entirely.
+ if let Some(pre) = prefilter {
+ // We should never have a prefilter during an anchored search.
+ debug_assert!(!self.anchored());
+ if !pre.reports_false_positives() {
+ return match pre.next_candidate(prestate, haystack, at) {
+ Candidate::None => None,
+ Candidate::Match(m) => Some(m),
+ Candidate::PossibleStartOfMatch(_) => unreachable!(),
+ };
+ }
+ }
+
+ let mut state_id = self.start_state();
+ let mut last_match = self.get_match(state_id, 0, at);
+ while at < haystack.len() {
+ if let Some(pre) = prefilter {
+ if prestate.is_effective(at) && state_id == self.start_state()
+ {
+ match prefilter::next(prestate, pre, haystack, at) {
+ Candidate::None => return None,
+ // Since we aren't tracking a state ID, we can
+ // quit early once we know we have a match.
+ Candidate::Match(m) => return Some(m),
+ Candidate::PossibleStartOfMatch(i) => {
+ at = i;
+ }
+ }
+ }
+ }
+ // CORRECTNESS: next_state is correct for all possible u8 values,
+ // so the only thing we're concerned about is the validity of
+ // `state_id`. `state_id` either comes from the caller (in which
+ // case, we assume it is correct), or it comes from the return
+ // value of next_state, which is guaranteed to be correct.
+ state_id = self.next_state_no_fail(state_id, haystack[at]);
+ at += 1;
+ if self.is_match_or_dead_state(state_id) {
+ if state_id == dead_id() {
+ // The only way to enter into a dead state is if a
+ // match has been found, so we assert as much. This
+ // is different from normal automata, where you might
+ // enter a dead state if you know a subsequent match
+ // will never be found (regardless of whether a match
+ // has already been found). For Aho-Corasick, it is
+ // built so that we can match at any position, so the
+ // possibility of a match always exists.
+ //
+ // (Unless we have an anchored automaton, in which
+ // case, dead states are used to stop a search.)
+ debug_assert!(
+ last_match.is_some() || self.anchored(),
+ "failure state should only be seen after match"
+ );
+ return last_match;
+ }
+ last_match = self.get_match(state_id, 0, at);
+ }
+ }
+ last_match
+ }
+
+ /// Execute an overlapping search.
+ ///
+ /// When executing an overlapping match, the previous state ID in addition
+ /// to the previous match index should be given. If there are more matches
+ /// at the given state, then the match is reported and the given index is
+ /// incremented.
+ #[inline(always)]
+ fn overlapping_find_at(
+ &self,
+ prestate: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ state_id: &mut Self::ID,
+ match_index: &mut usize,
+ ) -> Option<Match> {
+ if self.anchored() && at > 0 && *state_id == self.start_state() {
+ return None;
+ }
+
+ let match_count = self.match_count(*state_id);
+ if *match_index < match_count {
+ // This is guaranteed to return a match since
+ // match_index < match_count.
+ let result = self.get_match(*state_id, *match_index, at);
+ debug_assert!(result.is_some(), "must be a match");
+ *match_index += 1;
+ return result;
+ }
+
+ *match_index = 0;
+ match self.standard_find_at(prestate, haystack, at, state_id) {
+ None => None,
+ Some(m) => {
+ *match_index = 1;
+ Some(m)
+ }
+ }
+ }
+
+ /// Return the earliest match found. This returns as soon as we know that
+ /// we have a match. As such, this does not necessarily correspond to the
+ /// leftmost starting match, but rather, the leftmost position at which a
+ /// match ends.
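+ ///
+ /// For example (an editorial illustration): with patterns `abcd` and `b`
+ /// and haystack `abcd`, the earliest match is `b` ending at offset 2, even
+ /// though `abcd` starts earlier, because `b` is the first match to *end*.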
+ #[inline(always)]
+ fn earliest_find_at(
+ &self,
+ prestate: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ state_id: &mut Self::ID,
+ ) -> Option<Match> {
+ if *state_id == self.start_state() {
+ if self.anchored() && at > 0 {
+ return None;
+ }
+ if let Some(m) = self.get_match(*state_id, 0, at) {
+ return Some(m);
+ }
+ }
+ self.standard_find_at(prestate, haystack, at, state_id)
+ }
+
+ /// A convenience function for finding the next match according to the
+ /// match semantics of this automaton. For standard match semantics, this
+ /// finds the earliest match. Otherwise, the leftmost match is found.
+ #[inline(always)]
+ fn find_at(
+ &self,
+ prestate: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ state_id: &mut Self::ID,
+ ) -> Option<Match> {
+ match *self.match_kind() {
+ MatchKind::Standard => {
+ self.earliest_find_at(prestate, haystack, at, state_id)
+ }
+ MatchKind::LeftmostFirst | MatchKind::LeftmostLongest => {
+ self.leftmost_find_at(prestate, haystack, at, state_id)
+ }
+ MatchKind::__Nonexhaustive => unreachable!(),
+ }
+ }
+
+ /// Like find_at, but does not track state identifiers. This permits some
+ /// optimizations when a prefilter that confirms its own matches is
+ /// present.
+ #[inline(always)]
+ fn find_at_no_state(
+ &self,
+ prestate: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ ) -> Option<Match> {
+ match *self.match_kind() {
+ MatchKind::Standard => {
+ let mut state = self.start_state();
+ self.earliest_find_at(prestate, haystack, at, &mut state)
+ }
+ MatchKind::LeftmostFirst | MatchKind::LeftmostLongest => {
+ self.leftmost_find_at_no_state(prestate, haystack, at)
+ }
+ MatchKind::__Nonexhaustive => unreachable!(),
+ }
+ }
+}
diff --git a/src/buffer.rs b/src/buffer.rs
new file mode 100644
index 0000000..1008196
--- /dev/null
+++ b/src/buffer.rs
@@ -0,0 +1,130 @@
+use std::cmp;
+use std::io;
+use std::ptr;
+
+/// The default buffer capacity that we use for the stream buffer.
+const DEFAULT_BUFFER_CAPACITY: usize = 8 * (1 << 10); // 8 KB
+
+/// A fairly simple roll buffer for supporting stream searches.
+///
+/// This buffer acts as a temporary place to store a fixed amount of data when
+/// reading from a stream. Its central purpose is to allow "rolling" some
+/// suffix of the data to the beginning of the buffer before refilling it with
+/// more data from the stream. For example, let's say we are trying to match
+/// "foobar" on a stream. When we report the match, we'd like to not only
+/// report the correct offsets at which the match occurs, but also the matching
+/// bytes themselves. So let's say our stream is a file with the following
+/// contents: `test test foobar test test`. Now assume that we happen to read
+/// the aforementioned file in two chunks: `test test foo` and `bar test test`.
+/// Naively, it would not be possible to report a single contiguous `foobar`
+/// match, but this roll buffer allows us to do that. Namely, after the second
+/// read, the contents of the buffer should be `st foobar test test`, where the
+/// search should ultimately resume immediately after `foo`. (The prefix `st `
+/// is included because the roll buffer saves N bytes at the end of the buffer,
+/// where N is the maximum possible length of a match.)
+///
+/// A lot of the logic for dealing with this is unfortunately split out between
+/// this roll buffer and the `StreamChunkIter`.
+#[derive(Debug)]
+pub struct Buffer {
+ /// The raw buffer contents. This has a fixed size and never increases.
+ buf: Vec<u8>,
+ /// The minimum size of the buffer, which is equivalent to the maximum
+ /// possible length of a match. This corresponds to the number of bytes
+ /// that `roll` preserves at the front of the buffer.
+ min: usize,
+ /// The end of the contents of this buffer.
+ end: usize,
+}
+
+impl Buffer {
+ /// Create a new buffer for stream searching. The minimum buffer length
+ /// given should be the size of the maximum possible match length.
+ pub fn new(min_buffer_len: usize) -> Buffer {
+ let min = cmp::max(1, min_buffer_len);
+ // The minimum buffer amount is also the amount that we roll our
+ // buffer in order to support incremental searching. To this end,
+ // our actual capacity needs to be at least 1 byte bigger than our
+ // minimum amount, otherwise we won't have any overlap. In actuality,
+ // we want our buffer to be a bit bigger than that for performance
+ // reasons, so we set a lower bound of `8 * min`.
+ //
+ // TODO: It would be good to find a way to test the streaming
+ // implementation with the minimal buffer size.
+ let capacity = cmp::max(min * 8, DEFAULT_BUFFER_CAPACITY);
+ Buffer { buf: vec![0; capacity], min, end: 0 }
+ }
+
+ /// Return the contents of this buffer.
+ #[inline]
+ pub fn buffer(&self) -> &[u8] {
+ &self.buf[..self.end]
+ }
+
+ /// Return the minimum size of the buffer. The only way a buffer may be
+ /// smaller than this is if the stream itself contains less than the
+ /// minimum buffer amount.
+ #[inline]
+ pub fn min_buffer_len(&self) -> usize {
+ self.min
+ }
+
+ /// Return the total length of the contents in the buffer.
+ #[inline]
+ pub fn len(&self) -> usize {
+ self.end
+ }
+
+ /// Return all free capacity in this buffer.
+ fn free_buffer(&mut self) -> &mut [u8] {
+ &mut self.buf[self.end..]
+ }
+
+ /// Refill the contents of this buffer by reading as much as possible into
+ /// this buffer's free capacity. If no more bytes could be read, then this
+ /// returns false. Otherwise, this reads until it has filled the buffer
+ /// past the minimum amount.
+ pub fn fill<R: io::Read>(&mut self, mut rdr: R) -> io::Result<bool> {
+ let mut readany = false;
+ loop {
+ let readlen = rdr.read(self.free_buffer())?;
+ if readlen == 0 {
+ return Ok(readany);
+ }
+ readany = true;
+ self.end += readlen;
+ if self.len() >= self.min {
+ return Ok(true);
+ }
+ }
+ }
+
+ /// Roll the contents of the buffer so that the suffix of this buffer is
+ /// moved to the front and all other contents are dropped. The size of the
+ /// suffix corresponds precisely to the minimum buffer length.
+ ///
+ /// This should only be called when the entire contents of this buffer have
+ /// been searched.
+ pub fn roll(&mut self) {
+ let roll_start = self
+ .end
+ .checked_sub(self.min)
+ .expect("buffer capacity should be bigger than minimum amount");
+ let roll_len = self.min;
+
+ assert!(roll_start + roll_len <= self.end);
+ unsafe {
+ // SAFETY: A buffer contains Copy data, so there's no problem
+ // moving it around. Safety also depends on our indices being in
+ // bounds, which they always should be, given the assert above.
+ //
+ // TODO: Switch to [T]::copy_within once our MSRV is high enough.
+ ptr::copy(
+ self.buf[roll_start..].as_ptr(),
+ self.buf.as_mut_ptr(),
+ roll_len,
+ );
+ }
+ self.end = roll_len;
+ }
+}
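+
+// An illustrative test added editorially (not part of the upstream sources):
+// it exercises `fill` and `roll` on an in-memory reader to show that the
+// final `min_buffer_len` bytes survive a roll and end up at the front of the
+// buffer.
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn roll_preserves_suffix() {
+ // "foobar" stands in for the longest pattern, so the minimum buffer
+ // length is 6.
+ let mut buf = Buffer::new(6);
+ assert!(buf.fill(&b"test test foobar test test"[..]).unwrap());
+ let before = buf.buffer().to_vec();
+ buf.roll();
+ // Only the last `min_buffer_len` bytes remain, now at the start.
+ assert_eq!(buf.buffer(), &before[before.len() - buf.min_buffer_len()..]);
+ }
+}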
diff --git a/src/byte_frequencies.rs b/src/byte_frequencies.rs
new file mode 100644
index 0000000..c313b62
--- /dev/null
+++ b/src/byte_frequencies.rs
@@ -0,0 +1,258 @@
+pub const BYTE_FREQUENCIES: [u8; 256] = [
+ 55, // '\x00'
+ 52, // '\x01'
+ 51, // '\x02'
+ 50, // '\x03'
+ 49, // '\x04'
+ 48, // '\x05'
+ 47, // '\x06'
+ 46, // '\x07'
+ 45, // '\x08'
+ 103, // '\t'
+ 242, // '\n'
+ 66, // '\x0b'
+ 67, // '\x0c'
+ 229, // '\r'
+ 44, // '\x0e'
+ 43, // '\x0f'
+ 42, // '\x10'
+ 41, // '\x11'
+ 40, // '\x12'
+ 39, // '\x13'
+ 38, // '\x14'
+ 37, // '\x15'
+ 36, // '\x16'
+ 35, // '\x17'
+ 34, // '\x18'
+ 33, // '\x19'
+ 56, // '\x1a'
+ 32, // '\x1b'
+ 31, // '\x1c'
+ 30, // '\x1d'
+ 29, // '\x1e'
+ 28, // '\x1f'
+ 255, // ' '
+ 148, // '!'
+ 164, // '"'
+ 149, // '#'
+ 136, // '$'
+ 160, // '%'
+ 155, // '&'
+ 173, // "'"
+ 221, // '('
+ 222, // ')'
+ 134, // '*'
+ 122, // '+'
+ 232, // ','
+ 202, // '-'
+ 215, // '.'
+ 224, // '/'
+ 208, // '0'
+ 220, // '1'
+ 204, // '2'
+ 187, // '3'
+ 183, // '4'
+ 179, // '5'
+ 177, // '6'
+ 168, // '7'
+ 178, // '8'
+ 200, // '9'
+ 226, // ':'
+ 195, // ';'
+ 154, // '<'
+ 184, // '='
+ 174, // '>'
+ 126, // '?'
+ 120, // '@'
+ 191, // 'A'
+ 157, // 'B'
+ 194, // 'C'
+ 170, // 'D'
+ 189, // 'E'
+ 162, // 'F'
+ 161, // 'G'
+ 150, // 'H'
+ 193, // 'I'
+ 142, // 'J'
+ 137, // 'K'
+ 171, // 'L'
+ 176, // 'M'
+ 185, // 'N'
+ 167, // 'O'
+ 186, // 'P'
+ 112, // 'Q'
+ 175, // 'R'
+ 192, // 'S'
+ 188, // 'T'
+ 156, // 'U'
+ 140, // 'V'
+ 143, // 'W'
+ 123, // 'X'
+ 133, // 'Y'
+ 128, // 'Z'
+ 147, // '['
+ 138, // '\\'
+ 146, // ']'
+ 114, // '^'
+ 223, // '_'
+ 151, // '`'
+ 249, // 'a'
+ 216, // 'b'
+ 238, // 'c'
+ 236, // 'd'
+ 253, // 'e'
+ 227, // 'f'
+ 218, // 'g'
+ 230, // 'h'
+ 247, // 'i'
+ 135, // 'j'
+ 180, // 'k'
+ 241, // 'l'
+ 233, // 'm'
+ 246, // 'n'
+ 244, // 'o'
+ 231, // 'p'
+ 139, // 'q'
+ 245, // 'r'
+ 243, // 's'
+ 251, // 't'
+ 235, // 'u'
+ 201, // 'v'
+ 196, // 'w'
+ 240, // 'x'
+ 214, // 'y'
+ 152, // 'z'
+ 182, // '{'
+ 205, // '|'
+ 181, // '}'
+ 127, // '~'
+ 27, // '\x7f'
+ 212, // '\x80'
+ 211, // '\x81'
+ 210, // '\x82'
+ 213, // '\x83'
+ 228, // '\x84'
+ 197, // '\x85'
+ 169, // '\x86'
+ 159, // '\x87'
+ 131, // '\x88'
+ 172, // '\x89'
+ 105, // '\x8a'
+ 80, // '\x8b'
+ 98, // '\x8c'
+ 96, // '\x8d'
+ 97, // '\x8e'
+ 81, // '\x8f'
+ 207, // '\x90'
+ 145, // '\x91'
+ 116, // '\x92'
+ 115, // '\x93'
+ 144, // '\x94'
+ 130, // '\x95'
+ 153, // '\x96'
+ 121, // '\x97'
+ 107, // '\x98'
+ 132, // '\x99'
+ 109, // '\x9a'
+ 110, // '\x9b'
+ 124, // '\x9c'
+ 111, // '\x9d'
+ 82, // '\x9e'
+ 108, // '\x9f'
+ 118, // '\xa0'
+ 141, // '¡'
+ 113, // '¢'
+ 129, // '£'
+ 119, // '¤'
+ 125, // '¥'
+ 165, // '¦'
+ 117, // '§'
+ 92, // '¨'
+ 106, // '©'
+ 83, // 'ª'
+ 72, // '«'
+ 99, // '¬'
+ 93, // '\xad'
+ 65, // '®'
+ 79, // '¯'
+ 166, // '°'
+ 237, // '±'
+ 163, // '²'
+ 199, // '³'
+ 190, // '´'
+ 225, // 'µ'
+ 209, // '¶'
+ 203, // '·'
+ 198, // '¸'
+ 217, // '¹'
+ 219, // 'º'
+ 206, // '»'
+ 234, // '¼'
+ 248, // '½'
+ 158, // '¾'
+ 239, // '¿'
+ 255, // 'À'
+ 255, // 'Á'
+ 255, // 'Â'
+ 255, // 'Ã'
+ 255, // 'Ä'
+ 255, // 'Å'
+ 255, // 'Æ'
+ 255, // 'Ç'
+ 255, // 'È'
+ 255, // 'É'
+ 255, // 'Ê'
+ 255, // 'Ë'
+ 255, // 'Ì'
+ 255, // 'Í'
+ 255, // 'Î'
+ 255, // 'Ï'
+ 255, // 'Ð'
+ 255, // 'Ñ'
+ 255, // 'Ò'
+ 255, // 'Ó'
+ 255, // 'Ô'
+ 255, // 'Õ'
+ 255, // 'Ö'
+ 255, // '×'
+ 255, // 'Ø'
+ 255, // 'Ù'
+ 255, // 'Ú'
+ 255, // 'Û'
+ 255, // 'Ü'
+ 255, // 'Ý'
+ 255, // 'Þ'
+ 255, // 'ß'
+ 255, // 'à'
+ 255, // 'á'
+ 255, // 'â'
+ 255, // 'ã'
+ 255, // 'ä'
+ 255, // 'å'
+ 255, // 'æ'
+ 255, // 'ç'
+ 255, // 'è'
+ 255, // 'é'
+ 255, // 'ê'
+ 255, // 'ë'
+ 255, // 'ì'
+ 255, // 'í'
+ 255, // 'î'
+ 255, // 'ï'
+ 255, // 'ð'
+ 255, // 'ñ'
+ 255, // 'ò'
+ 255, // 'ó'
+ 255, // 'ô'
+ 255, // 'õ'
+ 255, // 'ö'
+ 255, // '÷'
+ 255, // 'ø'
+ 255, // 'ù'
+ 255, // 'ú'
+ 255, // 'û'
+ 255, // 'ü'
+ 255, // 'ý'
+ 255, // 'þ'
+ 255, // 'ÿ'
+];
diff --git a/src/classes.rs b/src/classes.rs
new file mode 100644
index 0000000..1fba7ea
--- /dev/null
+++ b/src/classes.rs
@@ -0,0 +1,238 @@
+use std::fmt;
+
+/// A representation of byte oriented equivalence classes.
+///
+/// This is used in an FSM to reduce the size of the transition table. This can
+/// have a particularly large impact not only on the total size of an FSM, but
+/// also on compile times.
+#[derive(Clone, Copy)]
+pub struct ByteClasses([u8; 256]);
+
+impl ByteClasses {
+ /// Creates a new set of equivalence classes where all bytes are mapped to
+ /// the same class.
+ pub fn empty() -> ByteClasses {
+ ByteClasses([0; 256])
+ }
+
+ /// Creates a new set of equivalence classes where each byte belongs to
+ /// its own equivalence class.
+ pub fn singletons() -> ByteClasses {
+ let mut classes = ByteClasses::empty();
+ for i in 0..256 {
+ classes.set(i as u8, i as u8);
+ }
+ classes
+ }
+
+ /// Set the equivalence class for the given byte.
+ #[inline]
+ pub fn set(&mut self, byte: u8, class: u8) {
+ self.0[byte as usize] = class;
+ }
+
+ /// Get the equivalence class for the given byte.
+ #[inline]
+ pub fn get(&self, byte: u8) -> u8 {
+ // SAFETY: This is safe because all dense transitions have
+ // exactly 256 elements, so all u8 values are valid indices.
+ self.0[byte as usize]
+ }
+
+ /// Return the total number of elements in the alphabet represented by
+ /// these equivalence classes. Equivalently, this returns the total number
+ /// of equivalence classes.
+ #[inline]
+ pub fn alphabet_len(&self) -> usize {
+ self.0[255] as usize + 1
+ }
+
+ /// Returns true if and only if every byte in this class maps to its own
+ /// equivalence class. Equivalently, there are 256 equivalence classes
+ /// and each class contains exactly one byte.
+ #[inline]
+ pub fn is_singleton(&self) -> bool {
+ self.alphabet_len() == 256
+ }
+
+ /// Returns an iterator over a sequence of representative bytes from each
+ /// equivalence class. Namely, this yields exactly N items, where N is
+ /// equivalent to the number of equivalence classes. Each item is an
+ /// arbitrary byte drawn from each equivalence class.
+ ///
+ /// This is useful when one is determinizing an NFA and the NFA's alphabet
+ /// hasn't been converted to equivalence classes yet. Picking an arbitrary
+ /// byte from each equivalence class then permits a full exploration of
+ /// the NFA instead of using every possible byte value.
+ pub fn representatives(&self) -> ByteClassRepresentatives {
+ ByteClassRepresentatives { classes: self, byte: 0, last_class: None }
+ }
+
+ /// Returns all of the bytes in the given equivalence class.
+ ///
+ /// The second element in the tuple indicates the number of elements in
+ /// the array.
+ fn elements(&self, equiv: u8) -> ([u8; 256], usize) {
+ let (mut array, mut len) = ([0; 256], 0);
+ for b in 0..256 {
+ if self.get(b as u8) == equiv {
+ array[len] = b as u8;
+ len += 1;
+ }
+ }
+ (array, len)
+ }
+}
+
+impl fmt::Debug for ByteClasses {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ if self.is_singleton() {
+ write!(f, "ByteClasses({{singletons}})")
+ } else {
+ write!(f, "ByteClasses(")?;
+ for equiv in 0..self.alphabet_len() {
+ let (members, len) = self.elements(equiv as u8);
+ write!(f, " {} => {:?}", equiv, &members[..len])?;
+ }
+ write!(f, ")")
+ }
+ }
+}
+
+/// An iterator over representative bytes from each equivalence class.
+#[derive(Debug)]
+pub struct ByteClassRepresentatives<'a> {
+ classes: &'a ByteClasses,
+ byte: usize,
+ last_class: Option<u8>,
+}
+
+impl<'a> Iterator for ByteClassRepresentatives<'a> {
+ type Item = u8;
+
+ fn next(&mut self) -> Option<u8> {
+ while self.byte < 256 {
+ let byte = self.byte as u8;
+ let class = self.classes.get(byte);
+ self.byte += 1;
+
+ if self.last_class != Some(class) {
+ self.last_class = Some(class);
+ return Some(byte);
+ }
+ }
+ None
+ }
+}
+
+/// A byte class builder keeps track of an *approximation* of equivalence
+/// classes of bytes during NFA construction. That is, no byte in an
+/// equivalence class can discriminate between a match and a non-match.
+///
+/// For example, in the literals `abc` and `xyz`, the bytes [\x00-`], [d-w]
+/// and [{-\xFF] never discriminate between a match and a non-match, precisely
+/// because they never occur in the literals anywhere.
+///
+/// Note though that this does not necessarily compute the minimal set of
+/// equivalence classes. For example, in the literals above, the byte ranges
+/// [\x00-`], [d-w] and [{-\xFF] are all treated as distinct equivalence
+/// classes even though they could be treated a single class. The reason for
+/// this is implementation complexity. In the future, we should endeavor to
+/// compute the minimal equivalence classes since they can have a rather large
+/// impact on the size of the DFA.
+///
+/// The representation here is 256 booleans, all initially set to false. Each
+/// boolean maps to its corresponding byte based on position. A `true` value
+/// indicates the end of an equivalence class, where its corresponding byte
+/// and all of the bytes corresponding to all previous contiguous `false`
+/// values are in the same equivalence class.
+///
+/// This particular representation only permits contiguous ranges of bytes to
+/// be in the same equivalence class, which means that we can never discover
+/// the true minimal set of equivalence classes.
+#[derive(Debug)]
+pub struct ByteClassBuilder(Vec<bool>);
+
+impl ByteClassBuilder {
+ /// Create a new builder of byte classes where all bytes are part of the
+ /// same equivalence class.
+ pub fn new() -> ByteClassBuilder {
+ ByteClassBuilder(vec![false; 256])
+ }
+
+ /// Indicate that the given range of bytes (inclusive) can discriminate a
+ /// match between it and all other bytes outside of the range.
+ pub fn set_range(&mut self, start: u8, end: u8) {
+ debug_assert!(start <= end);
+ if start > 0 {
+ self.0[start as usize - 1] = true;
+ }
+ self.0[end as usize] = true;
+ }
+
+ /// Build byte classes that map all byte values to their corresponding
+ /// equivalence class. The last mapping indicates the largest equivalence
+ /// class identifier (which is never bigger than 255).
+ pub fn build(&self) -> ByteClasses {
+ let mut classes = ByteClasses::empty();
+ let mut class = 0u8;
+ let mut i = 0;
+ loop {
+ classes.set(i as u8, class as u8);
+ if i >= 255 {
+ break;
+ }
+ if self.0[i] {
+ class = class.checked_add(1).unwrap();
+ }
+ i += 1;
+ }
+ classes
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn byte_classes() {
+ let mut set = ByteClassBuilder::new();
+ set.set_range(b'a', b'z');
+
+ let classes = set.build();
+ assert_eq!(classes.get(0), 0);
+ assert_eq!(classes.get(1), 0);
+ assert_eq!(classes.get(2), 0);
+ assert_eq!(classes.get(b'a' - 1), 0);
+ assert_eq!(classes.get(b'a'), 1);
+ assert_eq!(classes.get(b'm'), 1);
+ assert_eq!(classes.get(b'z'), 1);
+ assert_eq!(classes.get(b'z' + 1), 2);
+ assert_eq!(classes.get(254), 2);
+ assert_eq!(classes.get(255), 2);
+
+ let mut set = ByteClassBuilder::new();
+ set.set_range(0, 2);
+ set.set_range(4, 6);
+ let classes = set.build();
+ assert_eq!(classes.get(0), 0);
+ assert_eq!(classes.get(1), 0);
+ assert_eq!(classes.get(2), 0);
+ assert_eq!(classes.get(3), 1);
+ assert_eq!(classes.get(4), 2);
+ assert_eq!(classes.get(5), 2);
+ assert_eq!(classes.get(6), 2);
+ assert_eq!(classes.get(7), 3);
+ assert_eq!(classes.get(255), 3);
+ }
+
+ #[test]
+ fn full_byte_classes() {
+ let mut set = ByteClassBuilder::new();
+ for i in 0..256u16 {
+ set.set_range(i as u8, i as u8);
+ }
+ assert_eq!(set.build().alphabet_len(), 256);
+ }
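+
+ // An illustrative test added editorially (not from the upstream sources):
+ // `representatives()` yields exactly one arbitrary byte per equivalence
+ // class, which is what determinization uses to avoid trying all 256 bytes.
+ #[test]
+ fn representatives_one_per_class() {
+ let mut set = ByteClassBuilder::new();
+ set.set_range(b'a', b'z');
+ let classes = set.build();
+ let reps: Vec<u8> = classes.representatives().collect();
+ assert_eq!(classes.alphabet_len(), reps.len());
+ assert_eq!(vec![0, b'a', b'z' + 1], reps);
+ }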
+}
diff --git a/src/dfa.rs b/src/dfa.rs
new file mode 100644
index 0000000..1bf37d5
--- /dev/null
+++ b/src/dfa.rs
@@ -0,0 +1,709 @@
+use std::mem::size_of;
+
+use ahocorasick::MatchKind;
+use automaton::Automaton;
+use classes::ByteClasses;
+use error::Result;
+use nfa::{PatternID, PatternLength, NFA};
+use prefilter::{Prefilter, PrefilterObj, PrefilterState};
+use state_id::{dead_id, fail_id, premultiply_overflow_error, StateID};
+use Match;
+
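+/// An editorial summary, derived from the `next_state` implementations
+/// below, of how the four DFA representations differ in their transition
+/// table lookups:
+///
+/// * `Standard`: `state * 256 + byte`
+/// * `ByteClass`: `state * alphabet_len + class(byte)`
+/// * `Premultiplied`: `state + byte`, with state IDs pre-scaled by 256
+/// * `PremultipliedByteClass`: `state + class(byte)`, with state IDs
+///   pre-scaled by `alphabet_len`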
+#[derive(Clone, Debug)]
+pub enum DFA<S> {
+ Standard(Standard<S>),
+ ByteClass(ByteClass<S>),
+ Premultiplied(Premultiplied<S>),
+ PremultipliedByteClass(PremultipliedByteClass<S>),
+}
+
+impl<S: StateID> DFA<S> {
+ fn repr(&self) -> &Repr<S> {
+ match *self {
+ DFA::Standard(ref dfa) => dfa.repr(),
+ DFA::ByteClass(ref dfa) => dfa.repr(),
+ DFA::Premultiplied(ref dfa) => dfa.repr(),
+ DFA::PremultipliedByteClass(ref dfa) => dfa.repr(),
+ }
+ }
+
+ pub fn match_kind(&self) -> &MatchKind {
+ &self.repr().match_kind
+ }
+
+ pub fn heap_bytes(&self) -> usize {
+ self.repr().heap_bytes
+ }
+
+ pub fn max_pattern_len(&self) -> usize {
+ self.repr().max_pattern_len
+ }
+
+ pub fn pattern_count(&self) -> usize {
+ self.repr().pattern_count
+ }
+
+ pub fn start_state(&self) -> S {
+ self.repr().start_id
+ }
+
+ #[inline(always)]
+ pub fn overlapping_find_at(
+ &self,
+ prestate: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ state_id: &mut S,
+ match_index: &mut usize,
+ ) -> Option<Match> {
+ match *self {
+ DFA::Standard(ref dfa) => dfa.overlapping_find_at(
+ prestate,
+ haystack,
+ at,
+ state_id,
+ match_index,
+ ),
+ DFA::ByteClass(ref dfa) => dfa.overlapping_find_at(
+ prestate,
+ haystack,
+ at,
+ state_id,
+ match_index,
+ ),
+ DFA::Premultiplied(ref dfa) => dfa.overlapping_find_at(
+ prestate,
+ haystack,
+ at,
+ state_id,
+ match_index,
+ ),
+ DFA::PremultipliedByteClass(ref dfa) => dfa.overlapping_find_at(
+ prestate,
+ haystack,
+ at,
+ state_id,
+ match_index,
+ ),
+ }
+ }
+
+ #[inline(always)]
+ pub fn earliest_find_at(
+ &self,
+ prestate: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ state_id: &mut S,
+ ) -> Option<Match> {
+ match *self {
+ DFA::Standard(ref dfa) => {
+ dfa.earliest_find_at(prestate, haystack, at, state_id)
+ }
+ DFA::ByteClass(ref dfa) => {
+ dfa.earliest_find_at(prestate, haystack, at, state_id)
+ }
+ DFA::Premultiplied(ref dfa) => {
+ dfa.earliest_find_at(prestate, haystack, at, state_id)
+ }
+ DFA::PremultipliedByteClass(ref dfa) => {
+ dfa.earliest_find_at(prestate, haystack, at, state_id)
+ }
+ }
+ }
+
+ #[inline(always)]
+ pub fn find_at_no_state(
+ &self,
+ prestate: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ ) -> Option<Match> {
+ match *self {
+ DFA::Standard(ref dfa) => {
+ dfa.find_at_no_state(prestate, haystack, at)
+ }
+ DFA::ByteClass(ref dfa) => {
+ dfa.find_at_no_state(prestate, haystack, at)
+ }
+ DFA::Premultiplied(ref dfa) => {
+ dfa.find_at_no_state(prestate, haystack, at)
+ }
+ DFA::PremultipliedByteClass(ref dfa) => {
+ dfa.find_at_no_state(prestate, haystack, at)
+ }
+ }
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct Standard<S>(Repr<S>);
+
+impl<S: StateID> Standard<S> {
+ fn repr(&self) -> &Repr<S> {
+ &self.0
+ }
+}
+
+impl<S: StateID> Automaton for Standard<S> {
+ type ID = S;
+
+ fn match_kind(&self) -> &MatchKind {
+ &self.repr().match_kind
+ }
+
+ fn anchored(&self) -> bool {
+ self.repr().anchored
+ }
+
+ fn prefilter(&self) -> Option<&dyn Prefilter> {
+ self.repr().prefilter.as_ref().map(|p| p.as_ref())
+ }
+
+ fn start_state(&self) -> S {
+ self.repr().start_id
+ }
+
+ fn is_valid(&self, id: S) -> bool {
+ id.to_usize() < self.repr().state_count
+ }
+
+ fn is_match_state(&self, id: S) -> bool {
+ self.repr().is_match_state(id)
+ }
+
+ fn is_match_or_dead_state(&self, id: S) -> bool {
+ self.repr().is_match_or_dead_state(id)
+ }
+
+ fn get_match(
+ &self,
+ id: S,
+ match_index: usize,
+ end: usize,
+ ) -> Option<Match> {
+ self.repr().get_match(id, match_index, end)
+ }
+
+ fn match_count(&self, id: S) -> usize {
+ self.repr().match_count(id)
+ }
+
+ fn next_state(&self, current: S, input: u8) -> S {
+ let o = current.to_usize() * 256 + input as usize;
+ self.repr().trans[o]
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct ByteClass<S>(Repr<S>);
+
+impl<S: StateID> ByteClass<S> {
+ fn repr(&self) -> &Repr<S> {
+ &self.0
+ }
+}
+
+impl<S: StateID> Automaton for ByteClass<S> {
+ type ID = S;
+
+ fn match_kind(&self) -> &MatchKind {
+ &self.repr().match_kind
+ }
+
+ fn anchored(&self) -> bool {
+ self.repr().anchored
+ }
+
+ fn prefilter(&self) -> Option<&dyn Prefilter> {
+ self.repr().prefilter.as_ref().map(|p| p.as_ref())
+ }
+
+ fn start_state(&self) -> S {
+ self.repr().start_id
+ }
+
+ fn is_valid(&self, id: S) -> bool {
+ id.to_usize() < self.repr().state_count
+ }
+
+ fn is_match_state(&self, id: S) -> bool {
+ self.repr().is_match_state(id)
+ }
+
+ fn is_match_or_dead_state(&self, id: S) -> bool {
+ self.repr().is_match_or_dead_state(id)
+ }
+
+ fn get_match(
+ &self,
+ id: S,
+ match_index: usize,
+ end: usize,
+ ) -> Option<Match> {
+ self.repr().get_match(id, match_index, end)
+ }
+
+ fn match_count(&self, id: S) -> usize {
+ self.repr().match_count(id)
+ }
+
+ fn next_state(&self, current: S, input: u8) -> S {
+ let alphabet_len = self.repr().byte_classes.alphabet_len();
+ let input = self.repr().byte_classes.get(input);
+ let o = current.to_usize() * alphabet_len + input as usize;
+ self.repr().trans[o]
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct Premultiplied<S>(Repr<S>);
+
+impl<S: StateID> Premultiplied<S> {
+ fn repr(&self) -> &Repr<S> {
+ &self.0
+ }
+}
+
+impl<S: StateID> Automaton for Premultiplied<S> {
+ type ID = S;
+
+ fn match_kind(&self) -> &MatchKind {
+ &self.repr().match_kind
+ }
+
+ fn anchored(&self) -> bool {
+ self.repr().anchored
+ }
+
+ fn prefilter(&self) -> Option<&dyn Prefilter> {
+ self.repr().prefilter.as_ref().map(|p| p.as_ref())
+ }
+
+ fn start_state(&self) -> S {
+ self.repr().start_id
+ }
+
+ fn is_valid(&self, id: S) -> bool {
+ (id.to_usize() / 256) < self.repr().state_count
+ }
+
+ fn is_match_state(&self, id: S) -> bool {
+ self.repr().is_match_state(id)
+ }
+
+ fn is_match_or_dead_state(&self, id: S) -> bool {
+ self.repr().is_match_or_dead_state(id)
+ }
+
+ fn get_match(
+ &self,
+ id: S,
+ match_index: usize,
+ end: usize,
+ ) -> Option<Match> {
+ if id > self.repr().max_match {
+ return None;
+ }
+ self.repr()
+ .matches
+ .get(id.to_usize() / 256)
+ .and_then(|m| m.get(match_index))
+ .map(|&(id, len)| Match { pattern: id, len, end })
+ }
+
+ fn match_count(&self, id: S) -> usize {
+ let o = id.to_usize() / 256;
+ self.repr().matches[o].len()
+ }
+
+ fn next_state(&self, current: S, input: u8) -> S {
+ let o = current.to_usize() + input as usize;
+ self.repr().trans[o]
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct PremultipliedByteClass<S>(Repr<S>);
+
+impl<S: StateID> PremultipliedByteClass<S> {
+ fn repr(&self) -> &Repr<S> {
+ &self.0
+ }
+}
+
+impl<S: StateID> Automaton for PremultipliedByteClass<S> {
+ type ID = S;
+
+ fn match_kind(&self) -> &MatchKind {
+ &self.repr().match_kind
+ }
+
+ fn anchored(&self) -> bool {
+ self.repr().anchored
+ }
+
+ fn prefilter(&self) -> Option<&dyn Prefilter> {
+ self.repr().prefilter.as_ref().map(|p| p.as_ref())
+ }
+
+ fn start_state(&self) -> S {
+ self.repr().start_id
+ }
+
+ fn is_valid(&self, id: S) -> bool {
+ (id.to_usize() / self.repr().alphabet_len()) < self.repr().state_count
+ }
+
+ fn is_match_state(&self, id: S) -> bool {
+ self.repr().is_match_state(id)
+ }
+
+ fn is_match_or_dead_state(&self, id: S) -> bool {
+ self.repr().is_match_or_dead_state(id)
+ }
+
+ fn get_match(
+ &self,
+ id: S,
+ match_index: usize,
+ end: usize,
+ ) -> Option<Match> {
+ if id > self.repr().max_match {
+ return None;
+ }
+ self.repr()
+ .matches
+ .get(id.to_usize() / self.repr().alphabet_len())
+ .and_then(|m| m.get(match_index))
+ .map(|&(id, len)| Match { pattern: id, len, end })
+ }
+
+ fn match_count(&self, id: S) -> usize {
+ let o = id.to_usize() / self.repr().alphabet_len();
+ self.repr().matches[o].len()
+ }
+
+ fn next_state(&self, current: S, input: u8) -> S {
+ let input = self.repr().byte_classes.get(input);
+ let o = current.to_usize() + input as usize;
+ self.repr().trans[o]
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct Repr<S> {
+ match_kind: MatchKind,
+ anchored: bool,
+ premultiplied: bool,
+ start_id: S,
+ /// The length, in bytes, of the longest pattern in this automaton. This
+ /// information is useful for keeping correct buffer sizes when searching
+ /// on streams.
+ max_pattern_len: usize,
+ /// The total number of patterns added to this automaton. This includes
+ /// patterns that may never match.
+ pattern_count: usize,
+ state_count: usize,
+ max_match: S,
+ /// The number of bytes of heap used by this DFA's transition table.
+ heap_bytes: usize,
+ /// A prefilter for quickly detecting candidate matches, if pertinent.
+ prefilter: Option<PrefilterObj>,
+ byte_classes: ByteClasses,
+ trans: Vec<S>,
+ matches: Vec<Vec<(PatternID, PatternLength)>>,
+}
+
+impl<S: StateID> Repr<S> {
+ /// Returns the total alphabet size for this DFA.
+ ///
+ /// If byte classes are enabled, then this corresponds to the number of
+ /// equivalence classes. If they are disabled, then this is always 256.
+ fn alphabet_len(&self) -> usize {
+ self.byte_classes.alphabet_len()
+ }
+
+ /// Returns true only if the given state is a match state.
+ fn is_match_state(&self, id: S) -> bool {
+ id <= self.max_match && id > dead_id()
+ }
+
+ /// Returns true only if the given state is either a dead state or a match
+ /// state.
+ fn is_match_or_dead_state(&self, id: S) -> bool {
+ id <= self.max_match
+ }
+
+ /// Get the ith match for the given state, where the end position of a
+ /// match was found at `end`.
+ ///
+ /// # Panics
+ ///
+ /// The caller must ensure that the given state identifier is valid,
+ /// otherwise this may panic. The `match_index` need not be valid. That is,
+ /// if the given state has no matches then this returns `None`.
+ fn get_match(
+ &self,
+ id: S,
+ match_index: usize,
+ end: usize,
+ ) -> Option<Match> {
+ if id > self.max_match {
+ return None;
+ }
+ self.matches
+ .get(id.to_usize())
+ .and_then(|m| m.get(match_index))
+ .map(|&(id, len)| Match { pattern: id, len, end })
+ }
+
+ /// Return the total number of matches for the given state.
+ ///
+ /// # Panics
+ ///
+ /// The caller must ensure that the given identifier is valid, or else
+ /// this panics.
+ fn match_count(&self, id: S) -> usize {
+ self.matches[id.to_usize()].len()
+ }
+
+ /// Get the next state given `from` as the current state and `byte` as the
+ /// current input byte.
+ fn next_state(&self, from: S, byte: u8) -> S {
+ let alphabet_len = self.alphabet_len();
+ let byte = self.byte_classes.get(byte);
+ self.trans[from.to_usize() * alphabet_len + byte as usize]
+ }
+
+ /// Set the `byte` transition for the `from` state to point to `to`.
+ fn set_next_state(&mut self, from: S, byte: u8, to: S) {
+ let alphabet_len = self.alphabet_len();
+ let byte = self.byte_classes.get(byte);
+ self.trans[from.to_usize() * alphabet_len + byte as usize] = to;
+ }
+
+ /// Swap the given states in place.
+ fn swap_states(&mut self, id1: S, id2: S) {
+ assert!(!self.premultiplied, "can't swap states in premultiplied DFA");
+
+ let o1 = id1.to_usize() * self.alphabet_len();
+ let o2 = id2.to_usize() * self.alphabet_len();
+ for b in 0..self.alphabet_len() {
+ self.trans.swap(o1 + b, o2 + b);
+ }
+ self.matches.swap(id1.to_usize(), id2.to_usize());
+ }
+
+ /// This routine shuffles all match states in this DFA to the beginning
+ /// of the DFA such that every non-match state appears after every match
+ /// state. (With one exception: the special fail and dead states remain as
+ /// the first two states.)
+ ///
+ /// The purpose of doing this shuffling is to avoid an extra conditional
+ /// in the search loop, and in particular, detecting whether a state is a
+ /// match or not does not need to access any memory.
+ ///
+ /// This updates `self.max_match` to point to the last matching state as
+ /// well as `self.start` if the starting state was moved.
+ fn shuffle_match_states(&mut self) {
+ assert!(
+ !self.premultiplied,
+ "cannot shuffle match states of premultiplied DFA"
+ );
+
+ if self.state_count <= 1 {
+ return;
+ }
+
+ let mut first_non_match = self.start_id.to_usize();
+ while first_non_match < self.state_count
+ && self.matches[first_non_match].len() > 0
+ {
+ first_non_match += 1;
+ }
+
+ let mut swaps: Vec<S> = vec![fail_id(); self.state_count];
+ let mut cur = self.state_count - 1;
+ while cur > first_non_match {
+ if self.matches[cur].len() > 0 {
+ self.swap_states(
+ S::from_usize(cur),
+ S::from_usize(first_non_match),
+ );
+ swaps[cur] = S::from_usize(first_non_match);
+ swaps[first_non_match] = S::from_usize(cur);
+
+ first_non_match += 1;
+ while first_non_match < cur
+ && self.matches[first_non_match].len() > 0
+ {
+ first_non_match += 1;
+ }
+ }
+ cur -= 1;
+ }
+ for id in (0..self.state_count).map(S::from_usize) {
+ let alphabet_len = self.alphabet_len();
+ let offset = id.to_usize() * alphabet_len;
+ for next in &mut self.trans[offset..offset + alphabet_len] {
+ if swaps[next.to_usize()] != fail_id() {
+ *next = swaps[next.to_usize()];
+ }
+ }
+ }
+ if swaps[self.start_id.to_usize()] != fail_id() {
+ self.start_id = swaps[self.start_id.to_usize()];
+ }
+ self.max_match = S::from_usize(first_non_match - 1);
+ }
+
+ fn premultiply(&mut self) -> Result<()> {
+ if self.premultiplied || self.state_count <= 1 {
+ return Ok(());
+ }
+
+ let alpha_len = self.alphabet_len();
+ premultiply_overflow_error(
+ S::from_usize(self.state_count - 1),
+ alpha_len,
+ )?;
+
+ for id in (2..self.state_count).map(S::from_usize) {
+ let offset = id.to_usize() * alpha_len;
+ for next in &mut self.trans[offset..offset + alpha_len] {
+ if *next == dead_id() {
+ continue;
+ }
+ *next = S::from_usize(next.to_usize() * alpha_len);
+ }
+ }
+ self.premultiplied = true;
+ self.start_id = S::from_usize(self.start_id.to_usize() * alpha_len);
+ self.max_match = S::from_usize(self.max_match.to_usize() * alpha_len);
+ Ok(())
+ }
+
+ /// Computes the total amount of heap used by this DFA in bytes.
+ fn calculate_size(&mut self) {
+ let mut size = (self.trans.len() * size_of::<S>())
+ + (self.matches.len()
+ * size_of::<Vec<(PatternID, PatternLength)>>());
+ for state_matches in &self.matches {
+ size +=
+ state_matches.len() * size_of::<(PatternID, PatternLength)>();
+ }
+ size += self.prefilter.as_ref().map_or(0, |p| p.as_ref().heap_bytes());
+ self.heap_bytes = size;
+ }
+}
+
+/// A builder for configuring the determinization of an NFA into a DFA.
+#[derive(Clone, Debug)]
+pub struct Builder {
+ premultiply: bool,
+ byte_classes: bool,
+}
+
+impl Builder {
+ /// Create a new builder for a DFA.
+ pub fn new() -> Builder {
+ Builder { premultiply: true, byte_classes: true }
+ }
+
+ /// Build a DFA from the given NFA.
+ ///
+ /// This returns an error if the state identifiers exceed their
+ /// representation size. This can only happen when state ids are
+ /// premultiplied (which is enabled by default).
+ pub fn build<S: StateID>(&self, nfa: &NFA<S>) -> Result<DFA<S>> {
+ let byte_classes = if self.byte_classes {
+ nfa.byte_classes().clone()
+ } else {
+ ByteClasses::singletons()
+ };
+ let alphabet_len = byte_classes.alphabet_len();
+ let trans = vec![fail_id(); alphabet_len * nfa.state_len()];
+ let matches = vec![vec![]; nfa.state_len()];
+ let mut repr = Repr {
+ match_kind: nfa.match_kind().clone(),
+ anchored: nfa.anchored(),
+ premultiplied: false,
+ start_id: nfa.start_state(),
+ max_pattern_len: nfa.max_pattern_len(),
+ pattern_count: nfa.pattern_count(),
+ state_count: nfa.state_len(),
+ max_match: fail_id(),
+ heap_bytes: 0,
+ prefilter: nfa.prefilter_obj().map(|p| p.clone()),
+ byte_classes: byte_classes.clone(),
+ trans,
+ matches,
+ };
+ for id in (0..nfa.state_len()).map(S::from_usize) {
+ repr.matches[id.to_usize()].extend_from_slice(nfa.matches(id));
+
+ let fail = nfa.failure_transition(id);
+ nfa.iter_all_transitions(&byte_classes, id, |b, mut next| {
+ if next == fail_id() {
+ next = nfa_next_state_memoized(nfa, &repr, id, fail, b);
+ }
+ repr.set_next_state(id, b, next);
+ });
+ }
+ repr.shuffle_match_states();
+ repr.calculate_size();
+ if self.premultiply {
+ repr.premultiply()?;
+ if byte_classes.is_singleton() {
+ Ok(DFA::Premultiplied(Premultiplied(repr)))
+ } else {
+ Ok(DFA::PremultipliedByteClass(PremultipliedByteClass(repr)))
+ }
+ } else {
+ if byte_classes.is_singleton() {
+ Ok(DFA::Standard(Standard(repr)))
+ } else {
+ Ok(DFA::ByteClass(ByteClass(repr)))
+ }
+ }
+ }
+
+ /// Whether or not to use byte classes in the DFA.
+ pub fn byte_classes(&mut self, yes: bool) -> &mut Builder {
+ self.byte_classes = yes;
+ self
+ }
+
+ /// Whether or not to premultiply state identifiers in the DFA.
+ pub fn premultiply(&mut self, yes: bool) -> &mut Builder {
+ self.premultiply = yes;
+ self
+ }
+}
+
+/// This returns the next NFA transition (including resolving failure
+/// transitions), except that once it sees a state ID less than the ID of the
+/// DFA state currently being populated, it no longer needs to follow failure
+/// transitions and can instead query the pre-computed state ID from the DFA
+/// itself.
+///
+/// In general, this should only be called when a failure transition is seen.
+fn nfa_next_state_memoized<S: StateID>(
+ nfa: &NFA<S>,
+ dfa: &Repr<S>,
+ populating: S,
+ mut current: S,
+ input: u8,
+) -> S {
+ loop {
+ if current < populating {
+ return dfa.next_state(current, input);
+ }
+ let next = nfa.next_state(current, input);
+ if next != fail_id() {
+ return next;
+ }
+ current = nfa.failure_transition(current);
+ }
+}
diff --git a/src/error.rs b/src/error.rs
new file mode 100644
index 0000000..7dace63
--- /dev/null
+++ b/src/error.rs
@@ -0,0 +1,101 @@
+use std::error;
+use std::fmt;
+use std::result;
+
+pub type Result<T> = result::Result<T, Error>;
+
+/// An error that occurred during the construction of an Aho-Corasick
+/// automaton.
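+///
+/// As an illustrative (editorial) sketch of when this can happen, the
+/// pattern set below is hypothetical but large enough that a `u8` cannot
+/// identify every automaton state:
+///
+/// ```
+/// use aho_corasick::AhoCorasickBuilder;
+///
+/// // Roughly a thousand patterns produce far more than 256 states.
+/// let patterns: Vec<String> = (0..1000).map(|i| i.to_string()).collect();
+/// let result = AhoCorasickBuilder::new().build_with_size::<u8, _, _>(&patterns);
+/// assert!(result.is_err());
+/// ```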
+#[derive(Clone, Debug)]
+pub struct Error {
+ kind: ErrorKind,
+}
+
+/// The kind of error that occurred.
+#[derive(Clone, Debug)]
+pub enum ErrorKind {
+ /// An error that occurs when constructing an automaton would require the
+ /// use of a state ID that overflows the chosen state ID representation.
+ /// For example, if one is using `u8` for state IDs and builds a DFA with
+ /// 257 states, then the last state's ID will be `256` which cannot be
+ /// represented with `u8`.
+ StateIDOverflow {
+ /// The maximum possible state ID.
+ max: usize,
+ },
+ /// An error that occurs when premultiplication of state IDs is requested
+ /// when constructing an Aho-Corasick DFA, but doing so would overflow the
+ /// chosen state ID representation.
+ ///
+ /// When `max == requested_max`, then the state ID would overflow `usize`.
+ PremultiplyOverflow {
+ /// The maximum possible state id.
+ max: usize,
+ /// The maximum ID required by premultiplication.
+ requested_max: usize,
+ },
+}
+
+impl Error {
+ /// Return the kind of this error.
+ pub fn kind(&self) -> &ErrorKind {
+ &self.kind
+ }
+
+ pub(crate) fn state_id_overflow(max: usize) -> Error {
+ Error { kind: ErrorKind::StateIDOverflow { max } }
+ }
+
+ pub(crate) fn premultiply_overflow(
+ max: usize,
+ requested_max: usize,
+ ) -> Error {
+ Error { kind: ErrorKind::PremultiplyOverflow { max, requested_max } }
+ }
+}
+
+impl error::Error for Error {
+ fn description(&self) -> &str {
+ match self.kind {
+ ErrorKind::StateIDOverflow { .. } => {
+ "state id representation too small"
+ }
+ ErrorKind::PremultiplyOverflow { .. } => {
+ "state id representation too small for premultiplication"
+ }
+ }
+ }
+}
+
+impl fmt::Display for Error {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ match self.kind {
+ ErrorKind::StateIDOverflow { max } => write!(
+ f,
+ "building the automaton failed because it required \
+ building more states than can be identified, where the \
+ maximum ID for the chosen representation is {}",
+ max,
+ ),
+ ErrorKind::PremultiplyOverflow { max, requested_max } => {
+ if max == requested_max {
+ write!(
+ f,
+ "premultiplication of states requires the ability to \
+ represent a state ID greater than what can fit on \
+ this platform's usize, which is {}",
+ ::std::usize::MAX,
+ )
+ } else {
+ write!(
+ f,
+ "premultiplication of states requires the ability to \
+ represent at least a state ID of {}, but the chosen \
+ representation only permits a maximum state ID of {}",
+ requested_max, max,
+ )
+ }
+ }
+ }
+ }
+}
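For context, here is a sketch of how a `StateIDOverflow` can surface through the public API, assuming the `build_with_size` method on `AhoCorasickBuilder` in this crate version (which lets the caller pick the state ID representation):

```
use aho_corasick::AhoCorasickBuilder;

// With `u8` state identifiers, only 256 states are representable, so a large
// enough pattern set fails to build with a StateIDOverflow error.
let patterns: Vec<String> = (0..1000).map(|i| format!("pattern{}", i)).collect();
let result = AhoCorasickBuilder::new().build_with_size::<u8, _, _>(&patterns);
assert!(result.is_err());
```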
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..28e984b
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,297 @@
+/*!
+A library for finding occurrences of many patterns at once. This library
+provides multiple pattern search principally through an implementation of the
+[Aho-Corasick algorithm](https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm),
+which builds a fast finite state machine for executing searches in linear time.
+
+Additionally, this library provides a number of configuration options for
+building the automaton that permit controlling the space versus time trade
+off. Other features include simple ASCII case insensitive matching, finding
+overlapping matches, replacements, searching streams and even searching and
+replacing text in streams.
+
+Finally, unlike all other (known) Aho-Corasick implementations, this one
+supports enabling
+[leftmost-first](enum.MatchKind.html#variant.LeftmostFirst)
+or
+[leftmost-longest](enum.MatchKind.html#variant.LeftmostLongest)
+match semantics, using a (seemingly) novel alternative construction algorithm.
+For more details on what match semantics means, see the
+[`MatchKind`](enum.MatchKind.html)
+type.
+
+# Overview
+
+This section gives a brief overview of the primary types in this crate:
+
+* [`AhoCorasick`](struct.AhoCorasick.html) is the primary type and represents
+ an Aho-Corasick automaton. This is the type you use to execute searches.
+* [`AhoCorasickBuilder`](struct.AhoCorasickBuilder.html) can be used to build
+ an Aho-Corasick automaton, and supports configuring a number of options.
+* [`Match`](struct.Match.html) represents a single match reported by an
+ Aho-Corasick automaton. Each match has two pieces of information: the pattern
+ that matched and the start and end byte offsets corresponding to the position
+ in the haystack at which it matched.
+
+Additionally, the [`packed`](packed/index.html) sub-module contains a lower
+level API for using fast vectorized routines for finding a small number of
+patterns in a haystack.
+
+# Example: basic searching
+
+This example shows how to search for occurrences of multiple patterns
+simultaneously. Each match includes the pattern that matched along with the
+byte offsets of the match.
+
+```
+use aho_corasick::AhoCorasick;
+
+let patterns = &["apple", "maple", "Snapple"];
+let haystack = "Nobody likes maple in their apple flavored Snapple.";
+
+let ac = AhoCorasick::new(patterns);
+let mut matches = vec![];
+for mat in ac.find_iter(haystack) {
+ matches.push((mat.pattern(), mat.start(), mat.end()));
+}
+assert_eq!(matches, vec![
+ (1, 13, 18),
+ (0, 28, 33),
+ (2, 43, 50),
+]);
+```
+
+# Example: case insensitivity
+
+This is like the previous example, but matches `Snapple` case insensitively
+using `AhoCorasickBuilder`:
+
+```
+use aho_corasick::AhoCorasickBuilder;
+
+let patterns = &["apple", "maple", "snapple"];
+let haystack = "Nobody likes maple in their apple flavored Snapple.";
+
+let ac = AhoCorasickBuilder::new()
+ .ascii_case_insensitive(true)
+ .build(patterns);
+let mut matches = vec![];
+for mat in ac.find_iter(haystack) {
+ matches.push((mat.pattern(), mat.start(), mat.end()));
+}
+assert_eq!(matches, vec![
+ (1, 13, 18),
+ (0, 28, 33),
+ (2, 43, 50),
+]);
+```
+
+# Example: replacing matches in a stream
+
+This example shows how to execute a search and replace on a stream without
+loading the entire stream into memory first.
+
+```
+use aho_corasick::AhoCorasick;
+
+# fn example() -> Result<(), ::std::io::Error> {
+let patterns = &["fox", "brown", "quick"];
+let replace_with = &["sloth", "grey", "slow"];
+
+// In a real example, these might be `std::fs::File`s instead. All you need to
+// do is supply a pair of `std::io::Read` and `std::io::Write` implementations.
+let rdr = "The quick brown fox.";
+let mut wtr = vec![];
+
+let ac = AhoCorasick::new(patterns);
+ac.stream_replace_all(rdr.as_bytes(), &mut wtr, replace_with)?;
+assert_eq!(b"The slow grey sloth.".to_vec(), wtr);
+# Ok(()) }; example().unwrap()
+```
+
+# Example: finding the leftmost first match
+
+In the textbook description of Aho-Corasick, its formulation is typically
+structured such that it reports all possible matches, even when they overlap
+with another. In many cases, overlapping matches may not be desired, such as
+the case of finding all successive non-overlapping matches like you might with
+a standard regular expression.
+
+Unfortunately the "obvious" way to modify the Aho-Corasick algorithm to do
+this doesn't always work in the expected way, since it will report matches as
+soon as they are seen. For example, consider matching the regex `Samwise|Sam`
+against the text `Samwise`. Most regex engines (that are Perl-like, or
+non-POSIX) will report `Samwise` as a match, but the standard Aho-Corasick
+algorithm modified for reporting non-overlapping matches will report `Sam`.
+
+A novel contribution of this library is the ability to change the match
+semantics of Aho-Corasick (without additional search time overhead) such that
+`Samwise` is reported instead. For example, here's the standard approach:
+
+```
+use aho_corasick::AhoCorasick;
+
+let patterns = &["Samwise", "Sam"];
+let haystack = "Samwise";
+
+let ac = AhoCorasick::new(patterns);
+let mat = ac.find(haystack).expect("should have a match");
+assert_eq!("Sam", &haystack[mat.start()..mat.end()]);
+```
+
+And now here's the leftmost-first version, which matches how a Perl-like
+regex will work:
+
+```
+use aho_corasick::{AhoCorasickBuilder, MatchKind};
+
+let patterns = &["Samwise", "Sam"];
+let haystack = "Samwise";
+
+let ac = AhoCorasickBuilder::new()
+ .match_kind(MatchKind::LeftmostFirst)
+ .build(patterns);
+let mat = ac.find(haystack).expect("should have a match");
+assert_eq!("Samwise", &haystack[mat.start()..mat.end()]);
+```
+
+In addition to leftmost-first semantics, this library also supports
+leftmost-longest semantics, which match the POSIX behavior of a regular
+expression alternation. See
+[`MatchKind`](enum.MatchKind.html)
+for more details.
+
+# Prefilters
+
+While an Aho-Corasick automaton can perform admirably when compared to more
+naive solutions, it is generally slower than more specialized algorithms that
+are accelerated using vector instructions such as SIMD.
+
+For that reason, this library will internally use a "prefilter" to attempt
+to accelerate searches when possible. Currently, this library has a fairly
+limited implementation that only applies when there are 3 or fewer unique
+starting bytes among all patterns in an automaton.
+
+While a prefilter is generally good to have on by default since it works well
+in the common case, it can lead to less predictable or even sub-optimal
+performance in some cases. For that reason, prefilters can be disabled via
+[`AhoCorasickBuilder::prefilter`](struct.AhoCorasickBuilder.html#method.prefilter).
+*/
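As a small illustration of the prefilter knob mentioned at the end of the module documentation, here is a sketch of disabling it; match results are unchanged, only performance characteristics may differ:

```
use aho_corasick::AhoCorasickBuilder;

// Disable the prefilter; the search results are identical either way.
let ac = AhoCorasickBuilder::new()
    .prefilter(false)
    .build(&["apple", "maple"]);
let mat = ac.find("one maple syrup").expect("should match");
assert_eq!(1, mat.pattern());
```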
+
+#![deny(missing_docs)]
+
+// We can never be truly no_std, but we could be alloc-only some day, so
+// require the std feature for now.
+#[cfg(not(feature = "std"))]
+compile_error!("`std` feature is currently required to build this crate");
+
+extern crate memchr;
+#[cfg(test)]
+#[macro_use]
+extern crate doc_comment;
+
+#[cfg(test)]
+doctest!("../README.md");
+
+pub use ahocorasick::{
+ AhoCorasick, AhoCorasickBuilder, FindIter, FindOverlappingIter, MatchKind,
+ StreamFindIter,
+};
+pub use error::{Error, ErrorKind};
+pub use state_id::StateID;
+
+mod ahocorasick;
+mod automaton;
+mod buffer;
+mod byte_frequencies;
+mod classes;
+mod dfa;
+mod error;
+mod nfa;
+pub mod packed;
+mod prefilter;
+mod state_id;
+#[cfg(test)]
+mod tests;
+
+/// A representation of a match reported by an Aho-Corasick automaton.
+///
+/// A match has two essential pieces of information: the identifier of the
+/// pattern that matched, along with the start and end offsets of the match
+/// in the haystack.
+///
+/// # Examples
+///
+/// Basic usage:
+///
+/// ```
+/// use aho_corasick::AhoCorasick;
+///
+/// let ac = AhoCorasick::new(&[
+/// "foo", "bar", "baz",
+/// ]);
+/// let mat = ac.find("xxx bar xxx").expect("should have a match");
+/// assert_eq!(1, mat.pattern());
+/// assert_eq!(4, mat.start());
+/// assert_eq!(7, mat.end());
+/// ```
+#[derive(Clone, Debug, Eq, Hash, PartialEq)]
+pub struct Match {
+ /// The pattern id.
+ pattern: usize,
+ /// The length of this match, such that the starting position of the match
+ /// is `end - len`.
+ ///
+ /// We use length here because, other than the pattern id, the only
+ /// information about each pattern that the automaton stores is its length.
+ /// So using the length here is just a bit more natural. But it isn't
+ /// technically required.
+ len: usize,
+ /// The end offset of the match, exclusive.
+ end: usize,
+}
+
+impl Match {
+ /// Returns the identifier of the pattern that matched.
+ ///
+ /// The identifier of a pattern is derived from the position in which it
+ /// was originally inserted into the corresponding automaton. The first
+ /// pattern has identifier `0`, and each subsequent pattern is `1`, `2`
+ /// and so on.
+ #[inline]
+ pub fn pattern(&self) -> usize {
+ self.pattern
+ }
+
+ /// The starting position of the match.
+ #[inline]
+ pub fn start(&self) -> usize {
+ self.end - self.len
+ }
+
+ /// The ending position of the match.
+ #[inline]
+ pub fn end(&self) -> usize {
+ self.end
+ }
+
+ /// Returns true if and only if this match is empty. That is, when
+ /// `start() == end()`.
+ ///
+ /// An empty match can only be returned when the empty string was among
+ /// the patterns used to build the Aho-Corasick automaton.
+ #[inline]
+ pub fn is_empty(&self) -> bool {
+ self.len == 0
+ }
+
+ #[inline]
+ fn increment(&self, by: usize) -> Match {
+ Match { pattern: self.pattern, len: self.len, end: self.end + by }
+ }
+
+ #[inline]
+ fn from_span(id: usize, start: usize, end: usize) -> Match {
+ Match { pattern: id, len: end - start, end }
+ }
+}
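To round out the `is_empty` documentation above, a short sketch of the empty-match case; it assumes, per the docs, that an empty match can only arise when the empty pattern is among the patterns:

```
use aho_corasick::AhoCorasick;

// The empty pattern produces empty matches, where start() == end().
let ac = AhoCorasick::new(&[""]);
let mat = ac.find("haystack").expect("empty pattern should match");
assert!(mat.is_empty());
assert_eq!(mat.start(), mat.end());
```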
diff --git a/src/nfa.rs b/src/nfa.rs
new file mode 100644
index 0000000..809d5ef
--- /dev/null
+++ b/src/nfa.rs
@@ -0,0 +1,1363 @@
+use std::cmp;
+use std::collections::{BTreeSet, VecDeque};
+use std::fmt;
+use std::mem::size_of;
+use std::ops::{Index, IndexMut};
+
+use ahocorasick::MatchKind;
+use automaton::Automaton;
+use classes::{ByteClassBuilder, ByteClasses};
+use error::Result;
+use prefilter::{self, opposite_ascii_case, Prefilter, PrefilterObj};
+use state_id::{dead_id, fail_id, usize_to_state_id, StateID};
+use Match;
+
+/// The identifier for a pattern, which is simply the position of the pattern
+/// in the sequence of patterns given by the caller.
+pub type PatternID = usize;
+
+/// The length of a pattern, in bytes.
+pub type PatternLength = usize;
+
+/// An Aho-Corasick automaton, represented as an NFA.
+///
+/// This is the classical formulation of Aho-Corasick, which involves building
+/// up a prefix trie of a given set of patterns, and then wiring up failure
+/// transitions between states in order to guarantee linear time matching. The
+/// standard formulation is, technically, an NFA because of these failure
+/// transitions. That is, one can see them as enabling the automaton to be in
+/// multiple states at once. Indeed, during search, it is possible to check
+/// the transitions on multiple states for a single input byte.
+///
+/// This particular implementation not only supports the standard style of
+/// matching, but also provides a mode for choosing leftmost-first or
+/// leftmost-longest match semantics. When a leftmost mode is chosen, some
+/// failure transitions that would otherwise be added are elided. See
+/// the documentation of `MatchKind` for more details and examples on how the
+/// match semantics may differ.
+///
+/// If one wants a DFA, then it is necessary to first build an NFA and convert
+/// it into a DFA. Note, however, that because we've constrained ourselves to
+/// matching literal patterns, this does not need to use subset construction
+/// for determinization. Instead, the DFA has at most as many states as the
+/// NFA. The only real difference between
+/// them is that all failure transitions are followed and pre-computed. This
+/// uses much more memory, but also executes searches more quickly.
+#[derive(Clone)]
+pub struct NFA<S> {
+ /// The match semantics built into this NFA.
+ match_kind: MatchKind,
+ /// The start state id as an index into `states`.
+ start_id: S,
+ /// The length, in bytes, of the longest pattern in this automaton. This
+ /// information is useful for keeping correct buffer sizes when searching
+ /// on streams.
+ max_pattern_len: usize,
+ /// The total number of patterns added to this automaton, including
+ /// patterns that may never be matched.
+ pattern_count: usize,
+ /// The number of bytes of heap used by this NFA's transition table.
+ heap_bytes: usize,
+ /// A prefilter for quickly skipping to candidate matches, if pertinent.
+ prefilter: Option<PrefilterObj>,
+ /// Whether this automaton anchors all matches to the start of input.
+ anchored: bool,
+ /// A set of equivalence classes in terms of bytes. We compute this while
+ /// building the NFA, but don't use it in the NFA's states. Instead, we
+ /// use this for building the DFA. We store it on the NFA since it's easy
+ /// to compute while visiting the patterns.
+ byte_classes: ByteClasses,
+ /// A set of states. Each state defines its own transitions, a fail
+ /// transition and a set of indices corresponding to matches.
+ ///
+ /// The first state is always the fail state, which is used only as a
+ /// sentinel. Namely, in the final NFA, no transition into the fail state
+ /// exists. (Well, they do, but they aren't followed. Instead, the state's
+ /// failure transition is followed.)
+ ///
+ /// The second state (index 1) is always the dead state. Dead states are
+ /// in every automaton, but only used when leftmost-{first,longest} match
+ /// semantics are enabled. Specifically, they instruct search to stop
+ /// at specific points in order to report the correct match location. In
+ /// the standard Aho-Corasick construction, there are no transitions to
+ /// the dead state.
+ ///
+ /// The third state (index 2) is generally intended to be the starting or
+ /// "root" state.
+ states: Vec<State<S>>,
+}
+
+impl<S: StateID> NFA<S> {
+ /// Returns the equivalence classes of bytes found while constructing
+ /// this NFA.
+ ///
+ /// Note that the NFA doesn't actually make use of these equivalence
+ /// classes. Instead, these are useful for building the DFA when desired.
+ pub fn byte_classes(&self) -> &ByteClasses {
+ &self.byte_classes
+ }
+
+ /// Returns a prefilter, if one exists.
+ pub fn prefilter_obj(&self) -> Option<&PrefilterObj> {
+ self.prefilter.as_ref()
+ }
+
+ /// Returns the total number of heap bytes used by this NFA's transition
+ /// table.
+ pub fn heap_bytes(&self) -> usize {
+ self.heap_bytes
+ + self.prefilter.as_ref().map_or(0, |p| p.as_ref().heap_bytes())
+ }
+
+ /// Return the length of the longest pattern in this automaton.
+ pub fn max_pattern_len(&self) -> usize {
+ self.max_pattern_len
+ }
+
+ /// Return the total number of patterns added to this automaton.
+ pub fn pattern_count(&self) -> usize {
+ self.pattern_count
+ }
+
+ /// Returns the total number of states in this NFA.
+ pub fn state_len(&self) -> usize {
+ self.states.len()
+ }
+
+ /// Returns the matches for the given state.
+ pub fn matches(&self, id: S) -> &[(PatternID, PatternLength)] {
+ &self.states[id.to_usize()].matches
+ }
+
+ /// Returns an iterator over all transitions in the given state according
+ /// to the given equivalence classes, including transitions to `fail_id()`.
+ /// The number of transitions returned is always equivalent to the number
+ /// of equivalence classes.
+ pub fn iter_all_transitions<F: FnMut(u8, S)>(
+ &self,
+ byte_classes: &ByteClasses,
+ id: S,
+ f: F,
+ ) {
+ self.states[id.to_usize()].trans.iter_all(byte_classes, f);
+ }
+
+ /// Returns the failure transition for the given state.
+ pub fn failure_transition(&self, id: S) -> S {
+ self.states[id.to_usize()].fail
+ }
+
+ /// Returns the next state for the given state and input byte.
+ ///
+ /// Note that this does not follow failure transitions. As such, the id
+ /// returned may be `fail_id`.
+ pub fn next_state(&self, current: S, input: u8) -> S {
+ self.states[current.to_usize()].next_state(input)
+ }
+
+ fn state(&self, id: S) -> &State<S> {
+ &self.states[id.to_usize()]
+ }
+
+ fn state_mut(&mut self, id: S) -> &mut State<S> {
+ &mut self.states[id.to_usize()]
+ }
+
+ fn start(&self) -> &State<S> {
+ self.state(self.start_id)
+ }
+
+ fn start_mut(&mut self) -> &mut State<S> {
+ let id = self.start_id;
+ self.state_mut(id)
+ }
+
+ fn iter_transitions_mut(&mut self, id: S) -> IterTransitionsMut<S> {
+ IterTransitionsMut::new(self, id)
+ }
+
+ fn copy_matches(&mut self, src: S, dst: S) {
+ let (src, dst) =
+ get_two_mut(&mut self.states, src.to_usize(), dst.to_usize());
+ dst.matches.extend_from_slice(&src.matches);
+ }
+
+ fn copy_empty_matches(&mut self, dst: S) {
+ let start_id = self.start_id;
+ self.copy_matches(start_id, dst);
+ }
+
+ fn add_dense_state(&mut self, depth: usize) -> Result<S> {
+ let trans = Transitions::Dense(Dense::new());
+ let id = usize_to_state_id(self.states.len())?;
+ self.states.push(State {
+ trans,
+ // Anchored automatons do not have any failure transitions.
+ fail: if self.anchored { dead_id() } else { self.start_id },
+ depth,
+ matches: vec![],
+ });
+ Ok(id)
+ }
+
+ fn add_sparse_state(&mut self, depth: usize) -> Result<S> {
+ let trans = Transitions::Sparse(vec![]);
+ let id = usize_to_state_id(self.states.len())?;
+ self.states.push(State {
+ trans,
+ // Anchored automatons do not have any failure transitions.
+ fail: if self.anchored { dead_id() } else { self.start_id },
+ depth,
+ matches: vec![],
+ });
+ Ok(id)
+ }
+}
+
+impl<S: StateID> Automaton for NFA<S> {
+ type ID = S;
+
+ fn match_kind(&self) -> &MatchKind {
+ &self.match_kind
+ }
+
+ fn anchored(&self) -> bool {
+ self.anchored
+ }
+
+ fn prefilter(&self) -> Option<&dyn Prefilter> {
+ self.prefilter.as_ref().map(|p| p.as_ref())
+ }
+
+ fn start_state(&self) -> S {
+ self.start_id
+ }
+
+ fn is_valid(&self, id: S) -> bool {
+ id.to_usize() < self.states.len()
+ }
+
+ fn is_match_state(&self, id: S) -> bool {
+ self.states[id.to_usize()].is_match()
+ }
+
+ fn get_match(
+ &self,
+ id: S,
+ match_index: usize,
+ end: usize,
+ ) -> Option<Match> {
+ let state = match self.states.get(id.to_usize()) {
+ None => return None,
+ Some(state) => state,
+ };
+ state.matches.get(match_index).map(|&(id, len)| Match {
+ pattern: id,
+ len,
+ end,
+ })
+ }
+
+ fn match_count(&self, id: S) -> usize {
+ self.states[id.to_usize()].matches.len()
+ }
+
+ fn next_state(&self, mut current: S, input: u8) -> S {
+ // This terminates since:
+ //
+ // 1. `State.fail` never points to fail_id().
+ // 2. All `State.fail` values point to a state closer to `start`.
+ // 3. The start state has no transitions to fail_id().
+ loop {
+ let state = &self.states[current.to_usize()];
+ let next = state.next_state(input);
+ if next != fail_id() {
+ return next;
+ }
+ current = state.fail;
+ }
+ }
+}
+
+/// A representation of an NFA state for an Aho-Corasick automaton.
+///
+/// It contains the transitions to the next state, a failure transition for
+/// cases where there exists no other transition for the current input byte,
+/// the matches implied by visiting this state (if any) and the depth of this
+/// state. The depth of a state is simply the distance from it to the start
+/// state in the automaton, where the depth of the start state is 0.
+#[derive(Clone, Debug)]
+pub struct State<S> {
+ trans: Transitions<S>,
+ fail: S,
+ matches: Vec<(PatternID, PatternLength)>,
+ // TODO: Strictly speaking, this isn't needed for searching. It's only
+ // used when building an NFA that supports leftmost match semantics. We
+ // could drop this from the state and dynamically build a map only when
+ // computing failure transitions, but it's not clear which is better.
+ // Benchmark this.
+ depth: usize,
+}
+
+impl<S: StateID> State<S> {
+ fn heap_bytes(&self) -> usize {
+ self.trans.heap_bytes()
+ + (self.matches.len() * size_of::<(PatternID, PatternLength)>())
+ }
+
+ fn add_match(&mut self, i: PatternID, len: PatternLength) {
+ self.matches.push((i, len));
+ }
+
+ fn is_match(&self) -> bool {
+ !self.matches.is_empty()
+ }
+
+ fn get_longest_match_len(&self) -> Option<usize> {
+ // Why is this true? Because the first match in any matching state
+ // will always correspond to the match added to it during trie
+ // construction (since when we copy matches due to failure transitions,
+ // we always append them). Therefore, it follows that the first match
+ // must always be longest since any subsequent match must be from a
+ // failure transition, and a failure transition by construction points
+ // to a proper suffix. A proper suffix is, by definition, smaller.
+ self.matches.get(0).map(|&(_, len)| len)
+ }
+
+ fn next_state(&self, input: u8) -> S {
+ self.trans.next_state(input)
+ }
+
+ fn set_next_state(&mut self, input: u8, next: S) {
+ self.trans.set_next_state(input, next);
+ }
+}
+
+/// Represents the transitions for a single dense state.
+///
+/// The primary purpose here is to encapsulate index access. Namely, since a
+/// dense representation always contains 256 elements, all values of `u8` are
+/// valid indices.
+#[derive(Clone, Debug)]
+struct Dense<S>(Vec<S>);
+
+impl<S> Dense<S>
+where
+ S: StateID,
+{
+ fn new() -> Self {
+ Dense(vec![fail_id(); 256])
+ }
+
+ #[inline]
+ fn len(&self) -> usize {
+ self.0.len()
+ }
+}
+
+impl<S> Index<u8> for Dense<S> {
+ type Output = S;
+
+ #[inline]
+ fn index(&self, i: u8) -> &S {
+ // SAFETY: This is safe because all dense transitions have
+ // exactly 256 elements, so all u8 values are valid indices.
+ &self.0[i as usize]
+ }
+}
+
+impl<S> IndexMut<u8> for Dense<S> {
+ #[inline]
+ fn index_mut(&mut self, i: u8) -> &mut S {
+ // SAFETY: This is safe because all dense transitions have
+ // exactly 256 elements, so all u8 values are valid indices.
+ &mut self.0[i as usize]
+ }
+}
+
+/// A representation of transitions in an NFA.
+///
+/// Transitions have either a sparse representation, which is slower for
+/// lookups but uses less memory, or a dense representation, which is faster
+/// for lookups but uses more memory. In the sparse representation, the absence
+/// of a state implies a transition to `fail_id()`. Transitions to `dead_id()`
+/// are still explicitly represented.
+///
+/// For the NFA, by default, we use a dense representation for transitions for
+/// states close to the start state because it's likely these are the states
+/// that will be most frequently visited.
+#[derive(Clone, Debug)]
+enum Transitions<S> {
+ Sparse(Vec<(u8, S)>),
+ Dense(Dense<S>),
+}
+
+impl<S: StateID> Transitions<S> {
+ fn heap_bytes(&self) -> usize {
+ match *self {
+ Transitions::Sparse(ref sparse) => {
+ sparse.len() * size_of::<(u8, S)>()
+ }
+ Transitions::Dense(ref dense) => dense.len() * size_of::<S>(),
+ }
+ }
+
+ fn next_state(&self, input: u8) -> S {
+ match *self {
+ Transitions::Sparse(ref sparse) => {
+ for &(b, id) in sparse {
+ if b == input {
+ return id;
+ }
+ }
+ fail_id()
+ }
+ Transitions::Dense(ref dense) => dense[input],
+ }
+ }
+
+ fn set_next_state(&mut self, input: u8, next: S) {
+ match *self {
+ Transitions::Sparse(ref mut sparse) => {
+ match sparse.binary_search_by_key(&input, |&(b, _)| b) {
+ Ok(i) => sparse[i] = (input, next),
+ Err(i) => sparse.insert(i, (input, next)),
+ }
+ }
+ Transitions::Dense(ref mut dense) => {
+ dense[input] = next;
+ }
+ }
+ }
+
+ /// Iterate over transitions in this state while skipping over transitions
+ /// to `fail_id()`.
+ fn iter<F: FnMut(u8, S)>(&self, mut f: F) {
+ match *self {
+ Transitions::Sparse(ref sparse) => {
+ for &(b, id) in sparse {
+ f(b, id);
+ }
+ }
+ Transitions::Dense(ref dense) => {
+ for b in AllBytesIter::new() {
+ let id = dense[b];
+ if id != fail_id() {
+ f(b, id);
+ }
+ }
+ }
+ }
+ }
+
+ /// Iterate over all transitions in this state according to the given
+ /// equivalence classes, including transitions to `fail_id()`.
+ fn iter_all<F: FnMut(u8, S)>(&self, classes: &ByteClasses, mut f: F) {
+ if classes.is_singleton() {
+ match *self {
+ Transitions::Sparse(ref sparse) => {
+ sparse_iter(sparse, f);
+ }
+ Transitions::Dense(ref dense) => {
+ for b in AllBytesIter::new() {
+ f(b, dense[b]);
+ }
+ }
+ }
+ } else {
+ // In this case, we only want to yield a single byte for each
+ // equivalence class.
+ match *self {
+ Transitions::Sparse(ref sparse) => {
+ let mut last_class = None;
+ sparse_iter(sparse, |b, next| {
+ let class = classes.get(b);
+ if last_class != Some(class) {
+ last_class = Some(class);
+ f(b, next);
+ }
+ })
+ }
+ Transitions::Dense(ref dense) => {
+ for b in classes.representatives() {
+ f(b, dense[b]);
+ }
+ }
+ }
+ }
+ }
+}
+
+/// Iterator over transitions in a state, skipping transitions to `fail_id()`.
+///
+/// This abstracts over the representation of NFA transitions, which may be
+/// either in a sparse or dense representation.
+///
+/// This somewhat idiosyncratically borrows the NFA mutably, so that when one
+/// is iterating over transitions, the caller can still mutate the NFA. This
+/// is useful when creating failure transitions.
+#[derive(Debug)]
+struct IterTransitionsMut<'a, S: StateID + 'a> {
+ nfa: &'a mut NFA<S>,
+ state_id: S,
+ cur: usize,
+}
+
+impl<'a, S: StateID> IterTransitionsMut<'a, S> {
+ fn new(nfa: &'a mut NFA<S>, state_id: S) -> IterTransitionsMut<'a, S> {
+ IterTransitionsMut { nfa, state_id, cur: 0 }
+ }
+
+ fn nfa(&mut self) -> &mut NFA<S> {
+ self.nfa
+ }
+}
+
+impl<'a, S: StateID> Iterator for IterTransitionsMut<'a, S> {
+ type Item = (u8, S);
+
+ fn next(&mut self) -> Option<(u8, S)> {
+ match self.nfa.states[self.state_id.to_usize()].trans {
+ Transitions::Sparse(ref sparse) => {
+ if self.cur >= sparse.len() {
+ return None;
+ }
+ let i = self.cur;
+ self.cur += 1;
+ Some(sparse[i])
+ }
+ Transitions::Dense(ref dense) => {
+ while self.cur < dense.len() {
+ // There are always exactly 256 transitions in dense repr.
+ debug_assert!(self.cur < 256);
+
+ let b = self.cur as u8;
+ let id = dense[b];
+ self.cur += 1;
+ if id != fail_id() {
+ return Some((b, id));
+ }
+ }
+ None
+ }
+ }
+ }
+}
+
+/// A simple builder for configuring the NFA construction of Aho-Corasick.
+#[derive(Clone, Debug)]
+pub struct Builder {
+ dense_depth: usize,
+ match_kind: MatchKind,
+ prefilter: bool,
+ anchored: bool,
+ ascii_case_insensitive: bool,
+}
+
+impl Default for Builder {
+ fn default() -> Builder {
+ Builder {
+ dense_depth: 2,
+ match_kind: MatchKind::default(),
+ prefilter: true,
+ anchored: false,
+ ascii_case_insensitive: false,
+ }
+ }
+}
+
+impl Builder {
+ pub fn new() -> Builder {
+ Builder::default()
+ }
+
+ pub fn build<I, P, S: StateID>(&self, patterns: I) -> Result<NFA<S>>
+ where
+ I: IntoIterator<Item = P>,
+ P: AsRef<[u8]>,
+ {
+ Compiler::new(self)?.compile(patterns)
+ }
+
+ pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder {
+ self.match_kind = kind;
+ self
+ }
+
+ pub fn dense_depth(&mut self, depth: usize) -> &mut Builder {
+ self.dense_depth = depth;
+ self
+ }
+
+ pub fn prefilter(&mut self, yes: bool) -> &mut Builder {
+ self.prefilter = yes;
+ self
+ }
+
+ pub fn anchored(&mut self, yes: bool) -> &mut Builder {
+ self.anchored = yes;
+ self
+ }
+
+ pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder {
+ self.ascii_case_insensitive = yes;
+ self
+ }
+}
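These internal NFA builder knobs mirror options on the public `AhoCorasickBuilder`. A rough sketch of how they surface to users, assuming the `anchored` and `dense_depth` methods of this crate version:

```
use aho_corasick::AhoCorasickBuilder;

// Anchored matching only reports matches starting at the search position;
// dense_depth controls how many trie levels use the dense representation.
let ac = AhoCorasickBuilder::new()
    .anchored(true)
    .dense_depth(3)
    .build(&["foo", "bar"]);
assert!(ac.is_match("foobar"));
assert!(!ac.is_match("xfoobar"));
```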
+
+/// A compiler uses a builder configuration and builds up the NFA formulation
+/// of an Aho-Corasick automaton. This roughly corresponds to the standard
+/// formulation described in textbooks.
+#[derive(Debug)]
+struct Compiler<'a, S: StateID> {
+ builder: &'a Builder,
+ prefilter: prefilter::Builder,
+ nfa: NFA<S>,
+ byte_classes: ByteClassBuilder,
+}
+
+impl<'a, S: StateID> Compiler<'a, S> {
+ fn new(builder: &'a Builder) -> Result<Compiler<'a, S>> {
+ Ok(Compiler {
+ builder,
+ prefilter: prefilter::Builder::new(builder.match_kind)
+ .ascii_case_insensitive(builder.ascii_case_insensitive),
+ nfa: NFA {
+ match_kind: builder.match_kind,
+ start_id: usize_to_state_id(2)?,
+ max_pattern_len: 0,
+ pattern_count: 0,
+ heap_bytes: 0,
+ prefilter: None,
+ anchored: builder.anchored,
+ byte_classes: ByteClasses::singletons(),
+ states: vec![],
+ },
+ byte_classes: ByteClassBuilder::new(),
+ })
+ }
+
+ fn compile<I, P>(mut self, patterns: I) -> Result<NFA<S>>
+ where
+ I: IntoIterator<Item = P>,
+ P: AsRef<[u8]>,
+ {
+ self.add_state(0)?; // the fail state, which is never entered
+ self.add_state(0)?; // the dead state, only used for leftmost
+ self.add_state(0)?; // the start state
+ self.build_trie(patterns)?;
+ self.add_start_state_loop();
+ self.add_dead_state_loop();
+ if !self.builder.anchored {
+ if self.match_kind().is_leftmost() {
+ self.fill_failure_transitions_leftmost();
+ } else {
+ self.fill_failure_transitions_standard();
+ }
+ }
+ self.close_start_state_loop();
+ self.nfa.byte_classes = self.byte_classes.build();
+ if !self.builder.anchored {
+ self.nfa.prefilter = self.prefilter.build();
+ }
+ self.calculate_size();
+ Ok(self.nfa)
+ }
+
+ /// This sets up the initial prefix trie that makes up the Aho-Corasick
+ /// automaton. Effectively, it creates the basic structure of the
+ /// automaton, where every pattern given has a path from the start state to
+ /// the end of the pattern.
+ fn build_trie<I, P>(&mut self, patterns: I) -> Result<()>
+ where
+ I: IntoIterator<Item = P>,
+ P: AsRef<[u8]>,
+ {
+ 'PATTERNS: for (pati, pat) in patterns.into_iter().enumerate() {
+ let pat = pat.as_ref();
+ self.nfa.max_pattern_len =
+ cmp::max(self.nfa.max_pattern_len, pat.len());
+ self.nfa.pattern_count += 1;
+
+ let mut prev = self.nfa.start_id;
+ let mut saw_match = false;
+ for (depth, &b) in pat.iter().enumerate() {
+ // When leftmost-first match semantics are requested, we
+ // specifically stop adding patterns when a previously added
+ // pattern is a prefix of it. We avoid adding it because
+ // leftmost-first semantics imply that the pattern can never
+ // match. This is not just an optimization to save space! It
+ // is necessary for correctness. In fact, this is the only
+ // difference in the automaton between the implementations for
+ // leftmost-first and leftmost-longest.
+ saw_match = saw_match || self.nfa.state(prev).is_match();
+ if self.builder.match_kind.is_leftmost_first() && saw_match {
+ // Skip to the next pattern immediately. This avoids
+ // incorrectly adding a match after this loop terminates.
+ continue 'PATTERNS;
+ }
+
+ // Add this byte to our equivalence classes. We don't use these
+ // for NFA construction. These are instead used only if we're
+ // building a DFA. They would technically be useful for the
+ // NFA, but it would require a second pass over the patterns.
+ self.byte_classes.set_range(b, b);
+ if self.builder.ascii_case_insensitive {
+ let b = opposite_ascii_case(b);
+ self.byte_classes.set_range(b, b);
+ }
+
+ // If the transition from prev using the current byte already
+ // exists, then just move through it. Otherwise, add a new
+ // state. We track the depth here so that we can determine
+ // how to represent transitions. States near the start state
+ // use a dense representation that uses more memory but is
+ // faster. Other states use a sparse representation that uses
+ // less memory but is slower.
+ let next = self.nfa.state(prev).next_state(b);
+ if next != fail_id() {
+ prev = next;
+ } else {
+ let next = self.add_state(depth + 1)?;
+ self.nfa.state_mut(prev).set_next_state(b, next);
+ if self.builder.ascii_case_insensitive {
+ let b = opposite_ascii_case(b);
+ self.nfa.state_mut(prev).set_next_state(b, next);
+ }
+ prev = next;
+ }
+ }
+ // Once the pattern has been added, log the match in the final
+ // state that it reached.
+ self.nfa.state_mut(prev).add_match(pati, pat.len());
+ // ... and hand it to the prefilter builder, if applicable.
+ if self.builder.prefilter {
+ self.prefilter.add(pat);
+ }
+ }
+ Ok(())
+ }
+
+ /// This routine creates failure transitions according to the standard
+ /// textbook formulation of the Aho-Corasick algorithm.
+ ///
+ /// Building failure transitions is the most interesting part of building
+ /// the Aho-Corasick automaton, because they are what allow searches to
+ /// be performed in linear time. Specifically, a failure transition is
+ /// a single transition associated with each state that points back to
+ /// the longest proper suffix of the pattern being searched. The failure
+ /// transition is followed whenever there exists no transition on the
+ /// current state for the current input byte. If there is no other proper
+ /// suffix, then the failure transition points back to the starting state.
+ ///
+ /// For example, let's say we built an Aho-Corasick automaton with the
+ /// following patterns: 'abcd' and 'cef'. The trie looks like this:
+ ///
+ /// ```ignore
+ /// a - S1 - b - S2 - c - S3 - d - S4*
+ /// /
+ /// S0 - c - S5 - e - S6 - f - S7*
+ /// ```
+ ///
+ /// At this point, it should be fairly straight-forward to see how this
+ /// trie can be used in a simplistic way. At any given position in the
+ /// text we're searching (called the "subject" string), all we need to do
+ /// is follow the transitions in the trie by consuming one transition for
+ /// each byte in the subject string. If we reach a match state, then we can
+ /// report that location as a match.
+ ///
+ /// The trick comes when searching a subject string like 'abcef'. We'll
+ /// initially follow the transition from S0 to S1 and wind up in S3 after
+ /// observing the 'c' byte. At this point, the next byte is 'e' but state
+ /// S3 has no transition for 'e', so the search fails. We then would need
+ /// to restart the search at the next position in 'abcef', which
+ /// corresponds to 'b'. The match would fail, but the next search starting
+ /// at 'c' would finally succeed. The problem with this approach is that
+ /// we wind up searching the subject string potentially many times. In
+ /// effect, this makes the algorithm have worst case `O(n * m)` complexity,
+ /// where `n ~ len(subject)` and `m ~ len(all patterns)`. We would instead
+ /// like to achieve a `O(n + m)` worst case complexity.
+ ///
+ /// This is where failure transitions come in. Instead of dying at S3 in
+ /// the first search, the automaton can instruct the search to move to
+ /// another part of the automaton that corresponds to a suffix of what
+ /// we've seen so far. Recall that we've seen 'abc' in the subject string,
+ /// and the automaton does indeed have a non-empty suffix, 'c', that could
+ /// potentially lead to another match. Thus, the actual Aho-Corasick
+ /// automaton for our patterns in this case looks like this:
+ ///
+ /// ```ignore
+ /// a - S1 - b - S2 - c - S3 - d - S4*
+ /// / /
+ /// / ----------------
+ /// / /
+ /// S0 - c - S5 - e - S6 - f - S7*
+ /// ```
+ ///
+ /// That is, we have a failure transition from S3 to S5, which is followed
+ /// exactly in cases when we are in state S3 but see any byte other than
+ /// 'd' (that is, we've "failed" to find a match in this portion of our
+ /// trie). We know we can transition back to S5 because we've already seen
+ /// a 'c' byte, so we don't need to re-scan it. We can then pick back up
+ /// with the search starting at S5 and complete our match.
+ ///
+ /// Adding failure transitions to a trie is fairly simple, but subtle. The
+ /// key issue is that you might have multiple failure transitions that you
+ /// need to follow. For example, look at the trie for the patterns
+ /// 'abcd', 'b', 'bcd' and 'cd':
+ ///
+ /// ```ignore
+ /// - a - S1 - b - S2 - c - S3 - d - S4*
+ /// /
+ /// S0 - b - S5* - c - S6 - d - S7*
+ /// \
+ /// - c - S8 - d - S9*
+ /// ```
+ ///
+ /// The failure transitions for this trie are defined from S2 to S5,
+ /// S3 to S6 and S6 to S8. Moreover, state S2 needs to track that it
+ /// corresponds to a match, since its failure transition to S5 is itself
+ /// a match state.
+ ///
+ /// Perhaps the simplest way to think about adding these failure transitions
+ /// is recursively. That is, if you know the failure transitions for every
+ /// possible previous state that could be visited (e.g., when computing the
+ /// failure transition for S3, you already know the failure transitions
+ /// for S0, S1 and S2), then you can simply follow the failure transition
+ /// of the previous state and check whether the incoming transition is
+ /// defined after following the failure transition.
+ ///
+ /// For example, when determining the failure state for S3, by our
+ /// assumptions, we already know that there is a failure transition from
+ /// S2 (the previous state) to S5. So we follow that transition and check
+ /// whether the transition connecting S2 to S3 is defined. Indeed, it is,
+ /// as there is a transition from S5 to S6 for the byte 'c'. If no such
+ /// transition existed, we could keep following the failure transitions
+ /// until we reach the start state, which is the failure transition for
+ /// every state that has no corresponding proper suffix.
+ ///
+ /// We don't actually use recursion to implement this, but instead, use a
+ /// breadth first search of the automaton. Our base case is the start
+ /// state, whose failure transition is just a transition to itself.
+ fn fill_failure_transitions_standard(&mut self) {
+ // Initialize the queue for breadth first search with all transitions
+ // out of the start state. We handle the start state specially because
+ // we only want to follow non-self transitions. If we followed self
+ // transitions, then this would never terminate.
+ let mut queue = VecDeque::new();
+ let mut seen = self.queued_set();
+ for b in AllBytesIter::new() {
+ let next = self.nfa.start().next_state(b);
+ if next != self.nfa.start_id {
+ if !seen.contains(next) {
+ queue.push_back(next);
+ seen.insert(next);
+ }
+ }
+ }
+ while let Some(id) = queue.pop_front() {
+ let mut it = self.nfa.iter_transitions_mut(id);
+ while let Some((b, next)) = it.next() {
+ if !seen.contains(next) {
+ queue.push_back(next);
+ seen.insert(next);
+ }
+
+ let mut fail = it.nfa().state(id).fail;
+ while it.nfa().state(fail).next_state(b) == fail_id() {
+ fail = it.nfa().state(fail).fail;
+ }
+ fail = it.nfa().state(fail).next_state(b);
+ it.nfa().state_mut(next).fail = fail;
+ it.nfa().copy_matches(fail, next);
+ }
+ // If the start state is a match state, then this automaton can
+ // match the empty string. This implies all states are match states
+ // since every position matches the empty string, so copy the
+ // matches from the start state to every state. Strictly speaking,
+ // this is only necessary for overlapping matches since each
+ // non-empty non-start match state needs to report empty matches
+ // in addition to its own. For the non-overlapping case, such
+ // states only report the first match, which is never empty since
+ // it isn't a start state.
+ it.nfa().copy_empty_matches(id);
+ }
+ }
+
+ /// This routine is just like fill_failure_transitions_standard, except
+ /// it adds failure transitions in a way that preserves leftmost match
+ /// semantics (for both leftmost-first and leftmost-longest).
+ ///
+ /// The algorithms are so similar that it would be possible to write it
+ /// generically. But doing so without overhead would require a bit of
+ /// ceremony, so we just copy it and add in the extra leftmost logic.
+ /// Moreover, the standard algorithm above is so simple that it feels like
+ /// a crime to disturb it.
+ ///
+ /// In effect, this proceeds just like the standard approach, but we
+ /// specifically add only a subset of all failure transitions. Namely, we
+ /// only add failure transitions that either do not occur after a match
+ /// or failure transitions that do occur after a match but preserve the
+ /// match. The comments in the implementation below should help.
+ ///
+ /// N.B. The only differences in the automaton between leftmost-first and
+ /// leftmost-longest are in trie construction. Otherwise, both have exactly
+ /// the same set of failure transitions. leftmost-longest adds everything
+ /// to the trie, whereas leftmost-first skips any patterns for which there
+ /// exists a prefix of it that was added earlier.
+ ///
+ /// N.B. I came up with this algorithm on my own, and after scouring all of
+ /// the other AC implementations I know of (Perl, Snort, many on GitHub),
+ /// I couldn't find any that implement leftmost semantics like this.
+ /// Perl of course needs leftmost-first semantics, but they implement it
+ /// with a seeming hack at *search* time instead of encoding it into the
+ /// automaton. There are also a couple Java libraries that support leftmost
+ /// longest semantics, but they do it by building a queue of matches at
+ /// search time, which is even worse than what Perl is doing. ---AG
+ fn fill_failure_transitions_leftmost(&mut self) {
+ /// Represents an item in our queue of states to process.
+ ///
+ /// Fundamentally, this queue serves the same purpose as the queue
+ /// for filling failure transitions using the standard formulation.
+ /// In the leftmost case, though, we need to track a bit more
+ /// information. See comments below.
+ #[derive(Clone, Copy, Debug)]
+ struct QueuedState<S> {
+ /// The id of the state to visit.
+ id: S,
+ /// The depth at which the first match was observed in the path
+ /// to this state. Note that this corresponds to the depth at
+ /// which the beginning of the match was detected. If no match
+ /// has been seen, then this is None.
+ match_at_depth: Option<usize>,
+ }
+
+ impl<S: StateID> QueuedState<S> {
+ /// Create a queued state corresponding to the given NFA's start
+ /// state.
+ fn start(nfa: &NFA<S>) -> QueuedState<S> {
+ let match_at_depth =
+ if nfa.start().is_match() { Some(0) } else { None };
+ QueuedState { id: nfa.start_id, match_at_depth }
+ }
+
+ /// Return the next state to queue up. The given id must be a state
+ /// corresponding to a single transition from this queued state.
+ fn next_queued_state(
+ &self,
+ nfa: &NFA<S>,
+ id: S,
+ ) -> QueuedState<S> {
+ let match_at_depth = self.next_match_at_depth(nfa, id);
+ QueuedState { id, match_at_depth }
+ }
+
+ /// Return the earliest depth at which a match has occurred for
+ /// the given state. The given state must correspond to a single
+ /// transition from this queued state.
+ fn next_match_at_depth(
+ &self,
+ nfa: &NFA<S>,
+ next: S,
+ ) -> Option<usize> {
+ // This is a little tricky. If the previous state has already
+ // seen a match or if `next` isn't a match state, then nothing
+ // needs to change since a later state cannot find an earlier
+ // match.
+ match self.match_at_depth {
+ Some(x) => return Some(x),
+ None if nfa.state(next).is_match() => {}
+ None => return None,
+ }
+ let depth = nfa.state(next).depth
+ - nfa.state(next).get_longest_match_len().unwrap()
+ + 1;
+ Some(depth)
+ }
+ }
+
+ // Initialize the queue for breadth first search with all transitions
+ // out of the start state. We handle the start state specially because
+ // we only want to follow non-self transitions. If we followed self
+ // transitions, then this would never terminate.
+ let mut queue: VecDeque<QueuedState<S>> = VecDeque::new();
+ let mut seen = self.queued_set();
+ let start = QueuedState::start(&self.nfa);
+ for b in AllBytesIter::new() {
+ let next_id = self.nfa.start().next_state(b);
+ if next_id != start.id {
+ let next = start.next_queued_state(&self.nfa, next_id);
+ if !seen.contains(next.id) {
+ queue.push_back(next);
+ seen.insert(next.id);
+ }
+ // If a state immediately following the start state is a match
+ // state, then we never want to follow its failure transition
+ // since the failure transition necessarily leads back to the
+ // start state, which we never want to do for leftmost matching
+ // after a match has been found.
+ //
+ // N.B. This is a special case of the more general handling
+ // found below.
+ if self.nfa.state(next_id).is_match() {
+ self.nfa.state_mut(next_id).fail = dead_id();
+ }
+ }
+ }
+ while let Some(item) = queue.pop_front() {
+ let mut any_trans = false;
+ let mut it = self.nfa.iter_transitions_mut(item.id);
+ while let Some((b, next_id)) = it.next() {
+ any_trans = true;
+
+ // Queue up the next state.
+ let next = item.next_queued_state(it.nfa(), next_id);
+ if !seen.contains(next.id) {
+ queue.push_back(next);
+ seen.insert(next.id);
+ }
+
+ // Find the failure state for next. Same as standard.
+ let mut fail = it.nfa().state(item.id).fail;
+ while it.nfa().state(fail).next_state(b) == fail_id() {
+ fail = it.nfa().state(fail).fail;
+ }
+ fail = it.nfa().state(fail).next_state(b);
+
+ // This is the key difference from the standard formulation.
+ // Namely, if we've seen a match, then we only want a failure
+ // transition if the failure transition preserves the match
+ // we've seen. In general, this is not true of all failure
+ // transitions since they can point back to any suffix of what
+ // we've seen so far. Instead, we only want to point back to
+ // suffixes that contain any match we've seen.
+ //
+ // We achieve this by comparing the depth of the failure
+ // transition with the number of states between this state
+ // and the beginning of the earliest match detected. If the
+ // depth of the failure state is smaller than this difference,
+ // then it cannot contain the match. If it's bigger or equal
+ // to the difference, then it necessarily includes the match
+ // we've seen since all failure transitions correspond to a
+ // suffix.
+ //
+ // If we've determined that we don't want the failure
+ // transition, then we set this state's failure transition to
+ // the dead state. In other words, when a search hits this
+ // state, it will not continue and correctly stop. (N.B. A
+ // dead state is different than a fail state. A dead state
+ // MUST be preceded by a match and acts as a sentinel to search
+ // routines to terminate.)
+ //
+ // Understanding this is tricky, and it took me several days
+ // to think through this and get it right. If you want to grok
+ // it, then I'd recommend: 1) switch the implementation to
+ // always use the standard algorithm for filling in failure
+ // transitions, 2) run the test suite and 3) examine the test
+ // failures. Write out the automatons for them and try to work
+ // backwards by figuring out which failure transitions should
+ // be removed. You should arrive at the same rule used below.
+ if let Some(match_depth) = next.match_at_depth {
+ let fail_depth = it.nfa().state(fail).depth;
+ let next_depth = it.nfa().state(next.id).depth;
+ if next_depth - match_depth + 1 > fail_depth {
+ it.nfa().state_mut(next.id).fail = dead_id();
+ continue;
+ }
+ assert_ne!(
+ start.id,
+ it.nfa().state(next.id).fail,
+ "states that are match states or follow match \
+ states should never have a failure transition \
+ back to the start state in leftmost searching",
+ );
+ }
+ it.nfa().state_mut(next.id).fail = fail;
+ it.nfa().copy_matches(fail, next.id);
+ }
+ // If there are no transitions for this state and if it's a match
+ // state, then we must set its failure transition to the dead
+ // state since we never want it to restart the search.
+ if !any_trans && it.nfa().state(item.id).is_match() {
+ it.nfa().state_mut(item.id).fail = dead_id();
+ }
+ // We don't need to copy empty matches from the start state here
+ // because that's only necessary for overlapping matches and
+ // leftmost match kinds don't support overlapping matches.
+ }
+ }
+
+ /// Returns a set that tracked queued states.
+ ///
+ /// This is only necessary when ASCII case insensitivity is enabled, since
+ /// it is the only way to visit the same state twice. Otherwise, this
+ /// returns an inert set that never adds anything and always reports
+ /// `false` for every member test.
+ fn queued_set(&self) -> QueuedSet<S> {
+ if self.builder.ascii_case_insensitive {
+ QueuedSet::active()
+ } else {
+ QueuedSet::inert()
+ }
+ }
+
+ /// Set the failure transitions on the start state to loop back to the
+ /// start state. This effectively permits the Aho-Corasick automaton to
+ /// match at any position. This is also required for next-state lookups to
+ /// terminate; namely, finding the next state should never return a
+ /// fail_id.
+ ///
+ /// This must be done after building the initial trie, since trie
+ /// construction depends on transitions to `fail_id` to determine whether a
+ /// state already exists or not.
+ fn add_start_state_loop(&mut self) {
+ let start_id = self.nfa.start_id;
+ let start = self.nfa.start_mut();
+ for b in AllBytesIter::new() {
+ if start.next_state(b) == fail_id() {
+ start.set_next_state(b, start_id);
+ }
+ }
+ }
+
+ /// Remove the start state loop by rewriting any transitions on the start
+ /// state back to the start state with transitions to the dead state.
+ ///
+ /// The loop is only closed when two conditions are met: the start state
+ /// is a match state and the match kind is leftmost-first or
+ /// leftmost-longest. (Alternatively, if this is an anchored automaton,
+ /// then the start state is always closed, regardless of the aforementioned
+ /// conditions.)
+ ///
+ /// The reason for this is that under leftmost semantics, a start state
+ /// that is also a match implies that we should never restart the search
+ /// process. We allow normal transitions out of the start state, but if
+ /// none exist, we transition to the dead state, which signals that
+ /// searching should stop.
+ fn close_start_state_loop(&mut self) {
+ if self.builder.anchored
+ || (self.match_kind().is_leftmost() && self.nfa.start().is_match())
+ {
+ let start_id = self.nfa.start_id;
+ let start = self.nfa.start_mut();
+ for b in AllBytesIter::new() {
+ if start.next_state(b) == start_id {
+ start.set_next_state(b, dead_id());
+ }
+ }
+ }
+ }
+
+ /// Sets all transitions on the dead state to point back to the dead state.
+ /// Normally, missing transitions map back to the failure state, but the
+ /// point of the dead state is to act as a sink that can never be escaped.
+ fn add_dead_state_loop(&mut self) {
+ let dead = self.nfa.state_mut(dead_id());
+ for b in AllBytesIter::new() {
+ dead.set_next_state(b, dead_id());
+ }
+ }
+
+ /// Computes the total amount of heap used by this NFA in bytes.
+ fn calculate_size(&mut self) {
+ let mut size = 0;
+ for state in &self.nfa.states {
+ size += state.heap_bytes();
+ }
+ self.nfa.heap_bytes = size;
+ }
+
+ /// Add a new state to the underlying NFA with the given depth. The depth
+ /// is used to determine how to represent the transitions.
+ ///
+ /// If adding the new state would overflow the chosen state ID
+ /// representation, then this returns an error.
+ fn add_state(&mut self, depth: usize) -> Result<S> {
+ if depth < self.builder.dense_depth {
+ self.nfa.add_dense_state(depth)
+ } else {
+ self.nfa.add_sparse_state(depth)
+ }
+ }
+
+ /// Returns the match kind configured on the underlying builder.
+ fn match_kind(&self) -> MatchKind {
+ self.builder.match_kind
+ }
+}
+
+/// A set of state identifiers used to avoid revisiting the same state multiple
+/// times when filling in failure transitions.
+///
+/// This set has an "inert" and an "active" mode. When inert, the set never
+/// stores anything and always returns `false` for every member test. This is
+/// useful to avoid the performance and memory overhead of maintaining this
+/// set when it is not needed.
+#[derive(Debug)]
+struct QueuedSet<S> {
+ set: Option<BTreeSet<S>>,
+}
+
+impl<S: StateID> QueuedSet<S> {
+ /// Return an inert set that returns `false` for every state ID membership
+ /// test.
+ fn inert() -> QueuedSet<S> {
+ QueuedSet { set: None }
+ }
+
+ /// Return an active set that tracks state ID membership.
+ fn active() -> QueuedSet<S> {
+ QueuedSet { set: Some(BTreeSet::new()) }
+ }
+
+ /// Inserts the given state ID into this set. (If the set is inert, then
+ /// this is a no-op.)
+ fn insert(&mut self, state_id: S) {
+ if let Some(ref mut set) = self.set {
+ set.insert(state_id);
+ }
+ }
+
+ /// Returns true if and only if the given state ID is in this set. If the
+ /// set is inert, this always returns false.
+ fn contains(&self, state_id: S) -> bool {
+ match self.set {
+ None => false,
+ Some(ref set) => set.contains(&state_id),
+ }
+ }
+}
+
+/// An iterator over every byte value.
+///
+/// We use this instead of (0..256).map(|b| b as u8) because this optimizes
+/// better in debug builds.
+///
+/// We also use this instead of 0..=255 because we're targeting Rust 1.24 and
+/// inclusive range syntax was stabilized in Rust 1.26. We can get rid of this
+/// once our MSRV is Rust 1.26 or newer.
+#[derive(Debug)]
+struct AllBytesIter(u16);
+
+impl AllBytesIter {
+ fn new() -> AllBytesIter {
+ AllBytesIter(0)
+ }
+}
+
+impl Iterator for AllBytesIter {
+ type Item = u8;
+
+ fn next(&mut self) -> Option<Self::Item> {
+ if self.0 >= 256 {
+ None
+ } else {
+ let b = self.0 as u8;
+ self.0 += 1;
+ Some(b)
+ }
+ }
+}
+
+impl<S: StateID> fmt::Debug for NFA<S> {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ writeln!(f, "NFA(")?;
+ writeln!(f, "match_kind: {:?}", self.match_kind)?;
+ writeln!(f, "prefilter: {:?}", self.prefilter)?;
+ writeln!(f, "{}", "-".repeat(79))?;
+ for (id, s) in self.states.iter().enumerate() {
+ let mut trans = vec![];
+ s.trans.iter(|byte, next| {
+ // The start state has a bunch of uninteresting transitions
+ // back into itself. It's questionable to hide them since they
+ // are critical to understanding the automaton, but they are
+ // very noisy without better formatting for contiguous ranges
+ // to the same state.
+ if id == self.start_id.to_usize() && next == self.start_id {
+ return;
+ }
+ // Similarly, the dead state has a bunch of uninteresting
+ // transitions too.
+ if id == dead_id() {
+ return;
+ }
+ trans.push(format!("{} => {}", escape(byte), next.to_usize()));
+ });
+ writeln!(f, "{:04}: {}", id, trans.join(", "))?;
+
+ let matches: Vec<String> = s
+ .matches
+ .iter()
+ .map(|&(pattern_id, _)| pattern_id.to_string())
+ .collect();
+ writeln!(f, " matches: {}", matches.join(", "))?;
+ writeln!(f, " fail: {}", s.fail.to_usize())?;
+ writeln!(f, " depth: {}", s.depth)?;
+ }
+ writeln!(f, "{}", "-".repeat(79))?;
+ writeln!(f, ")")?;
+ Ok(())
+ }
+}
+
+/// Iterate over all possible byte transitions given a sparse set.
+fn sparse_iter<S: StateID, F: FnMut(u8, S)>(trans: &[(u8, S)], mut f: F) {
+ let mut byte = 0u16;
+ for &(b, id) in trans {
+ while byte < (b as u16) {
+ f(byte as u8, fail_id());
+ byte += 1;
+ }
+ f(b, id);
+ byte += 1;
+ }
+ for b in byte..256 {
+ f(b as u8, fail_id());
+ }
+}
+
+/// Safely return two mutable borrows to two different locations in the given
+/// slice.
+///
+/// This panics if i == j.
+fn get_two_mut<T>(xs: &mut [T], i: usize, j: usize) -> (&mut T, &mut T) {
+ assert!(i != j, "{} must not be equal to {}", i, j);
+ if i < j {
+ let (before, after) = xs.split_at_mut(j);
+ (&mut before[i], &mut after[0])
+ } else {
+ let (before, after) = xs.split_at_mut(i);
+ (&mut after[0], &mut before[j])
+ }
+}
+
+/// Return the given byte as its escaped string form.
+fn escape(b: u8) -> String {
+ use std::ascii;
+
+ String::from_utf8(ascii::escape_default(b).collect::<Vec<_>>()).unwrap()
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn scratch() {
+ let nfa: NFA<usize> = Builder::new()
+ .dense_depth(0)
+ // .match_kind(MatchKind::LeftmostShortest)
+ // .match_kind(MatchKind::LeftmostLongest)
+ .match_kind(MatchKind::LeftmostFirst)
+ // .build(&["abcd", "ce", "b"])
+ // .build(&["ab", "bc"])
+ // .build(&["b", "bcd", "ce"])
+ // .build(&["abc", "bx"])
+ // .build(&["abc", "bd", "ab"])
+ // .build(&["abcdefghi", "hz", "abcdefgh"])
+ // .build(&["abcd", "bce", "b"])
+ .build(&["abcdefg", "bcde", "bcdef"])
+ .unwrap();
+ println!("{:?}", nfa);
+ }
+}
diff --git a/src/packed/api.rs b/src/packed/api.rs
new file mode 100644
index 0000000..3a316b5
--- /dev/null
+++ b/src/packed/api.rs
@@ -0,0 +1,632 @@
+use std::u16;
+
+use packed::pattern::Patterns;
+use packed::rabinkarp::RabinKarp;
+use packed::teddy::{self, Teddy};
+use Match;
+
+/// This is a limit placed on the total number of patterns we're willing to try
+/// and match at once. As more sophisticated algorithms are added, this number
+/// may be increased.
+const PATTERN_LIMIT: usize = 128;
+
+/// A knob for controlling the match semantics of a packed multiple string
+/// searcher.
+///
+/// This differs from the
+/// [`MatchKind`](../enum.MatchKind.html)
+/// type in the top-level crate module in that it doesn't support
+/// "standard" match semantics, and instead only supports leftmost-first or
+/// leftmost-longest. Namely, "standard" semantics cannot be easily supported
+/// by packed searchers.
+///
+/// For more information on the distinction between leftmost-first and
+/// leftmost-longest, see the docs on the top-level `MatchKind` type.
+///
+/// Unlike the top-level `MatchKind` type, the default match semantics for this
+/// type are leftmost-first.
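+///
+/// # Example
+///
+/// A minimal illustration of the default (the `Default` impl below is what
+/// provides `MatchKind::default()`):
+///
+/// ```
+/// use aho_corasick::packed::MatchKind;
+///
+/// assert_eq!(MatchKind::LeftmostFirst, MatchKind::default());
+/// ```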
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum MatchKind {
+ /// Use leftmost-first match semantics, which reports leftmost matches.
+ /// When there are multiple possible leftmost matches, the match
+ /// corresponding to the pattern that appeared earlier when constructing
+ /// the automaton is reported.
+ ///
+ /// This is the default.
+ LeftmostFirst,
+ /// Use leftmost-longest match semantics, which reports leftmost matches.
+ /// When there are multiple possible leftmost matches, the longest match
+ /// is chosen.
+ LeftmostLongest,
+ /// Hints that destructuring should not be exhaustive.
+ ///
+ /// This enum may grow additional variants, so this makes sure clients
+ /// don't count on exhaustive matching. (Otherwise, adding a new variant
+ /// could break existing code.)
+ #[doc(hidden)]
+ __Nonexhaustive,
+}
+
+impl Default for MatchKind {
+ fn default() -> MatchKind {
+ MatchKind::LeftmostFirst
+ }
+}
+
+/// The configuration for a packed multiple pattern searcher.
+///
+/// The configuration is currently limited only to being able to select the
+/// match semantics (leftmost-first or leftmost-longest) of a searcher. In the
+/// future, more knobs may be made available.
+///
+/// A configuration produces a [`packed::Builder`](struct.Builder.html), which
+/// in turn can be used to construct a
+/// [`packed::Searcher`](struct.Searcher.html) for searching.
+///
+/// # Example
+///
+/// This example shows how to use leftmost-longest semantics instead of the
+/// default (leftmost-first).
+///
+/// ```
+/// use aho_corasick::packed::{Config, MatchKind};
+///
+/// # fn example() -> Option<()> {
+/// let searcher = Config::new()
+/// .match_kind(MatchKind::LeftmostLongest)
+/// .builder()
+/// .add("foo")
+/// .add("foobar")
+/// .build()?;
+/// let matches: Vec<usize> = searcher
+/// .find_iter("foobar")
+/// .map(|mat| mat.pattern())
+/// .collect();
+/// assert_eq!(vec![1], matches);
+/// # Some(()) }
+/// # if cfg!(target_arch = "x86_64") {
+/// # example().unwrap()
+/// # } else {
+/// # assert!(example().is_none());
+/// # }
+/// ```
+#[derive(Clone, Debug)]
+pub struct Config {
+ kind: MatchKind,
+ force: Option<ForceAlgorithm>,
+ force_teddy_fat: Option<bool>,
+ force_avx: Option<bool>,
+}
+
+/// An internal option for forcing the use of a particular packed algorithm.
+///
+/// When an algorithm is forced, if a searcher could not be constructed for it,
+/// then no searcher will be returned even if an alternative algorithm would
+/// work.
+#[derive(Clone, Debug)]
+enum ForceAlgorithm {
+ Teddy,
+ RabinKarp,
+}
+
+impl Default for Config {
+ fn default() -> Config {
+ Config::new()
+ }
+}
+
+impl Config {
+ /// Create a new default configuration. A default configuration uses
+ /// leftmost-first match semantics.
+ pub fn new() -> Config {
+ Config {
+ kind: MatchKind::LeftmostFirst,
+ force: None,
+ force_teddy_fat: None,
+ force_avx: None,
+ }
+ }
+
+ /// Create a packed builder from this configuration. The builder can be
+ /// used to accumulate patterns and create a
+ /// [`Searcher`](struct.Searcher.html)
+ /// from them.
+ pub fn builder(&self) -> Builder {
+ Builder::from_config(self.clone())
+ }
+
+ /// Set the match semantics for this configuration.
+ pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config {
+ self.kind = kind;
+ self
+ }
+
+ /// An undocumented method for forcing the use of the Teddy algorithm.
+ ///
+ /// This is only exposed for more precise testing and benchmarks. Callers
+ /// should not use it as it is not part of the API stability guarantees of
+ /// this crate.
+ #[doc(hidden)]
+ pub fn force_teddy(&mut self, yes: bool) -> &mut Config {
+ if yes {
+ self.force = Some(ForceAlgorithm::Teddy);
+ } else {
+ self.force = None;
+ }
+ self
+ }
+
+ /// An undocumented method for forcing the use of the Fat Teddy algorithm.
+ ///
+ /// This is only exposed for more precise testing and benchmarks. Callers
+ /// should not use it as it is not part of the API stability guarantees of
+ /// this crate.
+ #[doc(hidden)]
+ pub fn force_teddy_fat(&mut self, yes: Option<bool>) -> &mut Config {
+ self.force_teddy_fat = yes;
+ self
+ }
+
+ /// An undocumented method for forcing the use of SSE (`Some(false)`) or
+ /// AVX (`Some(true)`) algorithms.
+ ///
+ /// This is only exposed for more precise testing and benchmarks. Callers
+ /// should not use it as it is not part of the API stability guarantees of
+ /// this crate.
+ #[doc(hidden)]
+ pub fn force_avx(&mut self, yes: Option<bool>) -> &mut Config {
+ self.force_avx = yes;
+ self
+ }
+
+ /// An undocumented method for forcing the use of the Rabin-Karp algorithm.
+ ///
+ /// This is only exposed for more precise testing and benchmarks. Callers
+ /// should not use it as it is not part of the API stability guarantees of
+ /// this crate.
+ #[doc(hidden)]
+ pub fn force_rabin_karp(&mut self, yes: bool) -> &mut Config {
+ if yes {
+ self.force = Some(ForceAlgorithm::RabinKarp);
+ } else {
+ self.force = None;
+ }
+ self
+ }
+}
+
+/// A builder for constructing a packed searcher from a collection of patterns.
+///
+/// # Example
+///
+/// This example shows how to use a builder to construct a searcher. By
+/// default, leftmost-first match semantics are used.
+///
+/// ```
+/// use aho_corasick::packed::{Builder, MatchKind};
+///
+/// # fn example() -> Option<()> {
+/// let searcher = Builder::new()
+/// .add("foobar")
+/// .add("foo")
+/// .build()?;
+/// let matches: Vec<usize> = searcher
+/// .find_iter("foobar")
+/// .map(|mat| mat.pattern())
+/// .collect();
+/// assert_eq!(vec![0], matches);
+/// # Some(()) }
+/// # if cfg!(target_arch = "x86_64") {
+/// # example().unwrap()
+/// # } else {
+/// # assert!(example().is_none());
+/// # }
+/// ```
+#[derive(Clone, Debug)]
+pub struct Builder {
+ /// The configuration of this builder and subsequent matcher.
+ config: Config,
+ /// Set to true if the builder detects that a matcher cannot be built.
+ inert: bool,
+ /// The patterns provided by the caller.
+ patterns: Patterns,
+}
+
+impl Builder {
+ /// Create a new builder for constructing a multi-pattern searcher. This
+ /// constructor uses the default configuration.
+ pub fn new() -> Builder {
+ Builder::from_config(Config::new())
+ }
+
+ fn from_config(config: Config) -> Builder {
+ Builder { config, inert: false, patterns: Patterns::new() }
+ }
+
+ /// Build a searcher from the patterns added to this builder so far.
+ pub fn build(&self) -> Option<Searcher> {
+ if self.inert || self.patterns.is_empty() {
+ return None;
+ }
+ let mut patterns = self.patterns.clone();
+ patterns.set_match_kind(self.config.kind);
+ let rabinkarp = RabinKarp::new(&patterns);
+ // Effectively, we only want to return a searcher if we can use Teddy,
+ // since Teddy is our only fast packed searcher at the moment.
+ // Rabin-Karp is only used when searching haystacks smaller than what
+ // Teddy can support. Thus, the only way to get a Rabin-Karp searcher
+ // is to force it using undocumented APIs (for tests/benchmarks).
+ let (search_kind, minimum_len) = match self.config.force {
+ None | Some(ForceAlgorithm::Teddy) => {
+ let teddy = match self.build_teddy(&patterns) {
+ None => return None,
+ Some(teddy) => teddy,
+ };
+ let minimum_len = teddy.minimum_len();
+ (SearchKind::Teddy(teddy), minimum_len)
+ }
+ Some(ForceAlgorithm::RabinKarp) => (SearchKind::RabinKarp, 0),
+ };
+ Some(Searcher {
+ config: self.config.clone(),
+ patterns,
+ rabinkarp,
+ search_kind,
+ minimum_len,
+ })
+ }
+
+ fn build_teddy(&self, patterns: &Patterns) -> Option<Teddy> {
+ teddy::Builder::new()
+ .avx(self.config.force_avx)
+ .fat(self.config.force_teddy_fat)
+ .build(&patterns)
+ }
+
+ /// Add the given pattern to this set to match.
+ ///
+ /// The order in which patterns are added is significant. Namely, when
+ /// using leftmost-first match semantics, then when multiple patterns can
+ /// match at a particular location, the pattern that was added first is
+ /// used as the match.
+ ///
+ /// If the number of patterns added exceeds the amount supported by packed
+ /// searchers, then the builder will stop accumulating patterns and render
+ /// itself inert. At this point, constructing a searcher will always return
+ /// `None`.
+ pub fn add<P: AsRef<[u8]>>(&mut self, pattern: P) -> &mut Builder {
+ if self.inert {
+ return self;
+ } else if self.patterns.len() >= PATTERN_LIMIT {
+ self.inert = true;
+ self.patterns.reset();
+ return self;
+ }
+ // Just in case PATTERN_LIMIT increases beyond u16::MAX.
+ assert!(self.patterns.len() <= u16::MAX as usize);
+
+ let pattern = pattern.as_ref();
+ if pattern.is_empty() {
+ self.inert = true;
+ self.patterns.reset();
+ return self;
+ }
+ self.patterns.add(pattern);
+ self
+ }
+
+ /// Add the given iterator of patterns to this set to match.
+ ///
+ /// The iterator must yield elements that can be converted into a `&[u8]`.
+ ///
+ /// The order in which patterns are added is significant. Namely, when
+ /// using leftmost-first match semantics, then when multiple patterns can
+ /// match at a particular location, the pattern that was added first is
+ /// used as the match.
+ ///
+ /// If the number of patterns added exceeds the amount supported by packed
+ /// searchers, then the builder will stop accumulating patterns and render
+ /// itself inert. At this point, constructing a searcher will always return
+ /// `None`.
+ pub fn extend<I, P>(&mut self, patterns: I) -> &mut Builder
+ where
+ I: IntoIterator<Item = P>,
+ P: AsRef<[u8]>,
+ {
+ for p in patterns {
+ self.add(p);
+ }
+ self
+ }
+}
+
+impl Default for Builder {
+ fn default() -> Builder {
+ Builder::new()
+ }
+}
+
+/// A packed searcher for quickly finding occurrences of multiple patterns.
+///
+/// If callers need more flexible construction, or if one wants to change the
+/// match semantics (either leftmost-first or leftmost-longest), then one can
+/// use the [`Config`](struct.Config.html) and/or
+/// [`Builder`](struct.Builder.html) types for more fine grained control.
+///
+/// # Example
+///
+/// This example shows how to create a searcher from an iterator of patterns.
+/// By default, leftmost-first match semantics are used.
+///
+/// ```
+/// use aho_corasick::packed::{MatchKind, Searcher};
+///
+/// # fn example() -> Option<()> {
+/// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
+/// let matches: Vec<usize> = searcher
+/// .find_iter("foobar")
+/// .map(|mat| mat.pattern())
+/// .collect();
+/// assert_eq!(vec![0], matches);
+/// # Some(()) }
+/// # if cfg!(target_arch = "x86_64") {
+/// # example().unwrap()
+/// # } else {
+/// # assert!(example().is_none());
+/// # }
+/// ```
+#[derive(Clone, Debug)]
+pub struct Searcher {
+ config: Config,
+ patterns: Patterns,
+ rabinkarp: RabinKarp,
+ search_kind: SearchKind,
+ minimum_len: usize,
+}
+
+#[derive(Clone, Debug)]
+enum SearchKind {
+ Teddy(Teddy),
+ RabinKarp,
+}
+
+impl Searcher {
+ /// A convenience function for constructing a searcher from an iterator
+ /// of things that can be converted to a `&[u8]`.
+ ///
+ /// If a searcher could not be constructed (either because of an
+ /// unsupported CPU or because there are too many patterns), then `None`
+ /// is returned.
+ ///
+ /// # Example
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::packed::{MatchKind, Searcher};
+ ///
+ /// # fn example() -> Option<()> {
+ /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
+ /// let matches: Vec<usize> = searcher
+ /// .find_iter("foobar")
+ /// .map(|mat| mat.pattern())
+ /// .collect();
+ /// assert_eq!(vec![0], matches);
+ /// # Some(()) }
+ /// # if cfg!(target_arch = "x86_64") {
+ /// # example().unwrap()
+ /// # } else {
+ /// # assert!(example().is_none());
+ /// # }
+ /// ```
+ pub fn new<I, P>(patterns: I) -> Option<Searcher>
+ where
+ I: IntoIterator<Item = P>,
+ P: AsRef<[u8]>,
+ {
+ Builder::new().extend(patterns).build()
+ }
+
+ /// Return the first occurrence of any of the patterns in this searcher,
+ /// according to its match semantics, in the given haystack. The `Match`
+ /// returned will include the identifier of the pattern that matched, which
+ /// corresponds to the index of the pattern (starting from `0`) in which it
+ /// was added.
+ ///
+ /// # Example
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::packed::{MatchKind, Searcher};
+ ///
+ /// # fn example() -> Option<()> {
+ /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
+ /// let mat = searcher.find("foobar")?;
+ /// assert_eq!(0, mat.pattern());
+ /// assert_eq!(0, mat.start());
+ /// assert_eq!(6, mat.end());
+ /// # Some(()) }
+ /// # if cfg!(target_arch = "x86_64") {
+ /// # example().unwrap()
+ /// # } else {
+ /// # assert!(example().is_none());
+ /// # }
+ /// ```
+ pub fn find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match> {
+ self.find_at(haystack, 0)
+ }
+
+ /// Return the first occurrence of any of the patterns in this searcher,
+ /// according to its match semantics, in the given haystack starting from
+ /// the given position.
+ ///
+ /// The `Match` returned will include the identifier of the pattern that
+ /// matched, which corresponds to the index of the pattern (starting from
+ /// `0`) in which it was added. The offsets in the `Match` will be relative
+ /// to the start of `haystack` (and not `at`).
+ ///
+ /// # Example
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::packed::{MatchKind, Searcher};
+ ///
+ /// # fn example() -> Option<()> {
+ /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
+ /// let mat = searcher.find_at("foofoobar", 3)?;
+ /// assert_eq!(0, mat.pattern());
+ /// assert_eq!(3, mat.start());
+ /// assert_eq!(9, mat.end());
+ /// # Some(()) }
+ /// # if cfg!(target_arch = "x86_64") {
+ /// # example().unwrap()
+ /// # } else {
+ /// # assert!(example().is_none());
+ /// # }
+ /// ```
+ pub fn find_at<B: AsRef<[u8]>>(
+ &self,
+ haystack: B,
+ at: usize,
+ ) -> Option<Match> {
+ let haystack = haystack.as_ref();
+ match self.search_kind {
+ SearchKind::Teddy(ref teddy) => {
+ if haystack[at..].len() < teddy.minimum_len() {
+ return self.slow_at(haystack, at);
+ }
+ teddy.find_at(&self.patterns, haystack, at)
+ }
+ SearchKind::RabinKarp => {
+ self.rabinkarp.find_at(&self.patterns, haystack, at)
+ }
+ }
+ }
+
+ /// Return an iterator of non-overlapping occurrences of the patterns in
+ /// this searcher, according to its match semantics, in the given haystack.
+ ///
+ /// # Example
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::packed::{MatchKind, Searcher};
+ ///
+ /// # fn example() -> Option<()> {
+ /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
+ /// let matches: Vec<usize> = searcher
+ /// .find_iter("foobar fooba foofoo")
+ /// .map(|mat| mat.pattern())
+ /// .collect();
+ /// assert_eq!(vec![0, 1, 1, 1], matches);
+ /// # Some(()) }
+ /// # if cfg!(target_arch = "x86_64") {
+ /// # example().unwrap()
+ /// # } else {
+ /// # assert!(example().is_none());
+ /// # }
+ /// ```
+ pub fn find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>(
+ &'a self,
+ haystack: &'b B,
+ ) -> FindIter<'a, 'b> {
+ FindIter { searcher: self, haystack: haystack.as_ref(), at: 0 }
+ }
+
+ /// Returns the match kind used by this packed searcher.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::packed::{MatchKind, Searcher};
+ ///
+ /// # fn example() -> Option<()> {
+ /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
+ /// // leftmost-first is the default.
+ /// assert_eq!(&MatchKind::LeftmostFirst, searcher.match_kind());
+ /// # Some(()) }
+ /// # if cfg!(target_arch = "x86_64") {
+ /// # example().unwrap()
+ /// # } else {
+ /// # assert!(example().is_none());
+ /// # }
+ /// ```
+ pub fn match_kind(&self) -> &MatchKind {
+ self.patterns.match_kind()
+ }
+
+ /// Returns the minimum length of a haystack that is required in order for
+ /// packed searching to be effective.
+ ///
+ /// In some cases, the underlying packed searcher may not be able to search
+ /// very short haystacks. When that occurs, the implementation will defer
+ /// to a slower non-packed searcher (which is still generally faster than
+ /// Aho-Corasick for a small number of patterns). However, callers may
+ /// want to avoid ever using the slower variant, which one can do by
+ /// never passing a haystack shorter than the minimum length returned by
+ /// this method.
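+    ///
+    /// # Example
+    ///
+    /// A rough sketch of how a caller might ensure the packed searcher is
+    /// always used (the haystack here is just an illustrative stand-in):
+    ///
+    /// ```
+    /// use aho_corasick::packed::Searcher;
+    ///
+    /// # fn example() -> Option<()> {
+    /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
+    /// let haystack = "a haystack that is comfortably longer than the minimum";
+    /// if haystack.len() >= searcher.minimum_len() {
+    ///     // Long enough, so the packed (vectorized) searcher is used here.
+    ///     assert!(searcher.find(haystack).is_none());
+    /// }
+    /// # Some(()) }
+    /// # if cfg!(target_arch = "x86_64") {
+    /// # example().unwrap()
+    /// # } else {
+    /// # assert!(example().is_none());
+    /// # }
+    /// ```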
+ pub fn minimum_len(&self) -> usize {
+ self.minimum_len
+ }
+
+ /// Returns the approximate total amount of heap used by this searcher, in
+ /// units of bytes.
+ pub fn heap_bytes(&self) -> usize {
+ self.patterns.heap_bytes()
+ + self.rabinkarp.heap_bytes()
+ + self.search_kind.heap_bytes()
+ }
+
+ /// Use a slow (non-packed) searcher.
+ ///
+ /// This is useful when a packed searcher could be constructed, but could
+ /// not be used to search a specific haystack. For example, if Teddy was
+ /// built but the haystack is smaller than ~34 bytes, then Teddy might not
+ /// be able to run.
+ fn slow_at(&self, haystack: &[u8], at: usize) -> Option<Match> {
+ self.rabinkarp.find_at(&self.patterns, haystack, at)
+ }
+}
+
+impl SearchKind {
+ fn heap_bytes(&self) -> usize {
+ match *self {
+ SearchKind::Teddy(ref ted) => ted.heap_bytes(),
+ SearchKind::RabinKarp => 0,
+ }
+ }
+}
+
+/// An iterator over non-overlapping matches from a packed searcher.
+///
+/// The lifetime `'s` refers to the lifetime of the underlying
+/// [`Searcher`](struct.Searcher.html), while the lifetime `'h` refers to the
+/// lifetime of the haystack being searched.
+#[derive(Debug)]
+pub struct FindIter<'s, 'h> {
+ searcher: &'s Searcher,
+ haystack: &'h [u8],
+ at: usize,
+}
+
+impl<'s, 'h> Iterator for FindIter<'s, 'h> {
+ type Item = Match;
+
+ fn next(&mut self) -> Option<Match> {
+ if self.at > self.haystack.len() {
+ return None;
+ }
+ match self.searcher.find_at(&self.haystack, self.at) {
+ None => None,
+ Some(c) => {
+ self.at = c.end;
+ Some(c)
+ }
+ }
+ }
+}
diff --git a/src/packed/mod.rs b/src/packed/mod.rs
new file mode 100644
index 0000000..5a3aa2e
--- /dev/null
+++ b/src/packed/mod.rs
@@ -0,0 +1,117 @@
+/*!
+A lower level API for packed multiple substring search, principally for a small
+number of patterns.
+
+This sub-module provides vectorized routines for quickly finding matches of a
+small number of patterns. In general, users of this crate shouldn't need to
+interface with this module directly, as the primary
+[`AhoCorasick`](../struct.AhoCorasick.html)
+searcher will use these routines automatically as a prefilter when applicable.
+However, in some cases, callers may want to bypass the Aho-Corasick machinery
+entirely and use this vectorized searcher directly.
+
+# Overview
+
+The primary types in this sub-module are:
+
+* [`Searcher`](struct.Searcher.html) executes the actual search algorithm to
+ report matches in a haystack.
+* [`Builder`](struct.Builder.html) accumulates patterns incrementally and can
+ construct a `Searcher`.
+* [`Config`](struct.Config.html) permits tuning the searcher, and itself will
+ produce a `Builder` (which can then be used to build a `Searcher`).
+  Currently, the only tuneable knob is the match semantics, but this may be
+ expanded in the future.
+
+# Examples
+
+This example shows how to create a searcher from an iterator of patterns.
+By default, leftmost-first match semantics are used. (See the top-level
+[`MatchKind`](../enum.MatchKind.html) type for more details about match
+semantics, which apply similarly to packed substring search.)
+
+```
+use aho_corasick::packed::{MatchKind, Searcher};
+
+# fn example() -> Option<()> {
+let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
+let matches: Vec<usize> = searcher
+ .find_iter("foobar")
+ .map(|mat| mat.pattern())
+ .collect();
+assert_eq!(vec![0], matches);
+# Some(()) }
+# if cfg!(target_arch = "x86_64") {
+# example().unwrap()
+# } else {
+# assert!(example().is_none());
+# }
+```
+
+This example shows how to use [`Config`](struct.Config.html) to change the
+match semantics to leftmost-longest:
+
+```
+use aho_corasick::packed::{Config, MatchKind};
+
+# fn example() -> Option<()> {
+let searcher = Config::new()
+ .match_kind(MatchKind::LeftmostLongest)
+ .builder()
+ .add("foo")
+ .add("foobar")
+ .build()?;
+let matches: Vec<usize> = searcher
+ .find_iter("foobar")
+ .map(|mat| mat.pattern())
+ .collect();
+assert_eq!(vec![1], matches);
+# Some(()) }
+# if cfg!(target_arch = "x86_64") {
+# example().unwrap()
+# } else {
+# assert!(example().is_none());
+# }
+```
+
+# Packed substring searching
+
+Packed substring searching refers to the use of SIMD (Single Instruction,
+Multiple Data) to accelerate the detection of matches in a haystack. Unlike
+conventional algorithms, such as Aho-Corasick, SIMD algorithms for substring
+search tend to do better with a small number of patterns, whereas Aho-Corasick
+generally maintains reasonably consistent performance regardless of the number
+of patterns you give it. Because of this, the vectorized searcher in this
+sub-module cannot be used as a general purpose searcher, since building the
+searcher may fail. However, in exchange, when searching for a small number of
+patterns, searching can be quite a bit faster than Aho-Corasick (sometimes by
+an order of magnitude).
+
+The key takeaway here is that constructing a searcher from a list of patterns
+is a fallible operation. While the precise conditions under which building a
+searcher can fail are an implementation detail, here are some common reasons
+(a short illustration follows the list):
+
+* Too many patterns were given. Typically, the limit is on the order of 100 or
+ so, but this limit may fluctuate based on available CPU features.
+* The available packed algorithms require CPU features that aren't available.
+ For example, currently, this crate only provides packed algorithms for
+ `x86_64`. Therefore, constructing a packed searcher on any other target
+ (e.g., ARM) will always fail.
+* Zero patterns were given, or one of the patterns given was empty. Packed
+ searchers require at least one pattern and that all patterns are non-empty.
+* Something else about the nature of the patterns (typically based on
+ heuristics) suggests that a packed searcher would perform very poorly, so
+ no searcher is built.
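+
+As a short illustration of this fallibility, the two conditions above that
+don't depend on CPU support (zero patterns, or an empty pattern) can be
+checked directly:
+
+```
+use aho_corasick::packed::Builder;
+
+// No patterns at all: no searcher.
+assert!(Builder::new().build().is_none());
+// An empty pattern: no searcher.
+assert!(Builder::new().add("").build().is_none());
+```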
+*/
+
+pub use packed::api::{Builder, Config, FindIter, MatchKind, Searcher};
+
+mod api;
+mod pattern;
+mod rabinkarp;
+mod teddy;
+#[cfg(test)]
+mod tests;
+#[cfg(target_arch = "x86_64")]
+mod vector;
diff --git a/src/packed/pattern.rs b/src/packed/pattern.rs
new file mode 100644
index 0000000..dfb07e9
--- /dev/null
+++ b/src/packed/pattern.rs
@@ -0,0 +1,318 @@
+use std::cmp;
+use std::fmt;
+use std::mem;
+use std::u16;
+use std::usize;
+
+use packed::api::MatchKind;
+
+/// The type used for representing a pattern identifier.
+///
+/// We don't use `usize` here because our packed searchers don't scale to
+/// huge numbers of patterns, so we keep things a bit smaller.
+pub type PatternID = u16;
+
+/// A non-empty collection of non-empty patterns to search for.
+///
+/// This collection of patterns is what is passed around to both execute
+/// searches and to construct the searchers themselves. Namely, this permits
+/// searches to avoid copying all of the patterns, and allows us to keep only
+/// one copy throughout all packed searchers.
+///
+/// Note that this collection is not a set. The same pattern can appear more
+/// than once.
+#[derive(Clone, Debug)]
+pub struct Patterns {
+ /// The match semantics supported by this collection of patterns.
+ ///
+ /// The match semantics determines the order of the iterator over patterns.
+ /// For leftmost-first, patterns are provided in the same order as were
+ /// provided by the caller. For leftmost-longest, patterns are provided in
+ /// descending order of length, with ties broken by the order in which they
+ /// were provided by the caller.
+ kind: MatchKind,
+ /// The collection of patterns, indexed by their identifier.
+ by_id: Vec<Vec<u8>>,
+ /// The order of patterns defined for iteration, given by pattern
+ /// identifiers. The order of `by_id` and `order` is always the same for
+ /// leftmost-first semantics, but may be different for leftmost-longest
+ /// semantics.
+ order: Vec<PatternID>,
+ /// The length of the smallest pattern, in bytes.
+ minimum_len: usize,
+ /// The largest pattern identifier. This should always be equivalent to
+ /// the number of patterns minus one in this collection.
+ max_pattern_id: PatternID,
+ /// The total number of pattern bytes across the entire collection. This
+ /// is used for reporting total heap usage in constant time.
+ total_pattern_bytes: usize,
+}
+
+impl Patterns {
+    /// Create a new collection of patterns, to be filled via `add`. The ID
+    /// of each pattern is the index in the `by_id` slice at which it is
+    /// added.
+    ///
+    /// Note that `add` panics if given an empty pattern, so every pattern in
+    /// this collection is non-empty.
+ pub fn new() -> Patterns {
+ Patterns {
+ kind: MatchKind::default(),
+ by_id: vec![],
+ order: vec![],
+ minimum_len: usize::MAX,
+ max_pattern_id: 0,
+ total_pattern_bytes: 0,
+ }
+ }
+
+ /// Add a pattern to this collection.
+ ///
+ /// This panics if the pattern given is empty.
+ pub fn add(&mut self, bytes: &[u8]) {
+ assert!(!bytes.is_empty());
+ assert!(self.by_id.len() <= u16::MAX as usize);
+
+ let id = self.by_id.len() as u16;
+ self.max_pattern_id = id;
+ self.order.push(id);
+ self.by_id.push(bytes.to_vec());
+ self.minimum_len = cmp::min(self.minimum_len, bytes.len());
+ self.total_pattern_bytes += bytes.len();
+ }
+
+ /// Set the match kind semantics for this collection of patterns.
+ ///
+ /// If the kind is not set, then the default is leftmost-first.
+ pub fn set_match_kind(&mut self, kind: MatchKind) {
+ match kind {
+ MatchKind::LeftmostFirst => {
+ self.order.sort();
+ }
+ MatchKind::LeftmostLongest => {
+ let (order, by_id) = (&mut self.order, &mut self.by_id);
+ order.sort_by(|&id1, &id2| {
+ by_id[id1 as usize]
+ .len()
+ .cmp(&by_id[id2 as usize].len())
+ .reverse()
+ });
+ }
+ MatchKind::__Nonexhaustive => unreachable!(),
+ }
+ }
+
+ /// Return the number of patterns in this collection.
+ ///
+ /// This is guaranteed to be greater than zero.
+ pub fn len(&self) -> usize {
+ self.by_id.len()
+ }
+
+ /// Returns true if and only if this collection of patterns is empty.
+ pub fn is_empty(&self) -> bool {
+ self.len() == 0
+ }
+
+ /// Returns the approximate total amount of heap used by these patterns, in
+ /// units of bytes.
+ pub fn heap_bytes(&self) -> usize {
+ self.order.len() * mem::size_of::<PatternID>()
+ + self.by_id.len() * mem::size_of::<Vec<u8>>()
+ + self.total_pattern_bytes
+ }
+
+ /// Clears all heap memory associated with this collection of patterns and
+ /// resets all state such that it is a valid empty collection.
+ pub fn reset(&mut self) {
+ self.kind = MatchKind::default();
+ self.by_id.clear();
+ self.order.clear();
+ self.minimum_len = usize::MAX;
+ self.max_pattern_id = 0;
+ }
+
+ /// Return the maximum pattern identifier in this collection. This can be
+    /// useful in searchers for ensuring that the collections of patterns
+    /// provided at search time and at build time have the same size.
+ pub fn max_pattern_id(&self) -> PatternID {
+ assert_eq!((self.max_pattern_id + 1) as usize, self.len());
+ self.max_pattern_id
+ }
+
+ /// Returns the length, in bytes, of the smallest pattern.
+ ///
+ /// This is guaranteed to be at least one.
+ pub fn minimum_len(&self) -> usize {
+ self.minimum_len
+ }
+
+ /// Returns the match semantics used by these patterns.
+ pub fn match_kind(&self) -> &MatchKind {
+ &self.kind
+ }
+
+ /// Return the pattern with the given identifier. If such a pattern does
+ /// not exist, then this panics.
+ pub fn get(&self, id: PatternID) -> Pattern {
+ Pattern(&self.by_id[id as usize])
+ }
+
+ /// Return the pattern with the given identifier without performing bounds
+ /// checks.
+ ///
+ /// # Safety
+ ///
+ /// Callers must ensure that a pattern with the given identifier exists
+ /// before using this method.
+ #[cfg(target_arch = "x86_64")]
+ pub unsafe fn get_unchecked(&self, id: PatternID) -> Pattern {
+ Pattern(self.by_id.get_unchecked(id as usize))
+ }
+
+ /// Return an iterator over all the patterns in this collection, in the
+ /// order in which they should be matched.
+ ///
+ /// Specifically, in a naive multi-pattern matcher, the following is
+ /// guaranteed to satisfy the match semantics of this collection of
+ /// patterns:
+ ///
+ /// ```ignore
+ /// for i in 0..haystack.len():
+ /// for p in patterns.iter():
+ /// if haystack[i..].starts_with(p.bytes()):
+ /// return Match(p.id(), i, i + p.bytes().len())
+ /// ```
+ ///
+ /// Namely, among the patterns in a collection, if they are matched in
+ /// the order provided by this iterator, then the result is guaranteed
+ /// to satisfy the correct match semantics. (Either leftmost-first or
+ /// leftmost-longest.)
+ pub fn iter(&self) -> PatternIter {
+ PatternIter { patterns: self, i: 0 }
+ }
+}
+
+/// An iterator over the patterns in the `Patterns` collection.
+///
+/// The order of the patterns provided by this iterator is consistent with the
+/// match semantics of the originating collection of patterns.
+///
+/// The lifetime `'p` corresponds to the lifetime of the collection of patterns
+/// this is iterating over.
+#[derive(Debug)]
+pub struct PatternIter<'p> {
+ patterns: &'p Patterns,
+ i: usize,
+}
+
+impl<'p> Iterator for PatternIter<'p> {
+ type Item = (PatternID, Pattern<'p>);
+
+ fn next(&mut self) -> Option<(PatternID, Pattern<'p>)> {
+ if self.i >= self.patterns.len() {
+ return None;
+ }
+ let id = self.patterns.order[self.i];
+ let p = self.patterns.get(id);
+ self.i += 1;
+ Some((id, p))
+ }
+}
+
+/// A pattern that is used in packed searching.
+#[derive(Clone)]
+pub struct Pattern<'a>(&'a [u8]);
+
+impl<'a> fmt::Debug for Pattern<'a> {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ f.debug_struct("Pattern")
+ .field("lit", &String::from_utf8_lossy(&self.0))
+ .finish()
+ }
+}
+
+impl<'p> Pattern<'p> {
+ /// Returns the length of this pattern, in bytes.
+ pub fn len(&self) -> usize {
+ self.0.len()
+ }
+
+ /// Returns the bytes of this pattern.
+ pub fn bytes(&self) -> &[u8] {
+ &self.0
+ }
+
+ /// Returns the first `len` low nybbles from this pattern. If this pattern
+ /// is shorter than `len`, then this panics.
+ #[cfg(target_arch = "x86_64")]
+ pub fn low_nybbles(&self, len: usize) -> Vec<u8> {
+ let mut nybs = vec![];
+ for &b in self.bytes().iter().take(len) {
+ nybs.push(b & 0xF);
+ }
+ nybs
+ }
+
+ /// Returns true if this pattern is a prefix of the given bytes.
+ #[inline(always)]
+ pub fn is_prefix(&self, bytes: &[u8]) -> bool {
+ self.len() <= bytes.len() && self.equals(&bytes[..self.len()])
+ }
+
+ /// Returns true if and only if this pattern equals the given bytes.
+ #[inline(always)]
+ pub fn equals(&self, bytes: &[u8]) -> bool {
+ // Why not just use memcmp for this? Well, memcmp requires calling out
+ // to libc, and this routine is called in fairly hot code paths. Other
+ // than just calling out to libc, it also seems to result in worse
+        // codegen. By rolling our own memcmp in pure Rust, it seems to be
+        // more friendly to the optimizer.
+ //
+ // This results in an improvement in just about every benchmark. Some
+ // smaller than others, but in some cases, up to 30% faster.
+
+ if self.len() != bytes.len() {
+ return false;
+ }
+ if self.len() < 8 {
+ for (&b1, &b2) in self.bytes().iter().zip(bytes) {
+ if b1 != b2 {
+ return false;
+ }
+ }
+ return true;
+ }
+ // When we have 8 or more bytes to compare, then proceed in chunks of
+ // 8 at a time using unaligned loads.
+ let mut p1 = self.bytes().as_ptr();
+ let mut p2 = bytes.as_ptr();
+ let p1end = self.bytes()[self.len() - 8..].as_ptr();
+ let p2end = bytes[bytes.len() - 8..].as_ptr();
+ // SAFETY: Via the conditional above, we know that both `p1` and `p2`
+ // have the same length, so `p1 < p1end` implies that `p2 < p2end`.
+        // Thus, dereferencing both `p1` and `p2` in the loop below is safe.
+ //
+ // Moreover, we set `p1end` and `p2end` to be 8 bytes before the actual
+        // end of `p1` and `p2`. Thus, the final dereference outside of the
+ // loop is guaranteed to be valid.
+ //
+ // Finally, we needn't worry about 64-bit alignment here, since we
+ // do unaligned loads.
+ unsafe {
+ while p1 < p1end {
+ let v1 = (p1 as *const u64).read_unaligned();
+ let v2 = (p2 as *const u64).read_unaligned();
+ if v1 != v2 {
+ return false;
+ }
+ p1 = p1.add(8);
+ p2 = p2.add(8);
+ }
+ let v1 = (p1end as *const u64).read_unaligned();
+ let v2 = (p2end as *const u64).read_unaligned();
+ v1 == v2
+ }
+ }
+}
diff --git a/src/packed/rabinkarp.rs b/src/packed/rabinkarp.rs
new file mode 100644
index 0000000..3992296
--- /dev/null
+++ b/src/packed/rabinkarp.rs
@@ -0,0 +1,185 @@
+use std::mem;
+
+use packed::pattern::{PatternID, Patterns};
+use Match;
+
+/// The type of the rolling hash used in the Rabin-Karp algorithm.
+type Hash = usize;
+
+/// The number of buckets to store our patterns in. We don't want this to be
+/// too big in order to avoid wasting memory, but we don't want it to be too
+/// small either to avoid spending too much time confirming literals.
+///
+/// The number of buckets MUST be a power of two. Otherwise, determining the
+/// bucket from a hash will slow down the code considerably. Using a power
+/// of two means `hash % NUM_BUCKETS` can compile down to a simple `and`
+/// instruction.
+const NUM_BUCKETS: usize = 64;
+
+/// An implementation of the Rabin-Karp algorithm. The main idea of this
+/// algorithm is to maintain a rolling hash as it moves through the input, and
+/// then check whether that hash corresponds to the same hash for any of the
+/// patterns we're looking for.
+///
+/// A drawback of naively scaling Rabin-Karp to multiple patterns is that
+/// it requires all of the patterns to be the same length, which in turn
+/// corresponds to the number of bytes to hash. We adapt this to work for
+/// multiple patterns of varying size by fixing the number of bytes to hash
+/// to be the length of the smallest pattern. We also split the patterns into
+/// several buckets to hopefully make the confirmation step faster.
+///
+/// Wikipedia has a decent explanation, if a bit heavy on the theory:
+/// https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm
+///
+/// But ESMAJ provides something a bit more concrete:
+/// http://www-igm.univ-mlv.fr/~lecroq/string/node5.html
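+///
+/// As a rough sketch of the hashing scheme used below (with `n` being the
+/// length of the hashing window, i.e., the length of the smallest pattern),
+/// the hash of a window is
+///
+/// ```text
+/// H(s[i..i+n]) = s[i]*2^(n-1) + s[i+1]*2^(n-2) + ... + s[i+n-1]
+/// ```
+///
+/// and sliding the window one byte to the right updates it with
+///
+/// ```text
+/// H(s[i+1..i+n+1]) = (H(s[i..i+n]) - s[i]*2^(n-1)) * 2 + s[i+n]
+/// ```
+///
+/// where all arithmetic is wrapping. This is exactly what `hash` and
+/// `update_hash` compute, with `hash_2pow` caching the `2^(n-1)` factor.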
+#[derive(Clone, Debug)]
+pub struct RabinKarp {
+ /// The order of patterns in each bucket is significant. Namely, they are
+ /// arranged such that the first one to match is the correct match. This
+ /// may not necessarily correspond to the order provided by the caller.
+ /// For example, if leftmost-longest semantics are used, then the patterns
+ /// are sorted by their length in descending order. If leftmost-first
+ /// semantics are used, then the patterns are sorted by their pattern ID
+ /// in ascending order (which corresponds to the caller's order).
+ buckets: Vec<Vec<(Hash, PatternID)>>,
+ /// The length of the hashing window. Generally, this corresponds to the
+ /// length of the smallest pattern.
+ hash_len: usize,
+ /// The factor to subtract out of a hash before updating it with a new
+ /// byte.
+ hash_2pow: usize,
+ /// The maximum identifier of a pattern. This is used as a sanity check
+ /// to ensure that the patterns provided by the caller are the same as
+ /// the patterns that were used to compile the matcher. This sanity check
+ /// possibly permits safely eliminating bounds checks regardless of what
+ /// patterns are provided by the caller.
+ ///
+ /// (Currently, we don't use this to elide bounds checks since it doesn't
+ /// result in a measurable performance improvement, but we do use it for
+ /// better failure modes.)
+ max_pattern_id: PatternID,
+}
+
+impl RabinKarp {
+ /// Compile a new Rabin-Karp matcher from the patterns given.
+ ///
+ /// This panics if any of the patterns in the collection are empty, or if
+ /// the collection is itself empty.
+ pub fn new(patterns: &Patterns) -> RabinKarp {
+ assert!(patterns.len() >= 1);
+ let hash_len = patterns.minimum_len();
+ assert!(hash_len >= 1);
+
+ let mut hash_2pow = 1usize;
+ for _ in 1..hash_len {
+ hash_2pow = hash_2pow.wrapping_shl(1);
+ }
+
+ let mut rk = RabinKarp {
+ buckets: vec![vec![]; NUM_BUCKETS],
+ hash_len,
+ hash_2pow,
+ max_pattern_id: patterns.max_pattern_id(),
+ };
+ for (id, pat) in patterns.iter() {
+ let hash = rk.hash(&pat.bytes()[..rk.hash_len]);
+ let bucket = hash % NUM_BUCKETS;
+ rk.buckets[bucket].push((hash, id));
+ }
+ rk
+ }
+
+    /// Return the first matching pattern in the given haystack, beginning the
+ /// search at `at`.
+ pub fn find_at(
+ &self,
+ patterns: &Patterns,
+ haystack: &[u8],
+ mut at: usize,
+ ) -> Option<Match> {
+ assert_eq!(NUM_BUCKETS, self.buckets.len());
+ assert_eq!(
+ self.max_pattern_id,
+ patterns.max_pattern_id(),
+ "Rabin-Karp must be called with same patterns it was built with",
+ );
+
+ if at + self.hash_len > haystack.len() {
+ return None;
+ }
+ let mut hash = self.hash(&haystack[at..at + self.hash_len]);
+ loop {
+ let bucket = &self.buckets[hash % NUM_BUCKETS];
+ for &(phash, pid) in bucket {
+ if phash == hash {
+ if let Some(c) = self.verify(patterns, pid, haystack, at) {
+ return Some(c);
+ }
+ }
+ }
+ if at + self.hash_len >= haystack.len() {
+ return None;
+ }
+ hash = self.update_hash(
+ hash,
+ haystack[at],
+ haystack[at + self.hash_len],
+ );
+ at += 1;
+ }
+ }
+
+ /// Returns the approximate total amount of heap used by this searcher, in
+ /// units of bytes.
+ pub fn heap_bytes(&self) -> usize {
+ let num_patterns = self.max_pattern_id as usize + 1;
+ self.buckets.len() * mem::size_of::<Vec<(Hash, PatternID)>>()
+ + num_patterns * mem::size_of::<(Hash, PatternID)>()
+ }
+
+ /// Verify whether the pattern with the given id matches at
+ /// `haystack[at..]`.
+ ///
+ /// We tag this function as `cold` because it helps improve codegen.
+ /// Intuitively, it would seem like inlining it would be better. However,
+    /// the only time this is called and a match is not found is when there
+    /// is a hash collision, or when a prefix of a pattern matches but
+ /// the entire pattern doesn't match. This is hopefully fairly rare, and
+ /// if it does occur a lot, it's going to be slow no matter what we do.
+ #[cold]
+ fn verify(
+ &self,
+ patterns: &Patterns,
+ id: PatternID,
+ haystack: &[u8],
+ at: usize,
+ ) -> Option<Match> {
+ let pat = patterns.get(id);
+ if pat.is_prefix(&haystack[at..]) {
+ Some(Match::from_span(id as usize, at, at + pat.len()))
+ } else {
+ None
+ }
+ }
+
+ /// Hash the given bytes.
+ fn hash(&self, bytes: &[u8]) -> Hash {
+ assert_eq!(self.hash_len, bytes.len());
+
+ let mut hash = 0usize;
+ for &b in bytes {
+ hash = hash.wrapping_shl(1).wrapping_add(b as usize);
+ }
+ hash
+ }
+
+ /// Update the hash given based on removing `old_byte` at the beginning
+ /// of some byte string, and appending `new_byte` to the end of that same
+ /// byte string.
+ fn update_hash(&self, prev: Hash, old_byte: u8, new_byte: u8) -> Hash {
+ prev.wrapping_sub((old_byte as usize).wrapping_mul(self.hash_2pow))
+ .wrapping_shl(1)
+ .wrapping_add(new_byte as usize)
+ }
+}
diff --git a/src/packed/teddy/README.md b/src/packed/teddy/README.md
new file mode 100644
index 0000000..0c42383
--- /dev/null
+++ b/src/packed/teddy/README.md
@@ -0,0 +1,386 @@
+Teddy is a SIMD accelerated multiple substring matching algorithm. The name
+and the core ideas in the algorithm were learned from the [Hyperscan][1_u]
+project. The implementation in this repository was mostly motivated for use in
+accelerating regex searches by searching for small sets of required literals
+extracted from the regex.
+
+
+# Background
+
+The key idea of Teddy is to do *packed* substring matching. In the literature,
+packed substring matching is the idea of examining multiple bytes in a haystack
+at a time to detect matches. Implementations of, for example, memchr (which
+detects matches of a single byte) have been doing this for years. Only
+recently, with the introduction of various SIMD instructions, has this been
+extended to substring matching. The PCMPESTRI instruction (and its relatives),
+for example, implements substring matching in hardware. It is, however, limited
+to substrings of length 16 bytes or fewer, but this restriction is fine in a
+regex engine, since we rarely care about the performance difference between
+searching for a 16 byte literal and a 16 + N literal; 16 is already long
+enough. The key downside of the PCMPESTRI instruction, on current (2016) CPUs
+at least, is its latency and throughput. As a result, it is often faster to
+do substring search with a Boyer-Moore (or Two-Way) variant and a well placed
+memchr to quickly skip through the haystack.
+
+There are fewer results from the literature on packed substring matching,
+and even fewer for packed multiple substring matching. Ben-Kiki et al. [2]
+describes use of PCMPESTRI for substring matching, but is mostly theoretical
+and hand-waves performance. There is other theoretical work done by Bille [3]
+as well.
+
+The rest of the work in the field, as far as I'm aware, is by Faro and Kulekci
+and is generally focused on multiple pattern search. Their first paper [4a]
+introduces the concept of a fingerprint, which is computed for every block of
+N bytes in every pattern. The haystack is then scanned N bytes at a time and
+a fingerprint is computed in the same way it was computed for blocks in the
+patterns. If the fingerprint corresponds to one that was found in a pattern,
+then a verification step follows to confirm that one of the substrings with the
+corresponding fingerprint actually matches at the current location. Various
+implementation tricks are employed to make sure the fingerprint lookup is fast;
+typically by truncating the fingerprint. (This may, of course, provoke more
+steps in the verification process, so a balance must be struck.)
+
+The main downside of [4a] is that the minimum substring length is 32 bytes,
+presumably because of how the algorithm uses certain SIMD instructions. This
+essentially makes it useless for general purpose regex matching, where a small
+number of short patterns is far more likely.
+
+Faro and Kulekci published another paper [4b] that is conceptually very similar
+to [4a]. The key difference is that it uses the CRC32 instruction (introduced
+as part of SSE 4.2) to compute fingerprint values. This also enables the
+algorithm to work effectively on substrings as short as 7 bytes with 4 byte
+windows. 7 bytes is unfortunately still too long. The window could technically
+be shrunk to 2 bytes, thereby reducing the minimum length to 3, but the small
+window size ends up negating most of the performance benefits, and short
+patterns are likely the common case in a general purpose regex engine.
+
+Faro and Kulekci also published [4c] that appears to be intended as a
+replacement to using PCMPESTRI. In particular, it is specifically motivated by
+the high throughput/latency time of PCMPESTRI and therefore chooses other SIMD
+instructions that are faster. While this approach works for short substrings,
+I personally couldn't see a way to generalize it to multiple substring search.
+
+Faro and Kulekci have another paper [4d] that I haven't been able to read
+because it is behind a paywall.
+
+
+# Teddy
+
+Finally, we get to Teddy. If the above literature review is complete, then it
+appears that Teddy is a novel algorithm. More than that, in my experience, it
+completely blows away the competition for short substrings, which is exactly
+what we want in a general purpose regex engine. Again, the algorithm appears
+to be developed by the authors of [Hyperscan][1_u]. Hyperscan was open sourced
+late 2015, and no earlier history could be found. Therefore, tracking the exact
+provenance of the algorithm with respect to the published literature seems
+difficult.
+
+At a high level, Teddy works somewhat similarly to the fingerprint algorithms
+published by Faro and Kulekci, but Teddy does it in a way that scales a bit
+better. Namely:
+
+1. Teddy's core algorithm scans the haystack in 16 (for SSE, or 32 for AVX)
+ byte chunks. 16 (or 32) is significant because it corresponds to the number
+ of bytes in a SIMD vector.
+2. Bitwise operations are performed on each chunk to discover if any region of
+ it matches a set of precomputed fingerprints from the patterns. If there are
+ matches, then a verification step is performed. In this implementation, our
+ verification step is naive. This can be improved upon.
+
+The details to make this work are quite clever. First, we must choose how to
+pick our fingerprints. In Hyperscan's implementation, I *believe* they use the
+last N bytes of each substring, where N must be at least the minimum length of
+any substring in the set being searched. In this implementation, we use the
+first N bytes of each substring. (The tradeoffs between these choices aren't
+yet clear to me.) We then must figure out how to quickly test whether an
+occurrence of any fingerprint from the set of patterns appears in a 16 byte
+block from the haystack. To keep things simple, let's assume N = 1 and examine
+some examples to motivate the approach. Here are our patterns:
+
+```ignore
+foo
+bar
+baz
+```
+
+The corresponding fingerprints, for N = 1, are `f`, `b` and `b`. Now let's set
+our 16 byte block to:
+
+```ignore
+bat cat foo bump
+xxxxxxxxxxxxxxxx
+```
+
+To cut to the chase, Teddy works by using bitsets. In particular, Teddy creates
+a mask that allows us to quickly compute membership of a fingerprint in a 16
+byte block that also tells which pattern the fingerprint corresponds to. In
+this case, our fingerprint is a single byte, so an appropriate abstraction is
+a map from a single byte to a list of patterns that contain that fingerprint:
+
+```ignore
+f |--> foo
+b |--> bar, baz
+```
+
+Now, all we need to do is figure out how to represent this map in vector space
+and use normal SIMD operations to perform a lookup. The first simplification
+we can make is to represent our patterns as bit fields occupying a single
+byte. This is important, because a single SIMD vector can store 16 bytes.
+
+```ignore
+f |--> 00000001
+b |--> 00000010, 00000100
+```
+
+How do we perform lookup though? It turns out that SSSE3 introduced a very cool
+instruction called PSHUFB. The instruction takes two SIMD vectors, `A` and `B`,
+and returns a third vector `C`. All vectors are treated as 16 8-bit integers.
+`C` is formed by `C[i] = A[B[i]]`. (This is a bit of a simplification, but true
+for the purposes of this algorithm. For full details, see [Intel's Intrinsics
+Guide][5_u].) This essentially lets us use the values in `B` to lookup values
+in `A`.
+
+If we could somehow cause `B` to contain our 16 byte block from the haystack,
+and if `A` could contain our bitmasks, then we'd end up with something like
+this for `A`:
+
+```ignore
+ 0x00 0x01 ... 0x62 ... 0x66 ... 0xFF
+A = 0 0 00000110 00000001 0
+```
+
+And if `B` contains our window from our haystack, we could use shuffle to take
+the values from `B` and use them to look up our bitsets in `A`. But of course,
+we can't do this because `A` in the above example contains 256 bytes, which
+is much larger than the size of a SIMD vector.
+
+Nybbles to the rescue! A nybble is 4 bits. Instead of one mask to hold all of
+our bitsets, we can use two masks, where one mask corresponds to the lower four
+bits of our fingerprint and the other mask corresponds to the upper four bits.
+So our map now looks like:
+
+```ignore
+'f' & 0xF = 0x6 |--> 00000001
+'f' >> 4 = 0x6 |--> 00000111
+'b' & 0xF = 0x2 |--> 00000110
+'b' >> 4 = 0x6 |--> 00000111
+```
+
+Notice that the bitsets for each nybble correspond to the union of all
+fingerprints that contain that nybble. For example, both `f` and `b` have the
+same upper 4 bits but differ on the lower 4 bits. Putting this together, we
+have `A0`, `A1` and `B`, where `A0` is our mask for the lower nybble, `A1` is
+our mask for the upper nybble and `B` is our 16 byte block from the haystack:
+
+```ignore
+ 0x00 0x01 0x02 0x03 ... 0x06 ... 0xF
+A0 = 0 0 00000110 0 00000001 0
+A1 = 0 0 0 0 00000111 0
+B = b a t _ t p
+B = 0x62 0x61 0x74 0x20 0x74 0x70
+```
+
+But of course, we can't use `B` with `PSHUFB` yet, since its values are 8 bits,
+and we need indexes that are at most 4 bits (corresponding to one of 16
+values). We can apply the same transformation to split `B` into lower and upper
+nybbles as we did `A`. As before, `B0` corresponds to the lower nybbles and
+`B1` corresponds to the upper nybbles:
+
+```ignore
+ b a t _ c a t _ f o o _ b u m p
+B0 = 0x2 0x1 0x4 0x0 0x3 0x1 0x4 0x0 0x6 0xF 0xF 0x0 0x2 0x5 0xD 0x0
+B1 = 0x6 0x6 0x7 0x2 0x6 0x6 0x7 0x2 0x6 0x6 0x6 0x2 0x6 0x7 0x6 0x7
+```
+
+And now we have a nice correspondence. `B0` can index `A0` and `B1` can index
+`A1`. Here's what we get when we apply `C0 = PSHUFB(A0, B0)`:
+
+```ignore
+ b a ... f o ... p
+ A0[0x2] A0[0x1] A0[0x6] A0[0xF] A0[0x0]
+C0 = 00000110 0 00000001 0 0
+```
+
+And `C1 = PSHUFB(A1, B1)`:
+
+```ignore
+ b a ... f o ... p
+ A1[0x6] A1[0x6] A1[0x6] A1[0x6] A1[0x7]
+C1 = 00000111 00000111 00000111 00000111 0
+```
+
+Notice how neither one of `C0` or `C1` is guaranteed to report fully correct
+results all on its own. For example, `C1` claims that `b` is a fingerprint for
+the pattern `foo` (since `A1[0x6] = 00000111`), and that `o` is a fingerprint
+for all of our patterns. But if we combined `C0` and `C1` with an `AND`
+operation:
+
+```ignore
+ b a ... f o ... p
+C = 00000110 0 00000001 0 0
+```
+
+Then we now have that `C[i]` contains a bitset corresponding to the matching
+fingerprints in a haystack's 16 byte block, where `i` is the `ith` byte in that
+block.
+
+Once we have that, we can look for the position of the least significant bit
+in `C`. (Least significant because we only target `x86_64` here, which is
+always little endian. Thus, the least significant bytes correspond to bytes
+in our haystack at a lower address.) That position, modulo `8`, gives us
+the pattern that the fingerprint matches. That position, integer divided by
+`8`, also gives us the byte offset that the fingerprint occurs in inside the
+16 byte haystack block. Using those two pieces of information, we can run a
+verification procedure that tries to match all substrings containing that
+fingerprint at that position in the haystack.
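+
+To make the data flow above concrete, here is a hedged scalar sketch of the
+N = 1 candidate detection step, using the `foo`/`bar`/`baz` example from
+before. The function names are illustrative only; the real implementation
+performs the same lookups with PSHUFB on 16 (or 32) byte vectors rather than
+with per-byte loops, and groups patterns into buckets rather than giving each
+pattern its own bit.
+
+```rust
+/// Build the low/high nybble masks from the 1 byte fingerprint (here, the
+/// first byte) of each pattern. Bit `i` of an entry means "pattern `i` has a
+/// fingerprint with this nybble". This sketch supports at most 8 patterns.
+fn build_masks(patterns: &[&[u8]]) -> ([u8; 16], [u8; 16]) {
+    let (mut a0, mut a1) = ([0u8; 16], [0u8; 16]);
+    for (i, pat) in patterns.iter().enumerate() {
+        let fingerprint = pat[0];
+        a0[(fingerprint & 0xF) as usize] |= 1 << i;
+        a1[(fingerprint >> 4) as usize] |= 1 << i;
+    }
+    (a0, a1)
+}
+
+/// For each byte in a 16 byte block, compute the bitset of patterns whose
+/// fingerprint could occur at that position. This is the scalar equivalent
+/// of C = PSHUFB(A0, B0) & PSHUFB(A1, B1).
+fn candidates(a0: &[u8; 16], a1: &[u8; 16], block: &[u8; 16]) -> [u8; 16] {
+    let mut c = [0u8; 16];
+    for (i, &b) in block.iter().enumerate() {
+        c[i] = a0[(b & 0xF) as usize] & a1[(b >> 4) as usize];
+    }
+    c
+}
+
+fn main() {
+    let patterns: &[&[u8]] = &[b"foo", b"bar", b"baz"];
+    let (a0, a1) = build_masks(patterns);
+    let block: &[u8; 16] = b"bat cat foo bump";
+    for (offset, &bits) in candidates(&a0, &a1, block).iter().enumerate() {
+        if bits != 0 {
+            // Each set bit is a candidate pattern to verify at `offset`.
+            println!("offset {:2}: candidate bits {:08b}", offset, bits);
+        }
+    }
+}
+```
+
+Running this reports candidates at offsets 0 and 12 (the `b` fingerprint for
+`bar`/`baz`) and at offset 8 (the `f` fingerprint for `foo`); only the
+candidate at offset 8 survives verification.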
+
+
+# Implementation notes
+
+The problem with the algorithm as described above is that it uses a single byte
+for a fingerprint. This will work well if the fingerprints are rare in the
+haystack (e.g., capital letters or special characters in normal English text),
+but if the fingerprints are common, you'll wind up spending too much time in
+the verification step, which effectively negates the performance benefits of
+scanning 16 bytes at a time. Remember, the key to the performance of this
+algorithm is to do as little work as possible per 16 (or 32) bytes.
+
+This algorithm can be extrapolated in a relatively straight-forward way to use
+larger fingerprints. That is, instead of a single byte prefix, we might use a
+two or three byte prefix. The implementation here implements N = {1, 2, 3}
+and always picks the largest N possible. The rationale is that the bigger the
+fingerprint, the fewer verification steps we'll do. Of course, if N is too
+large, then we'll end up doing too much on each step.
+
+The way to extend it is:
+
+1. Add a mask for each byte in the fingerprint. (Remember that each mask is
+ composed of two SIMD vectors.) This results in a value of `C` for each byte
+ in the fingerprint while searching.
+2. When testing each 16 (or 32) byte block, each value of `C` must be shifted
+ so that they are aligned. Once aligned, they should all be `AND`'d together.
+ This will give you only the bitsets corresponding to the full match of the
+ fingerprint. To do this, one needs to save the last byte (for N=2) or last
+ two bytes (for N=3) from the previous iteration, and then line them up with
+ the first one or two bytes of the next iteration.
+
+## Verification
+
+Verification generally follows the procedure outlined above. The tricky parts
+are in the right formulation of operations to get our bits out of our vectors.
+We have a limited set of operations available to us on SIMD vectors as 128-bit
+or 256-bit numbers, so we wind up needing to rip out 2 (or 4) 64-bit integers
+from our vectors, and then run our verification step on each of those. The
+verification step looks at the least significant bit set, and from its
+position, we can derive the byte offset and bucket. (Again, as described
+above.) Once we know the bucket, we do a fairly naive exhaustive search for
+every literal in that bucket. (Hyperscan is a bit smarter here and uses a hash
+table, but I haven't had time to thoroughly explore that. A few initial
+half-hearted attempts resulted in worse performance.)
+
+## AVX
+
+The AVX version of Teddy extrapolates almost perfectly from the SSE version.
+The only hiccup is that PALIGNR is used to align chunks in the 16 byte (SSE)
+version, and there is no equivalent instruction in AVX. AVX does have
+VPALIGNR, but it
+only works within 128-bit lanes. So there's a bit of tomfoolery to get around
+this by shuffling the vectors before calling VPALIGNR.
+
+The only other aspect to AVX is that since our masks are still fundamentally
+16-bytes (0x0-0xF), they are duplicated to 32-bytes, so that they can apply to
+32-byte chunks.
+
+## Fat Teddy
+
+In the version of Teddy described above, 8 buckets are used to group patterns
+that we want to search for. However, when AVX is available, we can extend the
+number of buckets to 16 by permitting each byte in our masks to use 16-bits
+instead of 8-bits to represent the buckets it belongs to. (This variant is also
+in Hyperscan.) However, what we give up is the ability to scan 32 bytes at a
+time, even though we're using AVX. Instead, we have to scan 16 bytes at a time.
+What we gain, though, is (hopefully) less work in our verification routine.
+If patterns are more spread out across more buckets, then there should overall
+be fewer false positives. In general, Fat Teddy permits us to grow our capacity
+a bit and search for more literals before Teddy gets overwhelmed.
+
+The tricky part of Fat Teddy is in how we adjust our masks and our verification
+procedure. For the masks, we simply represent the first 8 buckets in each of
+the low 16 bytes, and then the second 8 buckets in each of the high 16 bytes.
+Then, in the search loop, instead of loading 32 bytes from the haystack, we
+load the same 16 bytes from the haystack into both the low and high 16 byte
+portions of our 256-bit vector. So for example, a mask might look like this:
+
+ bits: 00100001 00000000 ... 11000000 00000000 00000001 ... 00000000
+ byte: 31 30 16 15 14 0
+ offset: 15 14 0 15 14 0
+ buckets: 8-15 8-15 8-15 0-7 0-7 0-7
+
+Where `byte` is the position in the vector (higher numbers corresponding to
+more significant bits), `offset` is the corresponding position in the haystack
+chunk, and `buckets` corresponds to the bucket assignments for that particular
+byte.
+
+In particular, notice that the bucket assignments for offset `0` are spread
+out between bytes `0` and `16`. This works well for the chunk-by-chunk search
+procedure, but verification really wants to process all bucket assignments for
+each offset at once. Otherwise, we might wind up finding a match at offset
+`1` in one of the first 8 buckets, when we really should have reported a match
+at offset `0` in one of the second 8 buckets. (Because we want the leftmost
+match.)
+
+Thus, for verification, we rearrange the above vector such that it is a
+sequence of 16-bit integers, where the least significant 16-bit integer
+corresponds to all of the bucket assignments for offset `0`. So with the
+above vector, the least significant 16-bit integer would be
+
+    11000000 00000000
+
+which was taken from bytes `16` and `0`. Then the verification step pretty much
+runs as described, except with 16 buckets instead of 8.
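+
+A sketch of that rearrangement (it mirrors `Teddy::verify_fat256` in
+`runtime.rs` below, where `cand` is the candidate vector described above):
+
+    // Swap the two 128-bit halves of the candidate.
+    let swap = _mm256_permute4x64_epi64(cand, 0x4E);
+    // Interleave bytes so that, for each offset, the buckets 0-7 bitset and
+    // the buckets 8-15 bitset sit next to each other as one 16-bit bitset.
+    let r1 = _mm256_unpacklo_epi8(cand, swap);
+    let r2 = _mm256_unpackhi_epi8(cand, swap);
+    // The low 64 bits of each 128-bit lane of r1 and r2 now hold all of the
+    // 16-bit bitsets in haystack order; verification then proceeds as usual,
+    // but with 16 buckets per offset instead of 8.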
+
+
+# References
+
+- **[1]** [Hyperscan on GitHub](https://github.com/01org/hyperscan),
+ [webpage](https://01.org/hyperscan)
+- **[2a]** Ben-Kiki, O., Bille, P., Breslauer, D., Gasieniec, L., Grossi, R.,
+ & Weimann, O. (2011).
+ _Optimal packed string matching_.
+ In LIPIcs-Leibniz International Proceedings in Informatics (Vol. 13).
+ Schloss Dagstuhl-Leibniz-Zentrum fuer Informatik.
+ DOI: 10.4230/LIPIcs.FSTTCS.2011.423.
+ [PDF](http://drops.dagstuhl.de/opus/volltexte/2011/3355/pdf/37.pdf).
+- **[2b]** Ben-Kiki, O., Bille, P., Breslauer, D., Ga̧sieniec, L., Grossi, R.,
+ & Weimann, O. (2014).
+ _Towards optimal packed string matching_.
+ Theoretical Computer Science, 525, 111-129.
+ DOI: 10.1016/j.tcs.2013.06.013.
+ [PDF](http://www.cs.haifa.ac.il/~oren/Publications/bpsm.pdf).
+- **[3]** Bille, P. (2011).
+ _Fast searching in packed strings_.
+ Journal of Discrete Algorithms, 9(1), 49-56.
+ DOI: 10.1016/j.jda.2010.09.003.
+ [PDF](http://www.sciencedirect.com/science/article/pii/S1570866710000353).
+- **[4a]** Faro, S., & Külekci, M. O. (2012, October).
+ _Fast multiple string matching using streaming SIMD extensions technology_.
+ In String Processing and Information Retrieval (pp. 217-228).
+ Springer Berlin Heidelberg.
+ DOI: 10.1007/978-3-642-34109-0_23.
+ [PDF](http://www.dmi.unict.it/~faro/papers/conference/faro32.pdf).
+- **[4b]** Faro, S., & Külekci, M. O. (2013, September).
+ _Towards a Very Fast Multiple String Matching Algorithm for Short Patterns_.
+ In Stringology (pp. 78-91).
+ [PDF](http://www.dmi.unict.it/~faro/papers/conference/faro36.pdf).
+- **[4c]** Faro, S., & Külekci, M. O. (2013, January).
+ _Fast packed string matching for short patterns_.
+  In Proceedings of the Meeting on Algorithm Engineering & Experiments
+ (pp. 113-121).
+ Society for Industrial and Applied Mathematics.
+ [PDF](http://arxiv.org/pdf/1209.6449.pdf).
+- **[4d]** Faro, S., & Külekci, M. O. (2014).
+ _Fast and flexible packed string matching_.
+ Journal of Discrete Algorithms, 28, 61-72.
+ DOI: 10.1016/j.jda.2014.07.003.
+
+[1_u]: https://github.com/01org/hyperscan
+[5_u]: https://software.intel.com/sites/landingpage/IntrinsicsGuide
diff --git a/src/packed/teddy/compile.rs b/src/packed/teddy/compile.rs
new file mode 100644
index 0000000..a7a48b7
--- /dev/null
+++ b/src/packed/teddy/compile.rs
@@ -0,0 +1,414 @@
+// See the README in this directory for an explanation of the Teddy algorithm.
+
+use std::cmp;
+use std::collections::BTreeMap;
+use std::fmt;
+
+use packed::pattern::{PatternID, Patterns};
+use packed::teddy::Teddy;
+
+/// A builder for constructing a Teddy matcher.
+///
+/// The builder primarily permits fine grained configuration of the Teddy
+/// matcher. Most options are made only available for testing/benchmarking
+/// purposes. In reality, options are automatically determined by the nature
+/// and number of patterns given to the builder.
+#[derive(Clone, Debug)]
+pub struct Builder {
+ /// When none, this is automatically determined. Otherwise, `false` means
+ /// slim Teddy is used (8 buckets) and `true` means fat Teddy is used
+ /// (16 buckets). Fat Teddy requires AVX2, so if that CPU feature isn't
+ /// available and Fat Teddy was requested, no matcher will be built.
+ fat: Option<bool>,
+ /// When none, this is automatically determined. Otherwise, `false` means
+    /// that 128-bit vectors will be used (up to SSSE3 instructions), whereas
+ /// `true` means that 256-bit vectors will be used. As with `fat`, if
+ /// 256-bit vectors are requested and they aren't available, then a
+ /// searcher will not be built.
+ avx: Option<bool>,
+}
+
+impl Default for Builder {
+ fn default() -> Builder {
+ Builder::new()
+ }
+}
+
+impl Builder {
+ /// Create a new builder for configuring a Teddy matcher.
+ pub fn new() -> Builder {
+ Builder { fat: None, avx: None }
+ }
+
+ /// Build a matcher for the set of patterns given. If a matcher could not
+ /// be built, then `None` is returned.
+ ///
+    /// Generally, a matcher isn't built if the necessary CPU features aren't
+    /// available, the target isn't supported, or the searcher is believed to
+    /// be slower than standard techniques (i.e., if there are too many
+    /// literals).
+ pub fn build(&self, patterns: &Patterns) -> Option<Teddy> {
+ self.build_imp(patterns)
+ }
+
+ /// Require the use of Fat (true) or Slim (false) Teddy. Fat Teddy uses
+    /// 16 buckets, whereas Slim Teddy uses 8 buckets. More buckets are useful
+ /// for a larger set of literals.
+ ///
+ /// `None` is the default, which results in an automatic selection based
+ /// on the number of literals and available CPU features.
+ pub fn fat(&mut self, yes: Option<bool>) -> &mut Builder {
+ self.fat = yes;
+ self
+ }
+
+ /// Request the use of 256-bit vectors (true) or 128-bit vectors (false).
+ /// Generally, a larger vector size is better since it either permits
+ /// matching more patterns or matching more bytes in the haystack at once.
+ ///
+ /// `None` is the default, which results in an automatic selection based on
+ /// the number of literals and available CPU features.
+ pub fn avx(&mut self, yes: Option<bool>) -> &mut Builder {
+ self.avx = yes;
+ self
+ }
+
+ fn build_imp(&self, patterns: &Patterns) -> Option<Teddy> {
+ use packed::teddy::runtime;
+
+ // Most of the logic here is just about selecting the optimal settings,
+ // or perhaps even rejecting construction altogether. The choices
+ // we have are: fat (avx only) or not, ssse3 or avx2, and how many
+ // patterns we allow ourselves to search. Additionally, for testing
+ // and benchmarking, we permit callers to try to "force" a setting,
+ // and if the setting isn't allowed (e.g., forcing AVX when AVX isn't
+ // available), then we bail and return nothing.
+
+ if patterns.len() > 64 {
+ return None;
+ }
+ let has_ssse3 = is_x86_feature_detected!("ssse3");
+ let has_avx = is_x86_feature_detected!("avx2");
+ let avx = if self.avx == Some(true) {
+ if !has_avx {
+ return None;
+ }
+ true
+ } else if self.avx == Some(false) {
+ if !has_ssse3 {
+ return None;
+ }
+ false
+ } else if !has_ssse3 && !has_avx {
+ return None;
+ } else {
+ has_avx
+ };
+ let fat = match self.fat {
+ None => avx && patterns.len() > 32,
+ Some(false) => false,
+ Some(true) if !avx => return None,
+ Some(true) => true,
+ };
+
+ let mut compiler = Compiler::new(patterns, fat);
+ compiler.compile();
+ let Compiler { buckets, masks, .. } = compiler;
+ // SAFETY: It is required that the builder only produce Teddy matchers
+ // that are allowed to run on the current CPU, since we later assume
+ // that the presence of (for example) TeddySlim1Mask256 means it is
+ // safe to call functions marked with the `avx2` target feature.
+ match (masks.len(), avx, fat) {
+ (1, false, _) => Some(Teddy {
+ buckets,
+ max_pattern_id: patterns.max_pattern_id(),
+ exec: runtime::Exec::TeddySlim1Mask128(
+ runtime::TeddySlim1Mask128 {
+ mask1: runtime::Mask128::new(masks[0]),
+ },
+ ),
+ }),
+ (1, true, false) => Some(Teddy {
+ buckets,
+ max_pattern_id: patterns.max_pattern_id(),
+ exec: runtime::Exec::TeddySlim1Mask256(
+ runtime::TeddySlim1Mask256 {
+ mask1: runtime::Mask256::new(masks[0]),
+ },
+ ),
+ }),
+ (1, true, true) => Some(Teddy {
+ buckets,
+ max_pattern_id: patterns.max_pattern_id(),
+ exec: runtime::Exec::TeddyFat1Mask256(
+ runtime::TeddyFat1Mask256 {
+ mask1: runtime::Mask256::new(masks[0]),
+ },
+ ),
+ }),
+ (2, false, _) => Some(Teddy {
+ buckets,
+ max_pattern_id: patterns.max_pattern_id(),
+ exec: runtime::Exec::TeddySlim2Mask128(
+ runtime::TeddySlim2Mask128 {
+ mask1: runtime::Mask128::new(masks[0]),
+ mask2: runtime::Mask128::new(masks[1]),
+ },
+ ),
+ }),
+ (2, true, false) => Some(Teddy {
+ buckets,
+ max_pattern_id: patterns.max_pattern_id(),
+ exec: runtime::Exec::TeddySlim2Mask256(
+ runtime::TeddySlim2Mask256 {
+ mask1: runtime::Mask256::new(masks[0]),
+ mask2: runtime::Mask256::new(masks[1]),
+ },
+ ),
+ }),
+ (2, true, true) => Some(Teddy {
+ buckets,
+ max_pattern_id: patterns.max_pattern_id(),
+ exec: runtime::Exec::TeddyFat2Mask256(
+ runtime::TeddyFat2Mask256 {
+ mask1: runtime::Mask256::new(masks[0]),
+ mask2: runtime::Mask256::new(masks[1]),
+ },
+ ),
+ }),
+ (3, false, _) => Some(Teddy {
+ buckets,
+ max_pattern_id: patterns.max_pattern_id(),
+ exec: runtime::Exec::TeddySlim3Mask128(
+ runtime::TeddySlim3Mask128 {
+ mask1: runtime::Mask128::new(masks[0]),
+ mask2: runtime::Mask128::new(masks[1]),
+ mask3: runtime::Mask128::new(masks[2]),
+ },
+ ),
+ }),
+ (3, true, false) => Some(Teddy {
+ buckets,
+ max_pattern_id: patterns.max_pattern_id(),
+ exec: runtime::Exec::TeddySlim3Mask256(
+ runtime::TeddySlim3Mask256 {
+ mask1: runtime::Mask256::new(masks[0]),
+ mask2: runtime::Mask256::new(masks[1]),
+ mask3: runtime::Mask256::new(masks[2]),
+ },
+ ),
+ }),
+ (3, true, true) => Some(Teddy {
+ buckets,
+ max_pattern_id: patterns.max_pattern_id(),
+ exec: runtime::Exec::TeddyFat3Mask256(
+ runtime::TeddyFat3Mask256 {
+ mask1: runtime::Mask256::new(masks[0]),
+ mask2: runtime::Mask256::new(masks[1]),
+ mask3: runtime::Mask256::new(masks[2]),
+ },
+ ),
+ }),
+ _ => unreachable!(),
+ }
+ }
+}
+
+/// A compiler is in charge of allocating patterns into buckets and generating
+/// the masks necessary for searching.
+#[derive(Clone)]
+struct Compiler<'p> {
+ patterns: &'p Patterns,
+ buckets: Vec<Vec<PatternID>>,
+ masks: Vec<Mask>,
+}
+
+impl<'p> Compiler<'p> {
+ /// Create a new Teddy compiler for the given patterns. If `fat` is true,
+ /// then 16 buckets will be used instead of 8.
+ ///
+ /// This panics if any of the patterns given are empty.
+ fn new(patterns: &'p Patterns, fat: bool) -> Compiler<'p> {
+ let mask_len = cmp::min(3, patterns.minimum_len());
+ assert!(1 <= mask_len && mask_len <= 3);
+
+ Compiler {
+ patterns,
+ buckets: vec![vec![]; if fat { 16 } else { 8 }],
+ masks: vec![Mask::default(); mask_len],
+ }
+ }
+
+ /// Compile the patterns in this compiler into buckets and masks.
+ fn compile(&mut self) {
+ let mut lonibble_to_bucket: BTreeMap<Vec<u8>, usize> = BTreeMap::new();
+ for (id, pattern) in self.patterns.iter() {
+ // We try to be slightly clever in how we assign patterns into
+ // buckets. Generally speaking, we want patterns with the same
+ // prefix to be in the same bucket, since it minimizes the amount
+ // of time we spend churning through buckets in the verification
+ // step.
+ //
+ // So we could assign patterns with the same N-prefix (where N
+ // is the size of the mask, which is one of {1, 2, 3}) to the
+ // same bucket. However, case insensitive searches are fairly
+        // common, so, for example, we'd ideally want to treat `abc` and
+        // `ABC` as if they shared the same prefix. ASCII has the nice
+        // property that the lower 4 bits of `A` and `a` are the same, so we
+        // therefore group patterns with the same low-nybble-N-prefix into
+ // the same bucket.
+ //
+ // MOREOVER, this is actually necessary for correctness! In
+ // particular, by grouping patterns with the same prefix into the
+ // same bucket, we ensure that we preserve correct leftmost-first
+ // and leftmost-longest match semantics. In addition to the fact
+ // that `patterns.iter()` iterates in the correct order, this
+ // guarantees that all possible ambiguous matches will occur in
+ // the same bucket. The verification routine could be adjusted to
+ // support correct leftmost match semantics regardless of bucket
+ // allocation, but that results in a performance hit. It's much
+ // nicer to be able to just stop as soon as a match is found.
+ let lonybs = pattern.low_nybbles(self.masks.len());
+ if let Some(&bucket) = lonibble_to_bucket.get(&lonybs) {
+ self.buckets[bucket].push(id);
+ } else {
+ // N.B. We assign buckets in reverse because it shouldn't have
+ // any influence on performance, but it does make it harder to
+ // get leftmost match semantics accidentally correct.
+ let bucket = (self.buckets.len() - 1)
+ - (id as usize % self.buckets.len());
+ self.buckets[bucket].push(id);
+ lonibble_to_bucket.insert(lonybs, bucket);
+ }
+ }
+ for (bucket_index, bucket) in self.buckets.iter().enumerate() {
+ for &pat_id in bucket {
+ let pat = self.patterns.get(pat_id);
+ for (i, mask) in self.masks.iter_mut().enumerate() {
+ if self.buckets.len() == 8 {
+ mask.add_slim(bucket_index as u8, pat.bytes()[i]);
+ } else {
+ mask.add_fat(bucket_index as u8, pat.bytes()[i]);
+ }
+ }
+ }
+ }
+ }
+}
+
+impl<'p> fmt::Debug for Compiler<'p> {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ let mut buckets = vec![vec![]; self.buckets.len()];
+ for (i, bucket) in self.buckets.iter().enumerate() {
+ for &patid in bucket {
+ buckets[i].push(self.patterns.get(patid));
+ }
+ }
+ f.debug_struct("Compiler")
+ .field("buckets", &buckets)
+ .field("masks", &self.masks)
+ .finish()
+ }
+}
+
+/// Mask represents the low and high nybble masks that will be used during
+/// search. Each mask is 32 bytes wide, although only the first 16 bytes are
+/// used for the SSSE3 runtime.
+///
+/// Each byte in the mask corresponds to an 8-bit bitset, where bit `i` is set
+/// if and only if the corresponding nybble is in the ith bucket. The index of
+/// the byte (0-15, inclusive) corresponds to the nybble.
+///
+/// Each mask is used as the target of a shuffle, where the indices for the
+/// shuffle are taken from the haystack. AND'ing the shuffles for both the
+/// low and high masks together also results in 8-bit bitsets, but where bit
+/// `i` is set if and only if the corresponding *byte* is in the ith bucket.
+///
+/// During compilation, masks are just arrays. But during search, these masks
+/// are represented as 128-bit or 256-bit vectors.
+///
+/// (See the README in this directory for more details.)
+#[derive(Clone, Copy, Default)]
+pub struct Mask {
+ lo: [u8; 32],
+ hi: [u8; 32],
+}
+
+impl Mask {
+ /// Update this mask by adding the given byte to the given bucket. The
+ /// given bucket must be in the range 0-7.
+ ///
+ /// This is for "slim" Teddy, where there are only 8 buckets.
+ fn add_slim(&mut self, bucket: u8, byte: u8) {
+ assert!(bucket < 8);
+
+ let byte_lo = (byte & 0xF) as usize;
+ let byte_hi = ((byte >> 4) & 0xF) as usize;
+ // When using 256-bit vectors, we need to set this bucket assignment in
+ // the low and high 128-bit portions of the mask. This allows us to
+ // process 32 bytes at a time. Namely, AVX2 shuffles operate on each
+ // of the 128-bit lanes, rather than the full 256-bit vector at once.
+ self.lo[byte_lo] |= 1 << bucket;
+ self.lo[byte_lo + 16] |= 1 << bucket;
+ self.hi[byte_hi] |= 1 << bucket;
+ self.hi[byte_hi + 16] |= 1 << bucket;
+ }
+
+ /// Update this mask by adding the given byte to the given bucket. The
+ /// given bucket must be in the range 0-15.
+ ///
+ /// This is for "fat" Teddy, where there are 16 buckets.
+ fn add_fat(&mut self, bucket: u8, byte: u8) {
+ assert!(bucket < 16);
+
+ let byte_lo = (byte & 0xF) as usize;
+ let byte_hi = ((byte >> 4) & 0xF) as usize;
+ // Unlike slim teddy, fat teddy only works with AVX2. For fat teddy,
+ // the high 128 bits of our mask correspond to buckets 8-15, while the
+ // low 128 bits correspond to buckets 0-7.
+ if bucket < 8 {
+ self.lo[byte_lo] |= 1 << bucket;
+ self.hi[byte_hi] |= 1 << bucket;
+ } else {
+ self.lo[byte_lo + 16] |= 1 << (bucket % 8);
+ self.hi[byte_hi + 16] |= 1 << (bucket % 8);
+ }
+ }
+
+ /// Return the low 128 bits of the low-nybble mask.
+ pub fn lo128(&self) -> [u8; 16] {
+ let mut tmp = [0; 16];
+ tmp.copy_from_slice(&self.lo[..16]);
+ tmp
+ }
+
+ /// Return the full low-nybble mask.
+ pub fn lo256(&self) -> [u8; 32] {
+ self.lo
+ }
+
+ /// Return the low 128 bits of the high-nybble mask.
+ pub fn hi128(&self) -> [u8; 16] {
+ let mut tmp = [0; 16];
+ tmp.copy_from_slice(&self.hi[..16]);
+ tmp
+ }
+
+ /// Return the full high-nybble mask.
+ pub fn hi256(&self) -> [u8; 32] {
+ self.hi
+ }
+}
+
+impl fmt::Debug for Mask {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ let (mut parts_lo, mut parts_hi) = (vec![], vec![]);
+ for i in 0..32 {
+ parts_lo.push(format!("{:02}: {:08b}", i, self.lo[i]));
+ parts_hi.push(format!("{:02}: {:08b}", i, self.hi[i]));
+ }
+ f.debug_struct("Mask")
+ .field("lo", &parts_lo)
+ .field("hi", &parts_hi)
+ .finish()
+ }
+}
diff --git a/src/packed/teddy/mod.rs b/src/packed/teddy/mod.rs
new file mode 100644
index 0000000..b896023
--- /dev/null
+++ b/src/packed/teddy/mod.rs
@@ -0,0 +1,62 @@
+#[cfg(target_arch = "x86_64")]
+pub use packed::teddy::compile::Builder;
+#[cfg(not(target_arch = "x86_64"))]
+pub use packed::teddy::fallback::Builder;
+#[cfg(not(target_arch = "x86_64"))]
+pub use packed::teddy::fallback::Teddy;
+#[cfg(target_arch = "x86_64")]
+pub use packed::teddy::runtime::Teddy;
+
+#[cfg(target_arch = "x86_64")]
+mod compile;
+#[cfg(target_arch = "x86_64")]
+mod runtime;
+
+#[cfg(not(target_arch = "x86_64"))]
+mod fallback {
+ use packed::pattern::Patterns;
+ use Match;
+
+ #[derive(Clone, Debug, Default)]
+ pub struct Builder(());
+
+ impl Builder {
+ pub fn new() -> Builder {
+ Builder(())
+ }
+
+ pub fn build(&self, _: &Patterns) -> Option<Teddy> {
+ None
+ }
+
+ pub fn fat(&mut self, _: Option<bool>) -> &mut Builder {
+ self
+ }
+
+ pub fn avx(&mut self, _: Option<bool>) -> &mut Builder {
+ self
+ }
+ }
+
+ #[derive(Clone, Debug)]
+ pub struct Teddy(());
+
+ impl Teddy {
+ pub fn find_at(
+ &self,
+ _: &Patterns,
+ _: &[u8],
+ _: usize,
+ ) -> Option<Match> {
+ None
+ }
+
+ pub fn minimum_len(&self) -> usize {
+ 0
+ }
+
+ pub fn heap_bytes(&self) -> usize {
+ 0
+ }
+ }
+}
diff --git a/src/packed/teddy/runtime.rs b/src/packed/teddy/runtime.rs
new file mode 100644
index 0000000..a736948
--- /dev/null
+++ b/src/packed/teddy/runtime.rs
@@ -0,0 +1,1204 @@
+// See the README in this directory for an explanation of the Teddy algorithm.
+// It is strongly recommended to peruse the README before trying to grok this
+// code, as its use of SIMD is pretty opaque, although I tried to add comments
+// where appropriate.
+//
+// Moreover, while there is a lot of code in this file, most of it is
+// repeated variants of the same thing. Specifically, there are three Teddy
+// variants: Slim 128-bit Teddy (8 buckets), Slim 256-bit Teddy (8 buckets)
+// and Fat 256-bit Teddy (16 buckets). For each variant, there are three
+// implementations, corresponding to mask lengths of 1, 2 and 3, bringing the
+// total to nine variants. Each one is structured roughly the same:
+//
+// while at <= len(haystack) - CHUNK_SIZE:
+// let candidate = find_candidate_in_chunk(haystack, at)
+// if not all zeroes(candidate):
+// if match = verify(haystack, at, candidate):
+// return match
+//
+// For the most part, this remains unchanged. The parts that vary are the
+// verification routine (for slim vs fat Teddy) and the candidate extraction
+// (based on the number of masks).
+//
+// In the code below, a "candidate" corresponds to a single vector with 8-bit
+// lanes. Each lane is itself an 8-bit bitset, where the ith bit is set in the
+// jth lane if and only if the byte occurring at position `j` is in the
+// bucket `i` (where the `j`th position is the position in the current window
+// of the haystack, which is always 16 or 32 bytes). Note: be careful here, as
+// the ith bit and the jth lane correspond to the least significant bits of the
+// vector. So when visualizing how the current window of bytes is stored in a
+// vector, you often need to flip it around. For example, the text `abcd` in a
+// 4-byte vector would look like this:
+//
+// 01100100 01100011 01100010 01100001
+// d c b a
+//
+// When the mask length is 1, finding the candidate is straightforward: you
+// just apply the shuffle indices (from the haystack window) to
+// the masks, and then AND them together, as described in the README. But for
+// masks of length 2 and 3, you need to keep a little state. Specifically,
+// you need to store the final 1 (for mask length 2) or 2 (for mask length 3)
+// bytes of the candidate for use when searching the next window. This is for
+// handling matches that span two windows.
+//
+// With respect to the repeated code, it would likely be possible to reduce
+// the number of copies of code below using polymorphism, but I find this
+// formulation clearer instead of needing to reason through generics. However,
+// I admit, there may be a simpler generic construction that I'm missing.
+//
+// All variants are fairly heavily tested in src/packed/tests.rs.
+
+use std::arch::x86_64::*;
+use std::mem;
+
+use packed::pattern::{PatternID, Patterns};
+use packed::teddy::compile;
+use packed::vector::*;
+use Match;
+
+/// The Teddy runtime.
+///
+/// A Teddy runtime can be used to quickly search for occurrences of one or
+/// more patterns. While it does not scale to an arbitrary number of patterns
+/// like Aho-Corasick, it does find occurrences for a small set of patterns
+/// much more quickly than Aho-Corasick.
+///
+/// Teddy cannot run on small haystacks below a certain size, which is
+/// dependent on the type of matcher used. This size can be queried via the
+/// `minimum_len` method. Violating this will result in a panic.
+///
+/// Finally, when callers use a Teddy runtime, they must provide precisely the
+/// patterns used to construct the Teddy matcher. Violating this will result
+/// in either a panic or incorrect results, but will never sacrifice memory
+/// safety.
+#[derive(Clone, Debug)]
+pub struct Teddy {
+ /// The allocation of patterns in buckets. This only contains the IDs of
+ /// patterns. In order to do full verification, callers must provide the
+ /// actual patterns when using Teddy.
+ pub buckets: Vec<Vec<PatternID>>,
+ /// The maximum identifier of a pattern. This is used as a sanity check to
+ /// ensure that the patterns provided by the caller are the same as the
+ /// patterns that were used to compile the matcher. This sanity check
+ /// permits safely eliminating bounds checks regardless of what patterns
+ /// are provided by the caller.
+ ///
+ /// Note that users of the aho-corasick crate cannot get this wrong. Only
+    /// code internal to this crate can get it wrong, since neither the
+    /// `Patterns` type nor the Teddy runtime are public API items.
+ pub max_pattern_id: PatternID,
+ /// The actual runtime to use.
+ pub exec: Exec,
+}
+
+impl Teddy {
+ /// Return the first occurrence of a match in the given haystack after or
+ /// starting at `at`.
+ ///
+ /// The patterns provided must be precisely the same patterns given to the
+ /// Teddy builder, otherwise this may panic or produce incorrect results.
+ ///
+ /// All matches are consistent with the match semantics (leftmost-first or
+ /// leftmost-longest) set on `pats`.
+ pub fn find_at(
+ &self,
+ pats: &Patterns,
+ haystack: &[u8],
+ at: usize,
+ ) -> Option<Match> {
+ // This assert is a bit subtle, but it's an important guarantee.
+ // Namely, if the maximum pattern ID seen by Teddy is the same as the
+ // one in the patterns given, then we are guaranteed that every pattern
+ // ID in all Teddy buckets are valid indices into `pats`. While this
+ // is nominally true, there is no guarantee that callers provide the
+ // same `pats` to both the Teddy builder and the searcher, which would
+ // otherwise make `find_at` unsafe to call. But this assert lets us
+ // keep this routine safe and eliminate an important bounds check in
+ // verification.
+ assert_eq!(
+ self.max_pattern_id,
+ pats.max_pattern_id(),
+ "teddy must be called with same patterns it was built with",
+ );
+ // SAFETY: The haystack must have at least a minimum number of bytes
+ // for Teddy to be able to work. The minimum number varies depending on
+ // which matcher is used below. If this is violated, then it's possible
+ // for searching to do out-of-bounds writes.
+ assert!(haystack[at..].len() >= self.minimum_len());
+ // SAFETY: The various Teddy matchers are always safe to call because
+ // the Teddy builder guarantees that a particular Exec variant is
+        // built only when it can be run on the current CPU. That is, the Teddy
+        // builder will not produce an Exec::TeddySlim1Mask256 unless AVX2 is
+ // enabled. That is, our dynamic CPU feature detection is performed
+ // once in the builder, and we rely on the type system to avoid needing
+ // to do it again.
+ unsafe {
+ match self.exec {
+ Exec::TeddySlim1Mask128(ref e) => {
+ e.find_at(pats, self, haystack, at)
+ }
+ Exec::TeddySlim1Mask256(ref e) => {
+ e.find_at(pats, self, haystack, at)
+ }
+ Exec::TeddyFat1Mask256(ref e) => {
+ e.find_at(pats, self, haystack, at)
+ }
+ Exec::TeddySlim2Mask128(ref e) => {
+ e.find_at(pats, self, haystack, at)
+ }
+ Exec::TeddySlim2Mask256(ref e) => {
+ e.find_at(pats, self, haystack, at)
+ }
+ Exec::TeddyFat2Mask256(ref e) => {
+ e.find_at(pats, self, haystack, at)
+ }
+ Exec::TeddySlim3Mask128(ref e) => {
+ e.find_at(pats, self, haystack, at)
+ }
+ Exec::TeddySlim3Mask256(ref e) => {
+ e.find_at(pats, self, haystack, at)
+ }
+ Exec::TeddyFat3Mask256(ref e) => {
+ e.find_at(pats, self, haystack, at)
+ }
+ }
+ }
+ }
+
+ /// Returns the minimum length of a haystack that must be provided by
+ /// callers to this Teddy searcher. Providing a haystack shorter than this
+ /// will result in a panic, but will never violate memory safety.
+ pub fn minimum_len(&self) -> usize {
+ // SAFETY: These values must be correct in order to ensure safety.
+        // The Teddy runtime assumes its haystacks have at least these
+ // lengths. Violating this will sacrifice memory safety.
+ match self.exec {
+ Exec::TeddySlim1Mask128(_) => 16,
+ Exec::TeddySlim1Mask256(_) => 32,
+ Exec::TeddyFat1Mask256(_) => 16,
+ Exec::TeddySlim2Mask128(_) => 17,
+ Exec::TeddySlim2Mask256(_) => 33,
+ Exec::TeddyFat2Mask256(_) => 17,
+ Exec::TeddySlim3Mask128(_) => 18,
+ Exec::TeddySlim3Mask256(_) => 34,
+ Exec::TeddyFat3Mask256(_) => 34,
+ }
+ }
+
+ /// Returns the approximate total amount of heap used by this searcher, in
+ /// units of bytes.
+ pub fn heap_bytes(&self) -> usize {
+ let num_patterns = self.max_pattern_id as usize + 1;
+ self.buckets.len() * mem::size_of::<Vec<PatternID>>()
+ + num_patterns * mem::size_of::<PatternID>()
+ }
+
+ /// Runs the verification routine for Slim 128-bit Teddy.
+ ///
+ /// The candidate given should be a collection of 8-bit bitsets (one bitset
+ /// per lane), where the ith bit is set in the jth lane if and only if the
+ /// byte occurring at `at + j` in `haystack` is in the bucket `i`.
+ ///
+ /// This is not safe to call unless the SSSE3 target feature is enabled.
+ /// The `target_feature` attribute is not applied since this function is
+ /// always forcefully inlined.
+ #[inline(always)]
+ unsafe fn verify128(
+ &self,
+ pats: &Patterns,
+ haystack: &[u8],
+ at: usize,
+ cand: __m128i,
+ ) -> Option<Match> {
+ debug_assert!(!is_all_zeroes128(cand));
+ debug_assert_eq!(8, self.buckets.len());
+
+ // Convert the candidate into 64-bit chunks, and then verify each of
+ // those chunks.
+ let parts = unpack64x128(cand);
+ for (i, &part) in parts.iter().enumerate() {
+ let pos = at + i * 8;
+ if let Some(m) = self.verify64(pats, 8, haystack, pos, part) {
+ return Some(m);
+ }
+ }
+ None
+ }
+
+ /// Runs the verification routine for Slim 256-bit Teddy.
+ ///
+ /// The candidate given should be a collection of 8-bit bitsets (one bitset
+ /// per lane), where the ith bit is set in the jth lane if and only if the
+ /// byte occurring at `at + j` in `haystack` is in the bucket `i`.
+ ///
+ /// This is not safe to call unless the AVX2 target feature is enabled.
+ /// The `target_feature` attribute is not applied since this function is
+ /// always forcefully inlined.
+ #[inline(always)]
+ unsafe fn verify256(
+ &self,
+ pats: &Patterns,
+ haystack: &[u8],
+ at: usize,
+ cand: __m256i,
+ ) -> Option<Match> {
+ debug_assert!(!is_all_zeroes256(cand));
+ debug_assert_eq!(8, self.buckets.len());
+
+ // Convert the candidate into 64-bit chunks, and then verify each of
+ // those chunks.
+ let parts = unpack64x256(cand);
+ for (i, &part) in parts.iter().enumerate() {
+ let pos = at + i * 8;
+ if let Some(m) = self.verify64(pats, 8, haystack, pos, part) {
+ return Some(m);
+ }
+ }
+ None
+ }
+
+ /// Runs the verification routine for Fat 256-bit Teddy.
+ ///
+ /// The candidate given should be a collection of 8-bit bitsets (one bitset
+ /// per lane), where the ith bit is set in the jth lane if and only if the
+ /// byte occurring at `at + (j < 16 ? j : j - 16)` in `haystack` is in the
+ /// bucket `j < 16 ? i : i + 8`.
+ ///
+ /// This is not safe to call unless the AVX2 target feature is enabled.
+ /// The `target_feature` attribute is not applied since this function is
+ /// always forcefully inlined.
+ #[inline(always)]
+ unsafe fn verify_fat256(
+ &self,
+ pats: &Patterns,
+ haystack: &[u8],
+ at: usize,
+ cand: __m256i,
+ ) -> Option<Match> {
+ debug_assert!(!is_all_zeroes256(cand));
+ debug_assert_eq!(16, self.buckets.len());
+
+ // This is a bit tricky, but we basically want to convert our
+ // candidate, which looks like this
+ //
+ // a31 a30 ... a17 a16 a15 a14 ... a01 a00
+ //
+ // where each a(i) is an 8-bit bitset corresponding to the activated
+ // buckets, to this
+ //
+ // a31 a15 a30 a14 a29 a13 ... a18 a02 a17 a01 a16 a00
+ //
+ // Namely, for Fat Teddy, the high 128-bits of the candidate correspond
+ // to the same bytes in the haystack in the low 128-bits (so we only
+ // scan 16 bytes at a time), but are for buckets 8-15 instead of 0-7.
+ //
+ // The verification routine wants to look at all potentially matching
+        // buckets before moving on to the next lane. So, for example, a16
+        // and a00 both correspond to the first byte in our window; a00
+ // contains buckets 0-7 and a16 contains buckets 8-15. Specifically,
+ // a16 should be checked before a01. So the transformation shown above
+ // allows us to use our normal verification procedure with one small
+ // change: we treat each bitset as 16 bits instead of 8 bits.
+
+ // Swap the 128-bit lanes in the candidate vector.
+ let swap = _mm256_permute4x64_epi64(cand, 0x4E);
+ // Interleave the bytes from the low 128-bit lanes, starting with
+ // cand first.
+ let r1 = _mm256_unpacklo_epi8(cand, swap);
+ // Interleave the bytes from the high 128-bit lanes, starting with
+ // cand first.
+ let r2 = _mm256_unpackhi_epi8(cand, swap);
+ // Now just take the 2 low 64-bit integers from both r1 and r2. We
+ // can drop the high 64-bit integers because they are a mirror image
+ // of the low 64-bit integers. All we care about are the low 128-bit
+ // lanes of r1 and r2. Combined, they contain all our 16-bit bitsets
+ // laid out in the desired order, as described above.
+ let parts = unpacklo64x256(r1, r2);
+ for (i, &part) in parts.iter().enumerate() {
+ let pos = at + i * 4;
+ if let Some(m) = self.verify64(pats, 16, haystack, pos, part) {
+ return Some(m);
+ }
+ }
+ None
+ }
+
+ /// Verify whether there are any matches starting at or after `at` in the
+ /// given `haystack`. The candidate given should correspond to either 8-bit
+ /// (for 8 buckets) or 16-bit (16 buckets) bitsets.
+ #[inline(always)]
+ fn verify64(
+ &self,
+ pats: &Patterns,
+ bucket_count: usize,
+ haystack: &[u8],
+ at: usize,
+ mut cand: u64,
+ ) -> Option<Match> {
+ // N.B. While the bucket count is known from self.buckets.len(),
+ // requiring it as a parameter makes it easier for the optimizer to
+ // know its value, and thus produce more efficient codegen.
+ debug_assert!(bucket_count == 8 || bucket_count == 16);
+ while cand != 0 {
+ let bit = cand.trailing_zeros() as usize;
+ cand &= !(1 << bit);
+
+ let at = at + (bit / bucket_count);
+ let bucket = bit % bucket_count;
+ if let Some(m) = self.verify_bucket(pats, haystack, bucket, at) {
+ return Some(m);
+ }
+ }
+ None
+ }
+
+ /// Verify whether there are any matches starting at `at` in the given
+ /// `haystack` corresponding only to patterns in the given bucket.
+ #[inline(always)]
+ fn verify_bucket(
+ &self,
+ pats: &Patterns,
+ haystack: &[u8],
+ bucket: usize,
+ at: usize,
+ ) -> Option<Match> {
+ // Forcing this function to not inline and be "cold" seems to help
+ // the codegen for Teddy overall. Interestingly, this is good for a
+ // 16% boost in the sherlock/packed/teddy/name/alt1 benchmark (among
+ // others). Overall, this seems like a problem with codegen, since
+ // creating the Match itself is a very small amount of code.
+ #[cold]
+ #[inline(never)]
+ fn match_from_span(
+ pati: PatternID,
+ start: usize,
+ end: usize,
+ ) -> Match {
+ Match::from_span(pati as usize, start, end)
+ }
+
+ // N.B. The bounds check for this bucket lookup *should* be elided
+ // since we assert the number of buckets in each `find_at` routine,
+ // and the compiler can prove that the `% 8` (or `% 16`) in callers
+ // of this routine will always be in bounds.
+ for &pati in &self.buckets[bucket] {
+ // SAFETY: This is safe because we are guaranteed that every
+ // index in a Teddy bucket is a valid index into `pats`. This
+ // guarantee is upheld by the assert checking `max_pattern_id` in
+ // the beginning of `find_at` above.
+ //
+ // This explicit bounds check elision is (amazingly) good for a
+ // 25-50% boost in some benchmarks, particularly ones with a lot
+ // of short literals.
+ let pat = unsafe { pats.get_unchecked(pati) };
+ if pat.is_prefix(&haystack[at..]) {
+ return Some(match_from_span(pati, at, at + pat.len()));
+ }
+ }
+ None
+ }
+}
+
+/// Exec represents the different search strategies supported by the Teddy
+/// runtime.
+///
+/// This enum is an important safety abstraction. Namely, callers should only
+/// construct a variant in this enum if it is safe to execute its corresponding
+/// target features on the current CPU. The 128-bit searchers require SSSE3,
+/// while the 256-bit searchers require AVX2.
+#[derive(Clone, Debug)]
+pub enum Exec {
+ TeddySlim1Mask128(TeddySlim1Mask128),
+ TeddySlim1Mask256(TeddySlim1Mask256),
+ TeddyFat1Mask256(TeddyFat1Mask256),
+ TeddySlim2Mask128(TeddySlim2Mask128),
+ TeddySlim2Mask256(TeddySlim2Mask256),
+ TeddyFat2Mask256(TeddyFat2Mask256),
+ TeddySlim3Mask128(TeddySlim3Mask128),
+ TeddySlim3Mask256(TeddySlim3Mask256),
+ TeddyFat3Mask256(TeddyFat3Mask256),
+}
+
+// Most of the code below remains undocumented because it consists of
+// effectively repeated variants of the same routines. The general structure
+// is described in the
+// README and in the comments above.
+
+#[derive(Clone, Debug)]
+pub struct TeddySlim1Mask128 {
+ pub mask1: Mask128,
+}
+
+impl TeddySlim1Mask128 {
+ #[target_feature(enable = "ssse3")]
+ unsafe fn find_at(
+ &self,
+ pats: &Patterns,
+ teddy: &Teddy,
+ haystack: &[u8],
+ mut at: usize,
+ ) -> Option<Match> {
+ debug_assert!(haystack[at..].len() >= teddy.minimum_len());
+ // This assert helps eliminate bounds checks for bucket lookups in
+ // Teddy::verify_bucket, which has a small (3-4%) performance boost.
+ assert_eq!(8, teddy.buckets.len());
+
+ let len = haystack.len();
+ while at <= len - 16 {
+ let c = self.candidate(haystack, at);
+ if !is_all_zeroes128(c) {
+ if let Some(m) = teddy.verify128(pats, haystack, at, c) {
+ return Some(m);
+ }
+ }
+ at += 16;
+ }
+ if at < len {
+ at = len - 16;
+ let c = self.candidate(haystack, at);
+ if !is_all_zeroes128(c) {
+ if let Some(m) = teddy.verify128(pats, haystack, at, c) {
+ return Some(m);
+ }
+ }
+ }
+ None
+ }
+
+ #[inline(always)]
+ unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m128i {
+ debug_assert!(haystack[at..].len() >= 16);
+
+ let chunk = loadu128(haystack, at);
+ members1m128(chunk, self.mask1)
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct TeddySlim1Mask256 {
+ pub mask1: Mask256,
+}
+
+impl TeddySlim1Mask256 {
+ #[target_feature(enable = "avx2")]
+ unsafe fn find_at(
+ &self,
+ pats: &Patterns,
+ teddy: &Teddy,
+ haystack: &[u8],
+ mut at: usize,
+ ) -> Option<Match> {
+ debug_assert!(haystack[at..].len() >= teddy.minimum_len());
+ // This assert helps eliminate bounds checks for bucket lookups in
+ // Teddy::verify_bucket, which has a small (3-4%) performance boost.
+ assert_eq!(8, teddy.buckets.len());
+
+ let len = haystack.len();
+ while at <= len - 32 {
+ let c = self.candidate(haystack, at);
+ if !is_all_zeroes256(c) {
+ if let Some(m) = teddy.verify256(pats, haystack, at, c) {
+ return Some(m);
+ }
+ }
+ at += 32;
+ }
+ if at < len {
+ at = len - 32;
+ let c = self.candidate(haystack, at);
+ if !is_all_zeroes256(c) {
+ if let Some(m) = teddy.verify256(pats, haystack, at, c) {
+ return Some(m);
+ }
+ }
+ }
+ None
+ }
+
+ #[inline(always)]
+ unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m256i {
+ debug_assert!(haystack[at..].len() >= 32);
+
+ let chunk = loadu256(haystack, at);
+ members1m256(chunk, self.mask1)
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct TeddyFat1Mask256 {
+ pub mask1: Mask256,
+}
+
+impl TeddyFat1Mask256 {
+ #[target_feature(enable = "avx2")]
+ unsafe fn find_at(
+ &self,
+ pats: &Patterns,
+ teddy: &Teddy,
+ haystack: &[u8],
+ mut at: usize,
+ ) -> Option<Match> {
+ debug_assert!(haystack[at..].len() >= teddy.minimum_len());
+ // This assert helps eliminate bounds checks for bucket lookups in
+ // Teddy::verify_bucket, which has a small (3-4%) performance boost.
+ assert_eq!(16, teddy.buckets.len());
+
+ let len = haystack.len();
+ while at <= len - 16 {
+ let c = self.candidate(haystack, at);
+ if !is_all_zeroes256(c) {
+ if let Some(m) = teddy.verify_fat256(pats, haystack, at, c) {
+ return Some(m);
+ }
+ }
+ at += 16;
+ }
+ if at < len {
+ at = len - 16;
+ let c = self.candidate(haystack, at);
+ if !is_all_zeroes256(c) {
+ if let Some(m) = teddy.verify_fat256(pats, haystack, at, c) {
+ return Some(m);
+ }
+ }
+ }
+ None
+ }
+
+ #[inline(always)]
+ unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m256i {
+ debug_assert!(haystack[at..].len() >= 16);
+
+ let chunk = _mm256_broadcastsi128_si256(loadu128(haystack, at));
+ members1m256(chunk, self.mask1)
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct TeddySlim2Mask128 {
+ pub mask1: Mask128,
+ pub mask2: Mask128,
+}
+
+impl TeddySlim2Mask128 {
+ #[target_feature(enable = "ssse3")]
+ unsafe fn find_at(
+ &self,
+ pats: &Patterns,
+ teddy: &Teddy,
+ haystack: &[u8],
+ mut at: usize,
+ ) -> Option<Match> {
+ debug_assert!(haystack[at..].len() >= teddy.minimum_len());
+ // This assert helps eliminate bounds checks for bucket lookups in
+ // Teddy::verify_bucket, which has a small (3-4%) performance boost.
+ assert_eq!(8, teddy.buckets.len());
+
+ at += 1;
+ let len = haystack.len();
+ let mut prev0 = ones128();
+ while at <= len - 16 {
+ let c = self.candidate(haystack, at, &mut prev0);
+ if !is_all_zeroes128(c) {
+ if let Some(m) = teddy.verify128(pats, haystack, at - 1, c) {
+ return Some(m);
+ }
+ }
+ at += 16;
+ }
+ if at < len {
+ at = len - 16;
+ prev0 = ones128();
+
+ let c = self.candidate(haystack, at, &mut prev0);
+ if !is_all_zeroes128(c) {
+ if let Some(m) = teddy.verify128(pats, haystack, at - 1, c) {
+ return Some(m);
+ }
+ }
+ }
+ None
+ }
+
+ #[inline(always)]
+ unsafe fn candidate(
+ &self,
+ haystack: &[u8],
+ at: usize,
+ prev0: &mut __m128i,
+ ) -> __m128i {
+ debug_assert!(haystack[at..].len() >= 16);
+
+ let chunk = loadu128(haystack, at);
+ let (res0, res1) = members2m128(chunk, self.mask1, self.mask2);
+        let res0prev0 = _mm_alignr_epi8(res0, *prev0, 15);
+        let res = _mm_and_si128(res0prev0, res1);
+        // Save this window's first-byte candidates so that the next window
+        // can align against them (as the 256-bit variants below do).
+        *prev0 = res0;
+        res
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct TeddySlim2Mask256 {
+ pub mask1: Mask256,
+ pub mask2: Mask256,
+}
+
+impl TeddySlim2Mask256 {
+ #[target_feature(enable = "avx2")]
+ unsafe fn find_at(
+ &self,
+ pats: &Patterns,
+ teddy: &Teddy,
+ haystack: &[u8],
+ mut at: usize,
+ ) -> Option<Match> {
+ debug_assert!(haystack[at..].len() >= teddy.minimum_len());
+ // This assert helps eliminate bounds checks for bucket lookups in
+ // Teddy::verify_bucket, which has a small (3-4%) performance boost.
+ assert_eq!(8, teddy.buckets.len());
+
+ at += 1;
+ let len = haystack.len();
+ let mut prev0 = ones256();
+ while at <= len - 32 {
+ let c = self.candidate(haystack, at, &mut prev0);
+ if !is_all_zeroes256(c) {
+ if let Some(m) = teddy.verify256(pats, haystack, at - 1, c) {
+ return Some(m);
+ }
+ }
+ at += 32;
+ }
+ if at < len {
+ at = len - 32;
+ prev0 = ones256();
+
+ let c = self.candidate(haystack, at, &mut prev0);
+ if !is_all_zeroes256(c) {
+ if let Some(m) = teddy.verify256(pats, haystack, at - 1, c) {
+ return Some(m);
+ }
+ }
+ }
+ None
+ }
+
+ #[inline(always)]
+ unsafe fn candidate(
+ &self,
+ haystack: &[u8],
+ at: usize,
+ prev0: &mut __m256i,
+ ) -> __m256i {
+ debug_assert!(haystack[at..].len() >= 32);
+
+ let chunk = loadu256(haystack, at);
+ let (res0, res1) = members2m256(chunk, self.mask1, self.mask2);
+ let res0prev0 = alignr256_15(res0, *prev0);
+ let res = _mm256_and_si256(res0prev0, res1);
+ *prev0 = res0;
+ res
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct TeddyFat2Mask256 {
+ pub mask1: Mask256,
+ pub mask2: Mask256,
+}
+
+impl TeddyFat2Mask256 {
+ #[target_feature(enable = "avx2")]
+ unsafe fn find_at(
+ &self,
+ pats: &Patterns,
+ teddy: &Teddy,
+ haystack: &[u8],
+ mut at: usize,
+ ) -> Option<Match> {
+ debug_assert!(haystack[at..].len() >= teddy.minimum_len());
+ // This assert helps eliminate bounds checks for bucket lookups in
+ // Teddy::verify_bucket, which has a small (3-4%) performance boost.
+ assert_eq!(16, teddy.buckets.len());
+
+ at += 1;
+ let len = haystack.len();
+ let mut prev0 = ones256();
+ while at <= len - 16 {
+ let c = self.candidate(haystack, at, &mut prev0);
+ if !is_all_zeroes256(c) {
+ if let Some(m) = teddy.verify_fat256(pats, haystack, at - 1, c)
+ {
+ return Some(m);
+ }
+ }
+ at += 16;
+ }
+ if at < len {
+ at = len - 16;
+ prev0 = ones256();
+
+ let c = self.candidate(haystack, at, &mut prev0);
+ if !is_all_zeroes256(c) {
+ if let Some(m) = teddy.verify_fat256(pats, haystack, at - 1, c)
+ {
+ return Some(m);
+ }
+ }
+ }
+ None
+ }
+
+ #[inline(always)]
+ unsafe fn candidate(
+ &self,
+ haystack: &[u8],
+ at: usize,
+ prev0: &mut __m256i,
+ ) -> __m256i {
+ debug_assert!(haystack[at..].len() >= 16);
+
+ let chunk = _mm256_broadcastsi128_si256(loadu128(haystack, at));
+ let (res0, res1) = members2m256(chunk, self.mask1, self.mask2);
+ let res0prev0 = _mm256_alignr_epi8(res0, *prev0, 15);
+ let res = _mm256_and_si256(res0prev0, res1);
+ *prev0 = res0;
+ res
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct TeddySlim3Mask128 {
+ pub mask1: Mask128,
+ pub mask2: Mask128,
+ pub mask3: Mask128,
+}
+
+impl TeddySlim3Mask128 {
+ #[target_feature(enable = "ssse3")]
+ unsafe fn find_at(
+ &self,
+ pats: &Patterns,
+ teddy: &Teddy,
+ haystack: &[u8],
+ mut at: usize,
+ ) -> Option<Match> {
+ debug_assert!(haystack[at..].len() >= teddy.minimum_len());
+ // This assert helps eliminate bounds checks for bucket lookups in
+ // Teddy::verify_bucket, which has a small (3-4%) performance boost.
+ assert_eq!(8, teddy.buckets.len());
+
+ at += 2;
+ let len = haystack.len();
+ let (mut prev0, mut prev1) = (ones128(), ones128());
+ while at <= len - 16 {
+ let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
+ if !is_all_zeroes128(c) {
+ if let Some(m) = teddy.verify128(pats, haystack, at - 2, c) {
+ return Some(m);
+ }
+ }
+ at += 16;
+ }
+ if at < len {
+ at = len - 16;
+ prev0 = ones128();
+ prev1 = ones128();
+
+ let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
+ if !is_all_zeroes128(c) {
+ if let Some(m) = teddy.verify128(pats, haystack, at - 2, c) {
+ return Some(m);
+ }
+ }
+ }
+ None
+ }
+
+ #[inline(always)]
+ unsafe fn candidate(
+ &self,
+ haystack: &[u8],
+ at: usize,
+ prev0: &mut __m128i,
+ prev1: &mut __m128i,
+ ) -> __m128i {
+ debug_assert!(haystack[at..].len() >= 16);
+
+ let chunk = loadu128(haystack, at);
+ let (res0, res1, res2) =
+ members3m128(chunk, self.mask1, self.mask2, self.mask3);
+ let res0prev0 = _mm_alignr_epi8(res0, *prev0, 14);
+ let res1prev1 = _mm_alignr_epi8(res1, *prev1, 15);
+ let res = _mm_and_si128(_mm_and_si128(res0prev0, res1prev1), res2);
+ *prev0 = res0;
+ *prev1 = res1;
+ res
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct TeddySlim3Mask256 {
+ pub mask1: Mask256,
+ pub mask2: Mask256,
+ pub mask3: Mask256,
+}
+
+impl TeddySlim3Mask256 {
+ #[target_feature(enable = "avx2")]
+ unsafe fn find_at(
+ &self,
+ pats: &Patterns,
+ teddy: &Teddy,
+ haystack: &[u8],
+ mut at: usize,
+ ) -> Option<Match> {
+ debug_assert!(haystack[at..].len() >= teddy.minimum_len());
+ // This assert helps eliminate bounds checks for bucket lookups in
+ // Teddy::verify_bucket, which has a small (3-4%) performance boost.
+ assert_eq!(8, teddy.buckets.len());
+
+ at += 2;
+ let len = haystack.len();
+ let (mut prev0, mut prev1) = (ones256(), ones256());
+ while at <= len - 32 {
+ let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
+ if !is_all_zeroes256(c) {
+ if let Some(m) = teddy.verify256(pats, haystack, at - 2, c) {
+ return Some(m);
+ }
+ }
+ at += 32;
+ }
+ if at < len {
+ at = len - 32;
+ prev0 = ones256();
+ prev1 = ones256();
+
+ let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
+ if !is_all_zeroes256(c) {
+ if let Some(m) = teddy.verify256(pats, haystack, at - 2, c) {
+ return Some(m);
+ }
+ }
+ }
+ None
+ }
+
+ #[inline(always)]
+ unsafe fn candidate(
+ &self,
+ haystack: &[u8],
+ at: usize,
+ prev0: &mut __m256i,
+ prev1: &mut __m256i,
+ ) -> __m256i {
+ debug_assert!(haystack[at..].len() >= 32);
+
+ let chunk = loadu256(haystack, at);
+ let (res0, res1, res2) =
+ members3m256(chunk, self.mask1, self.mask2, self.mask3);
+ let res0prev0 = alignr256_14(res0, *prev0);
+ let res1prev1 = alignr256_15(res1, *prev1);
+ let res =
+ _mm256_and_si256(_mm256_and_si256(res0prev0, res1prev1), res2);
+ *prev0 = res0;
+ *prev1 = res1;
+ res
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct TeddyFat3Mask256 {
+ pub mask1: Mask256,
+ pub mask2: Mask256,
+ pub mask3: Mask256,
+}
+
+impl TeddyFat3Mask256 {
+ #[target_feature(enable = "avx2")]
+ unsafe fn find_at(
+ &self,
+ pats: &Patterns,
+ teddy: &Teddy,
+ haystack: &[u8],
+ mut at: usize,
+ ) -> Option<Match> {
+ debug_assert!(haystack[at..].len() >= teddy.minimum_len());
+ // This assert helps eliminate bounds checks for bucket lookups in
+ // Teddy::verify_bucket, which has a small (3-4%) performance boost.
+ assert_eq!(16, teddy.buckets.len());
+
+ at += 2;
+ let len = haystack.len();
+ let (mut prev0, mut prev1) = (ones256(), ones256());
+ while at <= len - 16 {
+ let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
+ if !is_all_zeroes256(c) {
+ if let Some(m) = teddy.verify_fat256(pats, haystack, at - 2, c)
+ {
+ return Some(m);
+ }
+ }
+ at += 16;
+ }
+ if at < len {
+ at = len - 16;
+ prev0 = ones256();
+ prev1 = ones256();
+
+ let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
+ if !is_all_zeroes256(c) {
+ if let Some(m) = teddy.verify_fat256(pats, haystack, at - 2, c)
+ {
+ return Some(m);
+ }
+ }
+ }
+ None
+ }
+
+ #[inline(always)]
+ unsafe fn candidate(
+ &self,
+ haystack: &[u8],
+ at: usize,
+ prev0: &mut __m256i,
+ prev1: &mut __m256i,
+ ) -> __m256i {
+ debug_assert!(haystack[at..].len() >= 16);
+
+ let chunk = _mm256_broadcastsi128_si256(loadu128(haystack, at));
+ let (res0, res1, res2) =
+ members3m256(chunk, self.mask1, self.mask2, self.mask3);
+ let res0prev0 = _mm256_alignr_epi8(res0, *prev0, 14);
+ let res1prev1 = _mm256_alignr_epi8(res1, *prev1, 15);
+ let res =
+ _mm256_and_si256(_mm256_and_si256(res0prev0, res1prev1), res2);
+ *prev0 = res0;
+ *prev1 = res1;
+ res
+ }
+}
+
+/// A 128-bit mask for the low and high nybbles in a set of patterns. Each
+/// lane `j` corresponds to a bitset where the `i`th bit is set if and only if
+/// the nybble `j` is in the bucket `i` at a particular position.
+#[derive(Clone, Copy, Debug)]
+pub struct Mask128 {
+ lo: __m128i,
+ hi: __m128i,
+}
+
+impl Mask128 {
+ /// Create a new SIMD mask from the mask produced by the Teddy builder.
+ pub fn new(mask: compile::Mask) -> Mask128 {
+ // SAFETY: This is safe since [u8; 16] has the same representation
+ // as __m128i.
+ unsafe {
+ Mask128 {
+ lo: mem::transmute(mask.lo128()),
+ hi: mem::transmute(mask.hi128()),
+ }
+ }
+ }
+}
+
+/// A 256-bit mask for the low and high nybbles in a set of patterns. Each
+/// lane `j` corresponds to a bitset where the `i`th bit is set if and only if
+/// the nybble `j` is in the bucket `i` at a particular position.
+///
+/// This is slightly tweaked depending on whether Slim or Fat Teddy is being
+/// used. For Slim Teddy, the bitsets in the lower 128-bits are the same as
+/// the bitsets in the higher 128-bits, so that we can search 32 bytes at a
+/// time. (Remember, the nybbles in the haystack are used as indices into these
+/// masks, and 256-bit shuffles only operate on 128-bit lanes.)
+///
+/// For Fat Teddy, the bitsets are not repeated, but instead, the high 128
+/// bits correspond to buckets 8-15. So a bitset `00100010` has buckets
+/// 1 and 5 set if it's in the lower 128 bits, but has buckets 9 and 13 set
+/// if it's in the higher 128 bits.
+#[derive(Clone, Copy, Debug)]
+pub struct Mask256 {
+ lo: __m256i,
+ hi: __m256i,
+}
+
+impl Mask256 {
+ /// Create a new SIMD mask from the mask produced by the Teddy builder.
+ pub fn new(mask: compile::Mask) -> Mask256 {
+ // SAFETY: This is safe since [u8; 32] has the same representation
+ // as __m256i.
+ unsafe {
+ Mask256 {
+ lo: mem::transmute(mask.lo256()),
+ hi: mem::transmute(mask.hi256()),
+ }
+ }
+ }
+}
+
+// The "members" routines below are responsible for taking a chunk of bytes,
+// a number of nybble masks and returning the result of using the masks to
+// lookup bytes in the chunk. The results of the high and low nybble masks are
+// AND'ed together, such that each candidate returned is a vector, with byte
+// sized lanes, and where each lane is an 8-bit bitset corresponding to the
+// buckets that contain the corresponding byte.
+//
+// In the case of masks of length greater than 1, callers will need to keep
+// the results from the previous haystack's window, and then shift the vectors
+// so that they all line up. Then they can be AND'ed together.
+
+/// Return a candidate for Slim 128-bit Teddy, where `chunk` corresponds to a
+/// 16-byte window of the haystack (where the least significant byte
+/// corresponds to the start of the window), and `mask1` corresponds to a
+/// low/high mask for the first byte of all patterns that are being searched.
+#[target_feature(enable = "ssse3")]
+unsafe fn members1m128(chunk: __m128i, mask1: Mask128) -> __m128i {
+ let lomask = _mm_set1_epi8(0xF);
+ let hlo = _mm_and_si128(chunk, lomask);
+ let hhi = _mm_and_si128(_mm_srli_epi16(chunk, 4), lomask);
+ _mm_and_si128(
+ _mm_shuffle_epi8(mask1.lo, hlo),
+ _mm_shuffle_epi8(mask1.hi, hhi),
+ )
+}
+
+/// Return a candidate for Slim 256-bit Teddy, where `chunk` corresponds to a
+/// 32-byte window of the haystack (where the least significant byte
+/// corresponds to the start of the window), and `mask1` corresponds to a
+/// low/high mask for the first byte of all patterns that are being searched.
+///
+/// Note that this can also be used for Fat Teddy, where the high 128 bits in
+/// `chunk` are the same as the low 128 bits, which corresponds to a 16 byte
+/// window in the haystack.
+#[target_feature(enable = "avx2")]
+unsafe fn members1m256(chunk: __m256i, mask1: Mask256) -> __m256i {
+ let lomask = _mm256_set1_epi8(0xF);
+ let hlo = _mm256_and_si256(chunk, lomask);
+ let hhi = _mm256_and_si256(_mm256_srli_epi16(chunk, 4), lomask);
+ _mm256_and_si256(
+ _mm256_shuffle_epi8(mask1.lo, hlo),
+ _mm256_shuffle_epi8(mask1.hi, hhi),
+ )
+}
+
+/// Return candidates for Slim 128-bit Teddy, where `chunk` corresponds
+/// to a 16-byte window of the haystack (where the least significant byte
+/// corresponds to the start of the window), and the masks correspond to a
+/// low/high mask for the first and second bytes of all patterns that are being
+/// searched. The vectors returned correspond to candidates for the first and
+/// second bytes in the patterns represented by the masks.
+#[target_feature(enable = "ssse3")]
+unsafe fn members2m128(
+ chunk: __m128i,
+ mask1: Mask128,
+ mask2: Mask128,
+) -> (__m128i, __m128i) {
+ let lomask = _mm_set1_epi8(0xF);
+ let hlo = _mm_and_si128(chunk, lomask);
+ let hhi = _mm_and_si128(_mm_srli_epi16(chunk, 4), lomask);
+ let res0 = _mm_and_si128(
+ _mm_shuffle_epi8(mask1.lo, hlo),
+ _mm_shuffle_epi8(mask1.hi, hhi),
+ );
+ let res1 = _mm_and_si128(
+ _mm_shuffle_epi8(mask2.lo, hlo),
+ _mm_shuffle_epi8(mask2.hi, hhi),
+ );
+ (res0, res1)
+}
+
+/// Return candidates for Slim 256-bit Teddy, where `chunk` corresponds
+/// to a 32-byte window of the haystack (where the least significant byte
+/// corresponds to the start of the window), and the masks correspond to a
+/// low/high mask for the first and second bytes of all patterns that are being
+/// searched. The vectors returned correspond to candidates for the first and
+/// second bytes in the patterns represented by the masks.
+///
+/// Note that this can also be used for Fat Teddy, where the high 128 bits in
+/// `chunk` are the same as the low 128 bits, which corresponds to a 16 byte
+/// window in the haystack.
+#[target_feature(enable = "avx2")]
+unsafe fn members2m256(
+ chunk: __m256i,
+ mask1: Mask256,
+ mask2: Mask256,
+) -> (__m256i, __m256i) {
+ let lomask = _mm256_set1_epi8(0xF);
+ let hlo = _mm256_and_si256(chunk, lomask);
+ let hhi = _mm256_and_si256(_mm256_srli_epi16(chunk, 4), lomask);
+ let res0 = _mm256_and_si256(
+ _mm256_shuffle_epi8(mask1.lo, hlo),
+ _mm256_shuffle_epi8(mask1.hi, hhi),
+ );
+ let res1 = _mm256_and_si256(
+ _mm256_shuffle_epi8(mask2.lo, hlo),
+ _mm256_shuffle_epi8(mask2.hi, hhi),
+ );
+ (res0, res1)
+}
+
+/// Return candidates for Slim 128-bit Teddy, where `chunk` corresponds
+/// to a 16-byte window of the haystack (where the least significant byte
+/// corresponds to the start of the window), and the masks correspond to a
+/// low/high mask for the first, second and third bytes of all patterns that
+/// are being searched. The vectors returned correspond to candidates for the
+/// first, second and third bytes in the patterns represented by the masks.
+#[target_feature(enable = "ssse3")]
+unsafe fn members3m128(
+ chunk: __m128i,
+ mask1: Mask128,
+ mask2: Mask128,
+ mask3: Mask128,
+) -> (__m128i, __m128i, __m128i) {
+ let lomask = _mm_set1_epi8(0xF);
+ let hlo = _mm_and_si128(chunk, lomask);
+ let hhi = _mm_and_si128(_mm_srli_epi16(chunk, 4), lomask);
+ let res0 = _mm_and_si128(
+ _mm_shuffle_epi8(mask1.lo, hlo),
+ _mm_shuffle_epi8(mask1.hi, hhi),
+ );
+ let res1 = _mm_and_si128(
+ _mm_shuffle_epi8(mask2.lo, hlo),
+ _mm_shuffle_epi8(mask2.hi, hhi),
+ );
+ let res2 = _mm_and_si128(
+ _mm_shuffle_epi8(mask3.lo, hlo),
+ _mm_shuffle_epi8(mask3.hi, hhi),
+ );
+ (res0, res1, res2)
+}
+
+/// Return candidates for Slim 256-bit Teddy, where `chunk` corresponds
+/// to a 32-byte window of the haystack (where the least significant byte
+/// corresponds to the start of the window), and the masks correspond to a
+/// low/high mask for the first, second and third bytes of all patterns that
+/// are being searched. The vectors returned correspond to candidates for the
+/// first, second and third bytes in the patterns represented by the masks.
+///
+/// Note that this can also be used for Fat Teddy, where the high 128 bits in
+/// `chunk` are the same as the low 128 bits, which corresponds to a 16-byte
+/// window in the haystack.
+#[target_feature(enable = "avx2")]
+unsafe fn members3m256(
+ chunk: __m256i,
+ mask1: Mask256,
+ mask2: Mask256,
+ mask3: Mask256,
+) -> (__m256i, __m256i, __m256i) {
+ let lomask = _mm256_set1_epi8(0xF);
+ let hlo = _mm256_and_si256(chunk, lomask);
+ let hhi = _mm256_and_si256(_mm256_srli_epi16(chunk, 4), lomask);
+ let res0 = _mm256_and_si256(
+ _mm256_shuffle_epi8(mask1.lo, hlo),
+ _mm256_shuffle_epi8(mask1.hi, hhi),
+ );
+ let res1 = _mm256_and_si256(
+ _mm256_shuffle_epi8(mask2.lo, hlo),
+ _mm256_shuffle_epi8(mask2.hi, hhi),
+ );
+ let res2 = _mm256_and_si256(
+ _mm256_shuffle_epi8(mask3.lo, hlo),
+ _mm256_shuffle_epi8(mask3.hi, hhi),
+ );
+ (res0, res1, res2)
+}
diff --git a/src/packed/tests.rs b/src/packed/tests.rs
new file mode 100644
index 0000000..a384396
--- /dev/null
+++ b/src/packed/tests.rs
@@ -0,0 +1,568 @@
+use std::collections::HashMap;
+use std::usize;
+
+use packed::{Config, MatchKind};
+use Match;
+
+/// A description of a single test against a multi-pattern searcher.
+///
+/// A single test may not necessarily pass on every configuration of a
+/// searcher. The tests are categorized and grouped appropriately below.
+#[derive(Clone, Debug, Eq, PartialEq)]
+struct SearchTest {
+ /// The name of this test, for debugging.
+ name: &'static str,
+ /// The patterns to search for.
+ patterns: &'static [&'static str],
+ /// The text to search.
+ haystack: &'static str,
+ /// Each match is a triple of (pattern_index, start, end), where
+ /// pattern_index is an index into `patterns` and `start`/`end` are indices
+ /// into `haystack`.
+ matches: &'static [(usize, usize, usize)],
+}
+
+struct SearchTestOwned {
+ offset: usize,
+ name: String,
+ patterns: Vec<String>,
+ haystack: String,
+ matches: Vec<(usize, usize, usize)>,
+}
+
+impl SearchTest {
+ fn variations(&self) -> Vec<SearchTestOwned> {
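+        // Pad each test with 'Z' bytes at varying offsets (as a prefix, as a
+        // suffix and as both) so that searches run over many different
+        // haystack lengths and alignments. This helps exercise boundary
+        // handling, particularly in the vectorized Teddy searchers.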
+ let mut tests = vec![];
+ for i in 0..=260 {
+ tests.push(self.offset_prefix(i));
+ tests.push(self.offset_suffix(i));
+ tests.push(self.offset_both(i));
+ }
+ tests
+ }
+
+ fn offset_both(&self, off: usize) -> SearchTestOwned {
+ SearchTestOwned {
+ offset: off,
+ name: self.name.to_string(),
+ patterns: self.patterns.iter().map(|s| s.to_string()).collect(),
+ haystack: format!(
+ "{}{}{}",
+ "Z".repeat(off),
+ self.haystack,
+ "Z".repeat(off)
+ ),
+ matches: self
+ .matches
+ .iter()
+ .map(|&(id, s, e)| (id, s + off, e + off))
+ .collect(),
+ }
+ }
+
+ fn offset_prefix(&self, off: usize) -> SearchTestOwned {
+ SearchTestOwned {
+ offset: off,
+ name: self.name.to_string(),
+ patterns: self.patterns.iter().map(|s| s.to_string()).collect(),
+ haystack: format!("{}{}", "Z".repeat(off), self.haystack),
+ matches: self
+ .matches
+ .iter()
+ .map(|&(id, s, e)| (id, s + off, e + off))
+ .collect(),
+ }
+ }
+
+ fn offset_suffix(&self, off: usize) -> SearchTestOwned {
+ SearchTestOwned {
+ offset: off,
+ name: self.name.to_string(),
+ patterns: self.patterns.iter().map(|s| s.to_string()).collect(),
+ haystack: format!("{}{}", self.haystack, "Z".repeat(off)),
+ matches: self.matches.to_vec(),
+ }
+ }
+
+ // fn to_owned(&self) -> SearchTestOwned {
+ // SearchTestOwned {
+ // name: self.name.to_string(),
+ // patterns: self.patterns.iter().map(|s| s.to_string()).collect(),
+ // haystack: self.haystack.to_string(),
+ // matches: self.matches.iter().cloned().collect(),
+ // }
+ // }
+}
+
+/// Short-hand constructor for SearchTest. We use it a lot below.
+macro_rules! t {
+ ($name:ident, $patterns:expr, $haystack:expr, $matches:expr) => {
+ SearchTest {
+ name: stringify!($name),
+ patterns: $patterns,
+ haystack: $haystack,
+ matches: $matches,
+ }
+ };
+}
+
+/// A collection of test groups.
+type TestCollection = &'static [&'static [SearchTest]];
+
+// Define several collections corresponding to the different types of match
+// semantics supported. These collections have some overlap, but each
+// collection should have some tests that no other collection has.
+
+/// Tests for leftmost-first match semantics.
+const PACKED_LEFTMOST_FIRST: TestCollection =
+ &[BASICS, LEFTMOST, LEFTMOST_FIRST, REGRESSION, TEDDY];
+
+/// Tests for leftmost-longest match semantics.
+const PACKED_LEFTMOST_LONGEST: TestCollection =
+ &[BASICS, LEFTMOST, LEFTMOST_LONGEST, REGRESSION, TEDDY];
+
+// Now define the individual tests that make up the collections above.
+
+/// A collection of tests that should always be true regardless of
+/// match semantics. That is, all combinations of leftmost-{first, longest}
+/// should produce the same answer.
+const BASICS: &'static [SearchTest] = &[
+ t!(basic001, &["a"], "", &[]),
+ t!(basic010, &["a"], "a", &[(0, 0, 1)]),
+ t!(basic020, &["a"], "aa", &[(0, 0, 1), (0, 1, 2)]),
+ t!(basic030, &["a"], "aaa", &[(0, 0, 1), (0, 1, 2), (0, 2, 3)]),
+ t!(basic040, &["a"], "aba", &[(0, 0, 1), (0, 2, 3)]),
+ t!(basic050, &["a"], "bba", &[(0, 2, 3)]),
+ t!(basic060, &["a"], "bbb", &[]),
+ t!(basic070, &["a"], "bababbbba", &[(0, 1, 2), (0, 3, 4), (0, 8, 9)]),
+ t!(basic100, &["aa"], "", &[]),
+ t!(basic110, &["aa"], "aa", &[(0, 0, 2)]),
+ t!(basic120, &["aa"], "aabbaa", &[(0, 0, 2), (0, 4, 6)]),
+ t!(basic130, &["aa"], "abbab", &[]),
+ t!(basic140, &["aa"], "abbabaa", &[(0, 5, 7)]),
+ t!(basic150, &["aaa"], "aaa", &[(0, 0, 3)]),
+ t!(basic200, &["abc"], "abc", &[(0, 0, 3)]),
+ t!(basic210, &["abc"], "zazabzabcz", &[(0, 6, 9)]),
+ t!(basic220, &["abc"], "zazabczabcz", &[(0, 3, 6), (0, 7, 10)]),
+ t!(basic300, &["a", "b"], "", &[]),
+ t!(basic310, &["a", "b"], "z", &[]),
+ t!(basic320, &["a", "b"], "b", &[(1, 0, 1)]),
+ t!(basic330, &["a", "b"], "a", &[(0, 0, 1)]),
+ t!(
+ basic340,
+ &["a", "b"],
+ "abba",
+ &[(0, 0, 1), (1, 1, 2), (1, 2, 3), (0, 3, 4),]
+ ),
+ t!(
+ basic350,
+ &["b", "a"],
+ "abba",
+ &[(1, 0, 1), (0, 1, 2), (0, 2, 3), (1, 3, 4),]
+ ),
+ t!(basic360, &["abc", "bc"], "xbc", &[(1, 1, 3),]),
+ t!(basic400, &["foo", "bar"], "", &[]),
+ t!(basic410, &["foo", "bar"], "foobar", &[(0, 0, 3), (1, 3, 6),]),
+ t!(basic420, &["foo", "bar"], "barfoo", &[(1, 0, 3), (0, 3, 6),]),
+ t!(basic430, &["foo", "bar"], "foofoo", &[(0, 0, 3), (0, 3, 6),]),
+ t!(basic440, &["foo", "bar"], "barbar", &[(1, 0, 3), (1, 3, 6),]),
+ t!(basic450, &["foo", "bar"], "bafofoo", &[(0, 4, 7),]),
+ t!(basic460, &["bar", "foo"], "bafofoo", &[(1, 4, 7),]),
+ t!(basic470, &["foo", "bar"], "fobabar", &[(1, 4, 7),]),
+ t!(basic480, &["bar", "foo"], "fobabar", &[(0, 4, 7),]),
+ t!(basic700, &["yabcdef", "abcdezghi"], "yabcdefghi", &[(0, 0, 7),]),
+ t!(basic710, &["yabcdef", "abcdezghi"], "yabcdezghi", &[(1, 1, 10),]),
+ t!(
+ basic720,
+ &["yabcdef", "bcdeyabc", "abcdezghi"],
+ "yabcdezghi",
+ &[(2, 1, 10),]
+ ),
+ t!(basic810, &["abcd", "bcd", "cd"], "abcd", &[(0, 0, 4),]),
+ t!(basic820, &["bcd", "cd", "abcd"], "abcd", &[(2, 0, 4),]),
+ t!(basic830, &["abc", "bc"], "zazabcz", &[(0, 3, 6),]),
+ t!(
+ basic840,
+ &["ab", "ba"],
+ "abababa",
+ &[(0, 0, 2), (0, 2, 4), (0, 4, 6),]
+ ),
+ t!(basic850, &["foo", "foo"], "foobarfoo", &[(0, 0, 3), (0, 6, 9),]),
+];
+
+/// Tests for leftmost match semantics. These should pass for both
+/// leftmost-first and leftmost-longest match kinds. Stated differently, among
+/// ambiguous matches, the longest match and the match that appeared first when
+/// constructing the automaton should always be the same.
+const LEFTMOST: &'static [SearchTest] = &[
+ t!(leftmost000, &["ab", "ab"], "abcd", &[(0, 0, 2)]),
+ t!(leftmost030, &["a", "ab"], "aa", &[(0, 0, 1), (0, 1, 2)]),
+ t!(leftmost031, &["ab", "a"], "aa", &[(1, 0, 1), (1, 1, 2)]),
+ t!(leftmost032, &["ab", "a"], "xayabbbz", &[(1, 1, 2), (0, 3, 5)]),
+ t!(leftmost300, &["abcd", "bce", "b"], "abce", &[(1, 1, 4)]),
+ t!(leftmost310, &["abcd", "ce", "bc"], "abce", &[(2, 1, 3)]),
+ t!(leftmost320, &["abcd", "bce", "ce", "b"], "abce", &[(1, 1, 4)]),
+ t!(leftmost330, &["abcd", "bce", "cz", "bc"], "abcz", &[(3, 1, 3)]),
+ t!(leftmost340, &["bce", "cz", "bc"], "bcz", &[(2, 0, 2)]),
+ t!(leftmost350, &["abc", "bd", "ab"], "abd", &[(2, 0, 2)]),
+ t!(
+ leftmost360,
+ &["abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(2, 0, 8),]
+ ),
+ t!(
+ leftmost370,
+ &["abcdefghi", "cde", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8),]
+ ),
+ t!(
+ leftmost380,
+ &["abcdefghi", "hz", "abcdefgh", "a"],
+ "abcdefghz",
+ &[(2, 0, 8),]
+ ),
+ t!(
+ leftmost390,
+ &["b", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8),]
+ ),
+ t!(
+ leftmost400,
+ &["h", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8),]
+ ),
+ t!(
+ leftmost410,
+ &["z", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8), (0, 8, 9),]
+ ),
+];
+
+/// Tests for non-overlapping leftmost-first match semantics. These tests
+/// should generally be specific to leftmost-first, which means they should
+/// generally fail under leftmost-longest semantics.
+const LEFTMOST_FIRST: &'static [SearchTest] = &[
+ t!(leftfirst000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]),
+ t!(leftfirst020, &["abcd", "ab"], "abcd", &[(0, 0, 4)]),
+ t!(leftfirst030, &["ab", "ab"], "abcd", &[(0, 0, 2)]),
+ t!(leftfirst040, &["a", "ab"], "xayabbbz", &[(0, 1, 2), (0, 3, 4)]),
+ t!(leftfirst100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[(1, 1, 5)]),
+ t!(leftfirst110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[(1, 1, 6)]),
+ t!(leftfirst300, &["abcd", "b", "bce"], "abce", &[(1, 1, 2)]),
+ t!(
+ leftfirst310,
+ &["abcd", "b", "bce", "ce"],
+ "abce",
+ &[(1, 1, 2), (3, 2, 4),]
+ ),
+ t!(
+ leftfirst320,
+ &["a", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(0, 0, 1), (2, 7, 9),]
+ ),
+ t!(leftfirst330, &["a", "abab"], "abab", &[(0, 0, 1), (0, 2, 3)]),
+ t!(
+ leftfirst340,
+ &["abcdef", "x", "x", "x", "x", "x", "x", "abcde"],
+ "abcdef",
+ &[(0, 0, 6)]
+ ),
+];
+
+/// Tests for non-overlapping leftmost-longest match semantics. These tests
+/// should generally be specific to leftmost-longest, which means they should
+/// generally fail under leftmost-first semantics.
+const LEFTMOST_LONGEST: &'static [SearchTest] = &[
+ t!(leftlong000, &["ab", "abcd"], "abcd", &[(1, 0, 4)]),
+ t!(leftlong010, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4),]),
+ t!(leftlong040, &["a", "ab"], "a", &[(0, 0, 1)]),
+ t!(leftlong050, &["a", "ab"], "ab", &[(1, 0, 2)]),
+ t!(leftlong060, &["ab", "a"], "a", &[(1, 0, 1)]),
+ t!(leftlong070, &["ab", "a"], "ab", &[(0, 0, 2)]),
+ t!(leftlong100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[(2, 1, 6)]),
+ t!(leftlong110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[(1, 1, 6)]),
+ t!(leftlong300, &["abcd", "b", "bce"], "abce", &[(2, 1, 4)]),
+ t!(
+ leftlong310,
+ &["a", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8),]
+ ),
+ t!(leftlong320, &["a", "abab"], "abab", &[(1, 0, 4)]),
+ t!(leftlong330, &["abcd", "b", "ce"], "abce", &[(1, 1, 2), (2, 2, 4),]),
+ t!(leftlong340, &["a", "ab"], "xayabbbz", &[(0, 1, 2), (1, 3, 5)]),
+];
+
+/// Regression tests that are applied to all combinations.
+///
+/// If regression tests are needed for specific match semantics, then add them
+/// to the appropriate group above.
+const REGRESSION: &'static [SearchTest] = &[
+ t!(regression010, &["inf", "ind"], "infind", &[(0, 0, 3), (1, 3, 6),]),
+ t!(regression020, &["ind", "inf"], "infind", &[(1, 0, 3), (0, 3, 6),]),
+ t!(
+ regression030,
+ &["libcore/", "libstd/"],
+ "libcore/char/methods.rs",
+ &[(0, 0, 8),]
+ ),
+ t!(
+ regression040,
+ &["libstd/", "libcore/"],
+ "libcore/char/methods.rs",
+ &[(1, 0, 8),]
+ ),
+ t!(
+ regression050,
+ &["\x00\x00\x01", "\x00\x00\x00"],
+ "\x00\x00\x00",
+ &[(1, 0, 3),]
+ ),
+ t!(
+ regression060,
+ &["\x00\x00\x00", "\x00\x00\x01"],
+ "\x00\x00\x00",
+ &[(0, 0, 3),]
+ ),
+];
+
+const TEDDY: &'static [SearchTest] = &[
+ t!(
+ teddy010,
+ &["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"],
+ "abcdefghijk",
+ &[
+ (0, 0, 1),
+ (1, 1, 2),
+ (2, 2, 3),
+ (3, 3, 4),
+ (4, 4, 5),
+ (5, 5, 6),
+ (6, 6, 7),
+ (7, 7, 8),
+ (8, 8, 9),
+ (9, 9, 10),
+ (10, 10, 11)
+ ]
+ ),
+ t!(
+ teddy020,
+ &["ab", "bc", "cd", "de", "ef", "fg", "gh", "hi", "ij", "jk", "kl"],
+ "abcdefghijk",
+ &[(0, 0, 2), (2, 2, 4), (4, 4, 6), (6, 6, 8), (8, 8, 10),]
+ ),
+ t!(
+ teddy030,
+ &["abc"],
+ "abcdefghijklmnopqrstuvwxyzabcdefghijk",
+ &[(0, 0, 3), (0, 26, 29)]
+ ),
+];
+
+// Now define a test for each combination of things above that we want to run.
+// Since there are a few different combinations for each collection of tests,
+// we define a macro to avoid repetition drudgery. The testconfig macro
+// constructs the searcher for a given match kind and runs the search tests
+// one-by-one over the given collection. The `with` parameter allows one to
+// adjust the Config with additional settings before the searcher is built.
+// Each testconfig invocation below turns a different knob on Config.
+
+macro_rules! testconfig {
+ ($name:ident, $collection:expr, $with:expr) => {
+ #[test]
+ fn $name() {
+ run_search_tests($collection, |test| {
+ let mut config = Config::new();
+ $with(&mut config);
+ config
+ .builder()
+ .extend(test.patterns.iter().map(|p| p.as_bytes()))
+ .build()
+ .unwrap()
+ .find_iter(&test.haystack)
+ .collect()
+ });
+ }
+ };
+}
+
+#[cfg(target_arch = "x86_64")]
+testconfig!(
+ search_default_leftmost_first,
+ PACKED_LEFTMOST_FIRST,
+ |_: &mut Config| {}
+);
+
+#[cfg(target_arch = "x86_64")]
+testconfig!(
+ search_default_leftmost_longest,
+ PACKED_LEFTMOST_LONGEST,
+ |c: &mut Config| {
+ c.match_kind(MatchKind::LeftmostLongest);
+ }
+);
+
+#[cfg(target_arch = "x86_64")]
+testconfig!(
+ search_teddy_leftmost_first,
+ PACKED_LEFTMOST_FIRST,
+ |c: &mut Config| {
+ c.force_teddy(true);
+ }
+);
+
+#[cfg(target_arch = "x86_64")]
+testconfig!(
+ search_teddy_leftmost_longest,
+ PACKED_LEFTMOST_LONGEST,
+ |c: &mut Config| {
+ c.force_teddy(true).match_kind(MatchKind::LeftmostLongest);
+ }
+);
+
+#[cfg(target_arch = "x86_64")]
+testconfig!(
+ search_teddy_ssse3_leftmost_first,
+ PACKED_LEFTMOST_FIRST,
+ |c: &mut Config| {
+ c.force_teddy(true);
+ if is_x86_feature_detected!("ssse3") {
+ c.force_avx(Some(false));
+ }
+ }
+);
+
+#[cfg(target_arch = "x86_64")]
+testconfig!(
+ search_teddy_ssse3_leftmost_longest,
+ PACKED_LEFTMOST_LONGEST,
+ |c: &mut Config| {
+ c.force_teddy(true).match_kind(MatchKind::LeftmostLongest);
+ if is_x86_feature_detected!("ssse3") {
+ c.force_avx(Some(false));
+ }
+ }
+);
+
+#[cfg(target_arch = "x86_64")]
+testconfig!(
+ search_teddy_avx2_leftmost_first,
+ PACKED_LEFTMOST_FIRST,
+ |c: &mut Config| {
+ c.force_teddy(true);
+ if is_x86_feature_detected!("avx2") {
+ c.force_avx(Some(true));
+ }
+ }
+);
+
+#[cfg(target_arch = "x86_64")]
+testconfig!(
+ search_teddy_avx2_leftmost_longest,
+ PACKED_LEFTMOST_LONGEST,
+ |c: &mut Config| {
+ c.force_teddy(true).match_kind(MatchKind::LeftmostLongest);
+ if is_x86_feature_detected!("avx2") {
+ c.force_avx(Some(true));
+ }
+ }
+);
+
+#[cfg(target_arch = "x86_64")]
+testconfig!(
+ search_teddy_fat_leftmost_first,
+ PACKED_LEFTMOST_FIRST,
+ |c: &mut Config| {
+ c.force_teddy(true);
+ if is_x86_feature_detected!("avx2") {
+ c.force_teddy_fat(Some(true));
+ }
+ }
+);
+
+#[cfg(target_arch = "x86_64")]
+testconfig!(
+ search_teddy_fat_leftmost_longest,
+ PACKED_LEFTMOST_LONGEST,
+ |c: &mut Config| {
+ c.force_teddy(true).match_kind(MatchKind::LeftmostLongest);
+ if is_x86_feature_detected!("avx2") {
+ c.force_teddy_fat(Some(true));
+ }
+ }
+);
+
+testconfig!(
+ search_rabinkarp_leftmost_first,
+ PACKED_LEFTMOST_FIRST,
+ |c: &mut Config| {
+ c.force_rabin_karp(true);
+ }
+);
+
+testconfig!(
+ search_rabinkarp_leftmost_longest,
+ PACKED_LEFTMOST_LONGEST,
+ |c: &mut Config| {
+ c.force_rabin_karp(true).match_kind(MatchKind::LeftmostLongest);
+ }
+);
+
+#[test]
+fn search_tests_have_unique_names() {
+ let assert = |constname, tests: &[SearchTest]| {
+ let mut seen = HashMap::new(); // map from test name to position
+ for (i, test) in tests.iter().enumerate() {
+ if !seen.contains_key(test.name) {
+ seen.insert(test.name, i);
+ } else {
+ let last = seen[test.name];
+ panic!(
+ "{} tests have duplicate names at positions {} and {}",
+ constname, last, i
+ );
+ }
+ }
+ };
+ assert("BASICS", BASICS);
+ assert("LEFTMOST", LEFTMOST);
+ assert("LEFTMOST_FIRST", LEFTMOST_FIRST);
+ assert("LEFTMOST_LONGEST", LEFTMOST_LONGEST);
+ assert("REGRESSION", REGRESSION);
+ assert("TEDDY", TEDDY);
+}
+
+fn run_search_tests<F: FnMut(&SearchTestOwned) -> Vec<Match>>(
+ which: TestCollection,
+ mut f: F,
+) {
+ let get_match_triples =
+ |matches: Vec<Match>| -> Vec<(usize, usize, usize)> {
+ matches
+ .into_iter()
+ .map(|m| (m.pattern(), m.start(), m.end()))
+ .collect()
+ };
+ for &tests in which {
+ for spec in tests {
+ for test in spec.variations() {
+ assert_eq!(
+ test.matches,
+ get_match_triples(f(&test)).as_slice(),
+ "test: {}, patterns: {:?}, haystack: {:?}, offset: {:?}",
+ test.name,
+ test.patterns,
+ test.haystack,
+ test.offset,
+ );
+ }
+ }
+ }
+}
diff --git a/src/packed/vector.rs b/src/packed/vector.rs
new file mode 100644
index 0000000..ca6c2b0
--- /dev/null
+++ b/src/packed/vector.rs
@@ -0,0 +1,181 @@
+// This file contains a set of fairly generic utility functions when working
+// with SIMD vectors.
+//
+// SAFETY: All of the routines below are unsafe to call because they assume
+// the necessary CPU target features in order to use particular vendor
+// intrinsics. Calling these routines when the underlying CPU does not support
+// the appropriate target features is NOT safe. Callers must ensure this
+// themselves.
+//
+// Note that it may not look like this safety invariant is being upheld when
+// these routines are called. Namely, the CPU feature check is typically pretty
+// far away from when these routines are used. Instead, we rely on the fact
+// that certain types serve as a guaranteed receipt that pertinent target
+// features are enabled. For example, the only way TeddySlim3Mask256 can be
+// constructed is if the AVX2 CPU feature is available. Thus, any code running
+// inside of TeddySlim3Mask256 can use any of the functions below without any
+// additional checks: its very existence *is* the check.
+
+use std::arch::x86_64::*;
+
+/// Shift `a` to the left by two bytes (removing its two most significant
+/// bytes), and concatenate it with the two most significant bytes of `b`.
+#[target_feature(enable = "avx2")]
+pub unsafe fn alignr256_14(a: __m256i, b: __m256i) -> __m256i {
+ // Credit goes to jneem for figuring this out:
+ // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184
+ //
+ // TL;DR avx2's PALIGNR instruction is actually just two 128-bit PALIGNR
+ // instructions, which is not what we want, so we need to do some extra
+ // shuffling.
+
+ // This permute gives us the low 16 bytes of a concatenated with the high
+ // 16 bytes of b, in order of most significant to least significant. So
+ // `v = a[15:0] b[31:16]`.
+ let v = _mm256_permute2x128_si256(b, a, 0x21);
+ // This effectively does this (where we deal in terms of byte-indexing
+ // and byte-shifting, and use inclusive ranges):
+ //
+ // ret[15:0] := ((a[15:0] << 16) | v[15:0]) >> 14
+ // = ((a[15:0] << 16) | b[31:16]) >> 14
+ // ret[31:16] := ((a[31:16] << 16) | v[31:16]) >> 14
+ // = ((a[31:16] << 16) | a[15:0]) >> 14
+ //
+ // Which therefore results in:
+ //
+ // ret[31:0] := a[29:16] a[15:14] a[13:0] b[31:30]
+ //
+ // The end result is that we've effectively done this:
+ //
+ // (a << 2) | (b >> 30)
+ //
+ // When `A` and `B` are strings---where the beginning of the string is in
+ // the least significant bits---we effectively result in the following
+ // semantic operation:
+ //
+ // (A >> 2) | (B << 30)
+ //
+ // The reversal being attributed to the fact that we are in little-endian.
+ _mm256_alignr_epi8(a, v, 14)
+}
+
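+// A byte-oriented sketch of the operation above, modeling the 256-bit vectors
+// as 32-byte arrays where index 0 is the least significant (start-of-string)
+// byte: the result is the last two bytes of `b` followed by the first 30
+// bytes of `a`. This exists only to illustrate the semantics and is not used
+// at runtime.
+#[allow(dead_code)]
+fn alignr256_14_scalar(a: &[u8; 32], b: &[u8; 32]) -> [u8; 32] {
+    let mut out = [0u8; 32];
+    out[0] = b[30];
+    out[1] = b[31];
+    out[2..].copy_from_slice(&a[..30]);
+    out
+}
+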
+/// Shift `a` to the left by one byte (removing its most significant byte), and
+/// concatenate it with the most significant byte of `b`.
+#[target_feature(enable = "avx2")]
+pub unsafe fn alignr256_15(a: __m256i, b: __m256i) -> __m256i {
+ // For explanation, see alignr256_14.
+ let v = _mm256_permute2x128_si256(b, a, 0x21);
+ _mm256_alignr_epi8(a, v, 15)
+}
+
+/// Unpack the given 128-bit vector into its 64-bit components. The first
+/// element of the array returned corresponds to the least significant 64-bit
+/// lane in `a`.
+#[target_feature(enable = "ssse3")]
+pub unsafe fn unpack64x128(a: __m128i) -> [u64; 2] {
+ [
+ _mm_cvtsi128_si64(a) as u64,
+ _mm_cvtsi128_si64(_mm_srli_si128(a, 8)) as u64,
+ ]
+}
+
+/// Unpack the given 256-bit vector into its 64-bit components. The first
+/// element of the array returned corresponds to the least significant 64-bit
+/// lane in `a`.
+#[target_feature(enable = "avx2")]
+pub unsafe fn unpack64x256(a: __m256i) -> [u64; 4] {
+ // Using transmute here is precisely equivalent, but actually slower. It's
+ // not quite clear why.
+ let lo = _mm256_extracti128_si256(a, 0);
+ let hi = _mm256_extracti128_si256(a, 1);
+ [
+ _mm_cvtsi128_si64(lo) as u64,
+ _mm_cvtsi128_si64(_mm_srli_si128(lo, 8)) as u64,
+ _mm_cvtsi128_si64(hi) as u64,
+ _mm_cvtsi128_si64(_mm_srli_si128(hi, 8)) as u64,
+ ]
+}
+
+/// Unpack the low 128 bits of `a` and `b`, and return them as 4 64-bit
+/// integers.
+///
+/// More precisely, if a = a4 a3 a2 a1 and b = b4 b3 b2 b1, where each element
+/// is a 64-bit integer and a1/b1 correspond to the least significant 64 bits,
+/// then the return value is `b2 b1 a2 a1`.
+#[target_feature(enable = "avx2")]
+pub unsafe fn unpacklo64x256(a: __m256i, b: __m256i) -> [u64; 4] {
+ let lo = _mm256_castsi256_si128(a);
+ let hi = _mm256_castsi256_si128(b);
+ [
+ _mm_cvtsi128_si64(lo) as u64,
+ _mm_cvtsi128_si64(_mm_srli_si128(lo, 8)) as u64,
+ _mm_cvtsi128_si64(hi) as u64,
+ _mm_cvtsi128_si64(_mm_srli_si128(hi, 8)) as u64,
+ ]
+}
+
+/// Returns true if and only if all bits in the given 128-bit vector are 0.
+#[target_feature(enable = "ssse3")]
+pub unsafe fn is_all_zeroes128(a: __m128i) -> bool {
+ let cmp = _mm_cmpeq_epi8(a, zeroes128());
+ _mm_movemask_epi8(cmp) as u32 == 0xFFFF
+}
+
+/// Returns true if and only if all bits in the given 256-bit vector are 0.
+#[target_feature(enable = "avx2")]
+pub unsafe fn is_all_zeroes256(a: __m256i) -> bool {
+ let cmp = _mm256_cmpeq_epi8(a, zeroes256());
+ _mm256_movemask_epi8(cmp) as u32 == 0xFFFFFFFF
+}
+
+/// Load a 128-bit vector from the given slice at the given position. The
+/// slice does not need to be aligned.
+///
+/// Since this code assumes little-endian (there is no big-endian x86), the
+/// bytes starting in `slice[at..]` will be at the least significant bits of
+/// the returned vector. This is important for the surrounding code, since for
+/// example, shifting the resulting vector right is equivalent to logically
+/// shifting the bytes in `slice` left.
+#[target_feature(enable = "sse2")]
+pub unsafe fn loadu128(slice: &[u8], at: usize) -> __m128i {
+ let ptr = slice.get_unchecked(at..).as_ptr();
+ _mm_loadu_si128(ptr as *const u8 as *const __m128i)
+}
+
+/// Load a 256-bit vector from the given slice at the given position. The
+/// slice does not need to be aligned.
+///
+/// Since this code assumes little-endian (there is no big-endian x86), the
+/// bytes starting in `slice[at..]` will be at the least significant bits of
+/// the returned vector. This is important for the surrounding code, since for
+/// example, shifting the resulting vector right is equivalent to logically
+/// shifting the bytes in `slice` left.
+#[target_feature(enable = "avx2")]
+pub unsafe fn loadu256(slice: &[u8], at: usize) -> __m256i {
+ let ptr = slice.get_unchecked(at..).as_ptr();
+ _mm256_loadu_si256(ptr as *const u8 as *const __m256i)
+}
+
+/// Returns a 128-bit vector with all bits set to 0.
+#[target_feature(enable = "sse2")]
+pub unsafe fn zeroes128() -> __m128i {
+ _mm_set1_epi8(0)
+}
+
+/// Returns a 256-bit vector with all bits set to 0.
+#[target_feature(enable = "avx2")]
+pub unsafe fn zeroes256() -> __m256i {
+ _mm256_set1_epi8(0)
+}
+
+/// Returns a 128-bit vector with all bits set to 1.
+#[target_feature(enable = "sse2")]
+pub unsafe fn ones128() -> __m128i {
+ _mm_set1_epi8(0xFF as u8 as i8)
+}
+
+/// Returns a 256-bit vector with all bits set to 1.
+#[target_feature(enable = "avx2")]
+pub unsafe fn ones256() -> __m256i {
+ _mm256_set1_epi8(0xFF as u8 as i8)
+}
diff --git a/src/prefilter.rs b/src/prefilter.rs
new file mode 100644
index 0000000..bda215d
--- /dev/null
+++ b/src/prefilter.rs
@@ -0,0 +1,997 @@
+use std::cmp;
+use std::fmt;
+use std::panic::{RefUnwindSafe, UnwindSafe};
+use std::u8;
+
+use memchr::{memchr, memchr2, memchr3};
+
+use ahocorasick::MatchKind;
+use packed;
+use Match;
+
+/// A candidate is the result of running a prefilter on a haystack at a
+/// particular position. The result is either no match, a confirmed match or
+/// a possible match.
+///
+/// When no match is returned, the prefilter is guaranteeing that no possible
+/// match can be found in the haystack, and the caller may trust this. That is,
+/// all correct prefilters must never report false negatives.
+///
+/// In some cases, a prefilter can confirm a match very quickly, in which case,
+/// the caller may use this to stop what it's doing and report the match. In
+/// this case, prefilter implementations must never report a false positive.
+/// In other cases, the prefilter can only report a potential match, in which
+/// case the callers must attempt to confirm the match. In this case, prefilter
+/// implementations are permitted to return false positives.
+#[derive(Clone, Debug)]
+pub enum Candidate {
+ None,
+ Match(Match),
+ PossibleStartOfMatch(usize),
+}
+
+impl Candidate {
+ /// Convert this candidate into an option. This is useful when callers
+ /// do not distinguish between true positives and false positives (i.e.,
+ /// the caller must always confirm the match in order to update some other
+ /// state).
+ pub fn into_option(self) -> Option<usize> {
+ match self {
+ Candidate::None => None,
+ Candidate::Match(ref m) => Some(m.start()),
+ Candidate::PossibleStartOfMatch(start) => Some(start),
+ }
+ }
+}
+
+/// A prefilter describes the behavior of fast literal scanners for quickly
+/// skipping past bytes in the haystack that we know cannot possibly
+/// participate in a match.
+pub trait Prefilter:
+ Send + Sync + RefUnwindSafe + UnwindSafe + fmt::Debug
+{
+ /// Returns the next possible match candidate. This may yield false
+ /// positives, so callers must confirm a match starting at the position
+ /// returned. This, however, must never produce false negatives. That is,
+ /// this must, at minimum, return the starting position of the next match
+ /// in the given haystack after or at the given position.
+ fn next_candidate(
+ &self,
+ state: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ ) -> Candidate;
+
+    /// A method for cloning a prefilter, to work around the fact that Clone
+ /// is not object-safe.
+ fn clone_prefilter(&self) -> Box<dyn Prefilter>;
+
+ /// Returns the approximate total amount of heap used by this prefilter, in
+ /// units of bytes.
+ fn heap_bytes(&self) -> usize;
+
+ /// Returns true if and only if this prefilter never returns false
+ /// positives. This is useful for completely avoiding the automaton
+ /// when the prefilter can quickly confirm its own matches.
+ ///
+ /// By default, this returns true, which is conservative; it is always
+ /// correct to return `true`. Returning `false` here and reporting a false
+ /// positive will result in incorrect searches.
+ fn reports_false_positives(&self) -> bool {
+ true
+ }
+}
+
+impl<'a, P: Prefilter + ?Sized> Prefilter for &'a P {
+ #[inline]
+ fn next_candidate(
+ &self,
+ state: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ ) -> Candidate {
+ (**self).next_candidate(state, haystack, at)
+ }
+
+ fn clone_prefilter(&self) -> Box<dyn Prefilter> {
+ (**self).clone_prefilter()
+ }
+
+ fn heap_bytes(&self) -> usize {
+ (**self).heap_bytes()
+ }
+
+ fn reports_false_positives(&self) -> bool {
+ (**self).reports_false_positives()
+ }
+}
+
+/// A convenience object for representing any type that implements Prefilter
+/// and is cloneable.
+#[derive(Debug)]
+pub struct PrefilterObj(Box<dyn Prefilter>);
+
+impl Clone for PrefilterObj {
+ fn clone(&self) -> Self {
+ PrefilterObj(self.0.clone_prefilter())
+ }
+}
+
+impl PrefilterObj {
+ /// Create a new prefilter object.
+ pub fn new<T: Prefilter + 'static>(t: T) -> PrefilterObj {
+ PrefilterObj(Box::new(t))
+ }
+
+ /// Return the underlying prefilter trait object.
+ pub fn as_ref(&self) -> &dyn Prefilter {
+ &*self.0
+ }
+}
+
+/// PrefilterState tracks state associated with the effectiveness of a
+/// prefilter. It is used to track how many bytes, on average, are skipped by
+/// the prefilter. If this average dips below a certain threshold over time,
+/// then the state renders the prefilter inert and stops using it.
+///
+/// A prefilter state should be created for each search. (Where creating an
+/// iterator via, e.g., `find_iter`, is treated as a single search.)
+#[derive(Clone, Debug)]
+pub struct PrefilterState {
+ /// The number of skips that has been executed.
+ skips: usize,
+ /// The total number of bytes that have been skipped.
+ skipped: usize,
+ /// The maximum length of a match. This is used to help determine how many
+ /// bytes on average should be skipped in order for a prefilter to be
+ /// effective.
+ max_match_len: usize,
+    /// Once the prefilter has been deemed permanently ineffective, it will be
+    /// inert throughout the rest of its lifetime. This field serves as a cheap
+    /// way to check inertness.
+ inert: bool,
+    /// The last (absolute) position to which a prefilter scanned.
+ /// Prefilters can use this position to determine whether to re-scan or
+ /// not.
+ ///
+ /// Unlike other things that impact effectiveness, this is a fleeting
+ /// condition. That is, a prefilter can be considered ineffective if it is
+ /// at a position before `last_scan_at`, but can become effective again
+ /// once the search moves past `last_scan_at`.
+ ///
+ /// The utility of this is to both avoid additional overhead from calling
+ /// the prefilter and to avoid quadratic behavior. This ensures that a
+ /// prefilter will scan any particular byte at most once. (Note that some
+ /// prefilters, like the start-byte prefilter, do not need to use this
+ /// field at all, since it only looks for starting bytes.)
+ last_scan_at: usize,
+}
+
+impl PrefilterState {
+ /// The minimum number of skip attempts to try before considering whether
+ /// a prefilter is effective or not.
+ const MIN_SKIPS: usize = 40;
+
+    /// The minimum number of bytes that skips must average, expressed as a
+    /// multiple of the maximum length of a possible match.
+    ///
+    /// That is, after MIN_SKIPS have occurred, if the average number of bytes
+    /// skipped ever falls below MIN_AVG_FACTOR * max-match-length, then the
+    /// prefilter ought to be rendered inert.
+ const MIN_AVG_FACTOR: usize = 2;
+
+ /// Create a fresh prefilter state.
+ pub fn new(max_match_len: usize) -> PrefilterState {
+ PrefilterState {
+ skips: 0,
+ skipped: 0,
+ max_match_len,
+ inert: false,
+ last_scan_at: 0,
+ }
+ }
+
+ /// Update this state with the number of bytes skipped on the last
+ /// invocation of the prefilter.
+ #[inline]
+ fn update_skipped_bytes(&mut self, skipped: usize) {
+ self.skips += 1;
+ self.skipped += skipped;
+ }
+
+ /// Updates the position at which the last scan stopped. This may be
+ /// greater than the position of the last candidate reported. For example,
+ /// searching for the "rare" byte `z` in `abczdef` for the pattern `abcz`
+ /// will report a candidate at position `0`, but the end of its last scan
+ /// will be at position `3`.
+ ///
+ /// This position factors into the effectiveness of this prefilter. If the
+ /// current position is less than the last position at which a scan ended,
+ /// then the prefilter should not be re-run until the search moves past
+ /// that position.
+ #[inline]
+ fn update_at(&mut self, at: usize) {
+ if at > self.last_scan_at {
+ self.last_scan_at = at;
+ }
+ }
+
+ /// Return true if and only if this state indicates that a prefilter is
+ /// still effective.
+ ///
+    /// The given position should correspond to the current starting position
+    /// of the search.
+ #[inline]
+ pub fn is_effective(&mut self, at: usize) -> bool {
+ if self.inert {
+ return false;
+ }
+ if at < self.last_scan_at {
+ return false;
+ }
+ if self.skips < PrefilterState::MIN_SKIPS {
+ return true;
+ }
+
+ let min_avg = PrefilterState::MIN_AVG_FACTOR * self.max_match_len;
+ if self.skipped >= min_avg * self.skips {
+ return true;
+ }
+
+ // We're inert.
+ self.inert = true;
+ false
+ }
+}
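+
+// A minimal sketch of the effectiveness heuristic above: with a maximum match
+// length of 8, a prefilter must average at least MIN_AVG_FACTOR * 8 = 16
+// skipped bytes per skip once MIN_SKIPS (40) skips have occurred; otherwise
+// it is rendered inert.
+#[cfg(test)]
+mod prefilter_state_sketch {
+    use super::PrefilterState;
+
+    #[test]
+    fn low_average_skips_render_the_prefilter_inert() {
+        let mut state = PrefilterState::new(8);
+        for _ in 0..40 {
+            // 15 bytes per skip is just below the 16 byte threshold.
+            state.update_skipped_bytes(15);
+        }
+        assert!(!state.is_effective(1_000_000));
+    }
+}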
+
+/// A builder for constructing the best possible prefilter. When constructed,
+/// this builder will heuristically select the best prefilter it can build,
+/// if any, and discard the rest.
+#[derive(Debug)]
+pub struct Builder {
+ count: usize,
+ ascii_case_insensitive: bool,
+ start_bytes: StartBytesBuilder,
+ rare_bytes: RareBytesBuilder,
+ packed: Option<packed::Builder>,
+}
+
+impl Builder {
+ /// Create a new builder for constructing the best possible prefilter.
+ pub fn new(kind: MatchKind) -> Builder {
+ let pbuilder = kind
+ .as_packed()
+ .map(|kind| packed::Config::new().match_kind(kind).builder());
+ Builder {
+ count: 0,
+ ascii_case_insensitive: false,
+ start_bytes: StartBytesBuilder::new(),
+ rare_bytes: RareBytesBuilder::new(),
+ packed: pbuilder,
+ }
+ }
+
+ /// Enable ASCII case insensitivity. When set, byte strings added to this
+ /// builder will be interpreted without respect to ASCII case.
+ pub fn ascii_case_insensitive(mut self, yes: bool) -> Builder {
+ self.ascii_case_insensitive = yes;
+ self.start_bytes = self.start_bytes.ascii_case_insensitive(yes);
+ self.rare_bytes = self.rare_bytes.ascii_case_insensitive(yes);
+ self
+ }
+
+ /// Return a prefilter suitable for quickly finding potential matches.
+ ///
+ /// All patterns added to an Aho-Corasick automaton should be added to this
+ /// builder before attempting to construct the prefilter.
+ pub fn build(&self) -> Option<PrefilterObj> {
+ match (self.start_bytes.build(), self.rare_bytes.build()) {
+ // If we could build both start and rare prefilters, then there are
+ // a few cases in which we'd want to use the start-byte prefilter
+ // over the rare-byte prefilter, since the former has lower
+ // overhead.
+ (prestart @ Some(_), prerare @ Some(_)) => {
+ // If the start-byte prefilter can scan for a smaller number
+ // of bytes than the rare-byte prefilter, then it's probably
+ // faster.
+ let has_fewer_bytes =
+ self.start_bytes.count < self.rare_bytes.count;
+ // Otherwise, if the combined frequency rank of the detected
+ // bytes in the start-byte prefilter is "close" to the combined
+ // frequency rank of the rare-byte prefilter, then we pick
+ // the start-byte prefilter even if the rare-byte prefilter
+ // heuristically searches for rare bytes. This is because the
+ // rare-byte prefilter has higher constant costs, so we tend to
+ // prefer the start-byte prefilter when we can.
+ let has_rarer_bytes =
+ self.start_bytes.rank_sum <= self.rare_bytes.rank_sum + 50;
+ if has_fewer_bytes || has_rarer_bytes {
+ prestart
+ } else {
+ prerare
+ }
+ }
+ (prestart @ Some(_), None) => prestart,
+ (None, prerare @ Some(_)) => prerare,
+ (None, None) if self.ascii_case_insensitive => None,
+ (None, None) => self
+ .packed
+ .as_ref()
+ .and_then(|b| b.build())
+ .map(|s| PrefilterObj::new(Packed(s))),
+ }
+ }
+
+ /// Add a literal string to this prefilter builder.
+ pub fn add(&mut self, bytes: &[u8]) {
+ self.count += 1;
+ self.start_bytes.add(bytes);
+ self.rare_bytes.add(bytes);
+ if let Some(ref mut pbuilder) = self.packed {
+ pbuilder.add(bytes);
+ }
+ }
+}
+
+/// A type that wraps a packed searcher and implements the `Prefilter`
+/// interface.
+#[derive(Clone, Debug)]
+struct Packed(packed::Searcher);
+
+impl Prefilter for Packed {
+ fn next_candidate(
+ &self,
+ _state: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ ) -> Candidate {
+ self.0.find_at(haystack, at).map_or(Candidate::None, Candidate::Match)
+ }
+
+ fn clone_prefilter(&self) -> Box<dyn Prefilter> {
+ Box::new(self.clone())
+ }
+
+ fn heap_bytes(&self) -> usize {
+ self.0.heap_bytes()
+ }
+
+ fn reports_false_positives(&self) -> bool {
+ false
+ }
+}
+
+/// A builder for constructing a rare byte prefilter.
+///
+/// A rare byte prefilter attempts to pick out a small set of rare bytes that
+/// occur in the patterns, and then quickly scan for occurrences of those rare
+/// bytes.
+#[derive(Clone, Debug)]
+struct RareBytesBuilder {
+ /// Whether this prefilter should account for ASCII case insensitivity or
+ /// not.
+ ascii_case_insensitive: bool,
+ /// A set of rare bytes, indexed by byte value.
+ rare_set: ByteSet,
+ /// A set of byte offsets associated with bytes in a pattern. An entry
+    /// corresponds to a particular byte (its index) and is only non-zero if
+ /// the byte occurred at an offset greater than 0 in at least one pattern.
+ ///
+ /// If a byte's offset is not representable in 8 bits, then the rare bytes
+ /// prefilter becomes inert.
+ byte_offsets: RareByteOffsets,
+ /// Whether this is available as a prefilter or not. This can be set to
+ /// false during construction if a condition is seen that invalidates the
+ /// use of the rare-byte prefilter.
+ available: bool,
+ /// The number of bytes set to an active value in `byte_offsets`.
+ count: usize,
+ /// The sum of frequency ranks for the rare bytes detected. This is
+ /// intended to give a heuristic notion of how rare the bytes are.
+ rank_sum: u16,
+}
+
+/// A set of bytes.
+#[derive(Clone, Copy)]
+struct ByteSet([bool; 256]);
+
+impl ByteSet {
+ fn empty() -> ByteSet {
+ ByteSet([false; 256])
+ }
+
+ fn insert(&mut self, b: u8) -> bool {
+ let new = !self.contains(b);
+ self.0[b as usize] = true;
+ new
+ }
+
+ fn contains(&self, b: u8) -> bool {
+ self.0[b as usize]
+ }
+}
+
+impl fmt::Debug for ByteSet {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ let mut bytes = vec![];
+ for b in 0..=255 {
+ if self.contains(b) {
+ bytes.push(b);
+ }
+ }
+ f.debug_struct("ByteSet").field("set", &bytes).finish()
+ }
+}
+
+/// A set of byte offsets, keyed by byte.
+#[derive(Clone, Copy)]
+struct RareByteOffsets {
+ /// Each entry corresponds to the maximum offset of the corresponding
+ /// byte across all patterns seen.
+ set: [RareByteOffset; 256],
+}
+
+impl RareByteOffsets {
+ /// Create a new empty set of rare byte offsets.
+ pub fn empty() -> RareByteOffsets {
+ RareByteOffsets { set: [RareByteOffset::default(); 256] }
+ }
+
+    /// Add the given offset for the given byte to this set. If the given
+    /// offset is greater than the offset currently recorded for that byte,
+    /// then it overwrites the previous value; otherwise, the existing
+    /// (larger) offset is kept.
+ pub fn set(&mut self, byte: u8, off: RareByteOffset) {
+ self.set[byte as usize].max =
+ cmp::max(self.set[byte as usize].max, off.max);
+ }
+}
+
+impl fmt::Debug for RareByteOffsets {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ let mut offsets = vec![];
+ for off in self.set.iter() {
+ if off.max > 0 {
+ offsets.push(off);
+ }
+ }
+ f.debug_struct("RareByteOffsets").field("set", &offsets).finish()
+ }
+}
+
+/// Offsets associated with an occurrence of a "rare" byte in any of the
+/// patterns used to construct a single Aho-Corasick automaton.
+#[derive(Clone, Copy, Debug)]
+struct RareByteOffset {
+ /// The maximum offset at which a particular byte occurs from the start
+ /// of any pattern. This is used as a shift amount. That is, when an
+ /// occurrence of this byte is found, the candidate position reported by
+ /// the prefilter is `position_of_byte - max`, such that the automaton
+ /// will begin its search at a position that is guaranteed to observe a
+ /// match.
+ ///
+ /// To avoid accidentally quadratic behavior, a prefilter is considered
+ /// ineffective when it is asked to start scanning from a position that it
+ /// has already scanned past.
+ ///
+ /// Using a `u8` here means that if we ever see a pattern that's longer
+ /// than 255 bytes, then the entire rare byte prefilter is disabled.
+ max: u8,
+}
+
+impl Default for RareByteOffset {
+ fn default() -> RareByteOffset {
+ RareByteOffset { max: 0 }
+ }
+}
+
+impl RareByteOffset {
+ /// Create a new rare byte offset. If the given offset is too big, then
+ /// None is returned. In that case, callers should render the rare bytes
+ /// prefilter inert.
+ fn new(max: usize) -> Option<RareByteOffset> {
+ if max > u8::MAX as usize {
+ None
+ } else {
+ Some(RareByteOffset { max: max as u8 })
+ }
+ }
+}
+
+impl RareBytesBuilder {
+ /// Create a new builder for constructing a rare byte prefilter.
+ fn new() -> RareBytesBuilder {
+ RareBytesBuilder {
+ ascii_case_insensitive: false,
+ rare_set: ByteSet::empty(),
+ byte_offsets: RareByteOffsets::empty(),
+ available: true,
+ count: 0,
+ rank_sum: 0,
+ }
+ }
+
+ /// Enable ASCII case insensitivity. When set, byte strings added to this
+ /// builder will be interpreted without respect to ASCII case.
+ fn ascii_case_insensitive(mut self, yes: bool) -> RareBytesBuilder {
+ self.ascii_case_insensitive = yes;
+ self
+ }
+
+ /// Build the rare bytes prefilter.
+ ///
+    /// If there are more than 3 distinct rare bytes, or if heuristics
+ /// otherwise determine that this prefilter should not be used, then `None`
+ /// is returned.
+ fn build(&self) -> Option<PrefilterObj> {
+ if !self.available || self.count > 3 {
+ return None;
+ }
+ let (mut bytes, mut len) = ([0; 3], 0);
+ for b in 0..=255 {
+ if self.rare_set.contains(b) {
+ bytes[len] = b as u8;
+ len += 1;
+ }
+ }
+ match len {
+ 0 => None,
+ 1 => Some(PrefilterObj::new(RareBytesOne {
+ byte1: bytes[0],
+ offset: self.byte_offsets.set[bytes[0] as usize],
+ })),
+ 2 => Some(PrefilterObj::new(RareBytesTwo {
+ offsets: self.byte_offsets,
+ byte1: bytes[0],
+ byte2: bytes[1],
+ })),
+ 3 => Some(PrefilterObj::new(RareBytesThree {
+ offsets: self.byte_offsets,
+ byte1: bytes[0],
+ byte2: bytes[1],
+ byte3: bytes[2],
+ })),
+ _ => unreachable!(),
+ }
+ }
+
+ /// Add a byte string to this builder.
+ ///
+ /// All patterns added to an Aho-Corasick automaton should be added to this
+ /// builder before attempting to construct the prefilter.
+ fn add(&mut self, bytes: &[u8]) {
+ // If we've already given up, then do nothing.
+ if !self.available {
+ return;
+ }
+ // If we've already blown our budget, then don't waste time looking
+ // for more rare bytes.
+ if self.count > 3 {
+ self.available = false;
+ return;
+ }
+ // If the pattern is too long, then our offset table is bunk, so
+ // give up.
+ if bytes.len() >= 256 {
+ self.available = false;
+ return;
+ }
+ let mut rarest = match bytes.get(0) {
+ None => return,
+ Some(&b) => (b, freq_rank(b)),
+ };
+ // The idea here is to look for the rarest byte in each pattern, and
+ // add that to our set. As a special exception, if we see a byte that
+ // we've already added, then we immediately stop and choose that byte,
+ // even if there's another rare byte in the pattern. This helps us
+ // apply the rare byte optimization in more cases by attempting to pick
+ // bytes that are in common between patterns. So for example, if we
+ // were searching for `Sherlock` and `lockjaw`, then this would pick
+ // `k` for both patterns, resulting in the use of `memchr` instead of
+ // `memchr2` for `k` and `j`.
+ let mut found = false;
+ for (pos, &b) in bytes.iter().enumerate() {
+ self.set_offset(pos, b);
+ if found {
+ continue;
+ }
+ if self.rare_set.contains(b) {
+ found = true;
+ continue;
+ }
+ let rank = freq_rank(b);
+ if rank < rarest.1 {
+ rarest = (b, rank);
+ }
+ }
+ if !found {
+ self.add_rare_byte(rarest.0);
+ }
+ }
+
+ fn set_offset(&mut self, pos: usize, byte: u8) {
+ // This unwrap is OK because pos is never bigger than our max.
+ let offset = RareByteOffset::new(pos).unwrap();
+ self.byte_offsets.set(byte, offset);
+ if self.ascii_case_insensitive {
+ self.byte_offsets.set(opposite_ascii_case(byte), offset);
+ }
+ }
+
+ fn add_rare_byte(&mut self, byte: u8) {
+ self.add_one_rare_byte(byte);
+ if self.ascii_case_insensitive {
+ self.add_one_rare_byte(opposite_ascii_case(byte));
+ }
+ }
+
+ fn add_one_rare_byte(&mut self, byte: u8) {
+ if self.rare_set.insert(byte) {
+ self.count += 1;
+ self.rank_sum += freq_rank(byte) as u16;
+ }
+ }
+}
+
+/// A prefilter for scanning for a single "rare" byte.
+#[derive(Clone, Debug)]
+struct RareBytesOne {
+ byte1: u8,
+ offset: RareByteOffset,
+}
+
+impl Prefilter for RareBytesOne {
+ fn next_candidate(
+ &self,
+ state: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ ) -> Candidate {
+ memchr(self.byte1, &haystack[at..])
+ .map(|i| {
+ let pos = at + i;
+ state.last_scan_at = pos;
+ cmp::max(at, pos.saturating_sub(self.offset.max as usize))
+ })
+ .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
+ }
+
+ fn clone_prefilter(&self) -> Box<dyn Prefilter> {
+ Box::new(self.clone())
+ }
+
+ fn heap_bytes(&self) -> usize {
+ 0
+ }
+}
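+
+// A small illustrative test of the shift described by `RareByteOffset`:
+// finding the rare byte `z` with a maximum pattern offset of 3 at haystack
+// position 10 reports a candidate starting at 10 - 3 = 7.
+#[cfg(test)]
+mod rare_byte_shift_sketch {
+    use super::{Candidate, Prefilter, PrefilterState, RareByteOffset, RareBytesOne};
+
+    #[test]
+    fn candidate_is_shifted_back_by_the_byte_offset() {
+        let pre =
+            RareBytesOne { byte1: b'z', offset: RareByteOffset { max: 3 } };
+        let mut state = PrefilterState::new(4);
+        match pre.next_candidate(&mut state, b"aaaaaaaaaazbcd", 0) {
+            Candidate::PossibleStartOfMatch(start) => assert_eq!(start, 7),
+            cand => panic!("unexpected candidate: {:?}", cand),
+        }
+    }
+}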
+
+/// A prefilter for scanning for two "rare" bytes.
+#[derive(Clone, Debug)]
+struct RareBytesTwo {
+ offsets: RareByteOffsets,
+ byte1: u8,
+ byte2: u8,
+}
+
+impl Prefilter for RareBytesTwo {
+ fn next_candidate(
+ &self,
+ state: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ ) -> Candidate {
+ memchr2(self.byte1, self.byte2, &haystack[at..])
+ .map(|i| {
+ let pos = at + i;
+ state.update_at(pos);
+ let offset = self.offsets.set[haystack[pos] as usize].max;
+ cmp::max(at, pos.saturating_sub(offset as usize))
+ })
+ .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
+ }
+
+ fn clone_prefilter(&self) -> Box<dyn Prefilter> {
+ Box::new(self.clone())
+ }
+
+ fn heap_bytes(&self) -> usize {
+ 0
+ }
+}
+
+/// A prefilter for scanning for three "rare" bytes.
+#[derive(Clone, Debug)]
+struct RareBytesThree {
+ offsets: RareByteOffsets,
+ byte1: u8,
+ byte2: u8,
+ byte3: u8,
+}
+
+impl Prefilter for RareBytesThree {
+ fn next_candidate(
+ &self,
+ state: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ ) -> Candidate {
+ memchr3(self.byte1, self.byte2, self.byte3, &haystack[at..])
+ .map(|i| {
+ let pos = at + i;
+ state.update_at(pos);
+ let offset = self.offsets.set[haystack[pos] as usize].max;
+ cmp::max(at, pos.saturating_sub(offset as usize))
+ })
+ .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
+ }
+
+ fn clone_prefilter(&self) -> Box<dyn Prefilter> {
+ Box::new(self.clone())
+ }
+
+ fn heap_bytes(&self) -> usize {
+ 0
+ }
+}
+
+/// A builder for constructing a starting byte prefilter.
+///
+/// A starting byte prefilter is a simplistic prefilter that looks for possible
+/// matches by reporting all positions corresponding to a particular byte. This
+/// generally only takes effect when there are at most 3 distinct possible
+/// starting bytes. e.g., the patterns `foo`, `bar`, and `baz` have two
+/// distinct starting bytes (`f` and `b`), and this prefilter returns all
+/// occurrences of either `f` or `b`.
+///
+/// In some cases, a heuristic frequency analysis may determine that it would
+/// be better not to use this prefilter even when there are 3 or fewer distinct
+/// starting bytes.
+#[derive(Clone, Debug)]
+struct StartBytesBuilder {
+ /// Whether this prefilter should account for ASCII case insensitivity or
+ /// not.
+ ascii_case_insensitive: bool,
+ /// The set of starting bytes observed.
+ byteset: Vec<bool>,
+ /// The number of bytes set to true in `byteset`.
+ count: usize,
+ /// The sum of frequency ranks for the rare bytes detected. This is
+ /// intended to give a heuristic notion of how rare the bytes are.
+ rank_sum: u16,
+}
+
+impl StartBytesBuilder {
+ /// Create a new builder for constructing a start byte prefilter.
+ fn new() -> StartBytesBuilder {
+ StartBytesBuilder {
+ ascii_case_insensitive: false,
+ byteset: vec![false; 256],
+ count: 0,
+ rank_sum: 0,
+ }
+ }
+
+ /// Enable ASCII case insensitivity. When set, byte strings added to this
+ /// builder will be interpreted without respect to ASCII case.
+ fn ascii_case_insensitive(mut self, yes: bool) -> StartBytesBuilder {
+ self.ascii_case_insensitive = yes;
+ self
+ }
+
+ /// Build the starting bytes prefilter.
+ ///
+ /// If there are more than 3 distinct starting bytes, or if heuristics
+ /// otherwise determine that this prefilter should not be used, then `None`
+ /// is returned.
+ fn build(&self) -> Option<PrefilterObj> {
+ if self.count > 3 {
+ return None;
+ }
+ let (mut bytes, mut len) = ([0; 3], 0);
+ for b in 0..256 {
+ if !self.byteset[b] {
+ continue;
+ }
+ // We don't handle non-ASCII bytes for now. Getting non-ASCII
+ // bytes right is trickier, since we generally don't want to put
+ // a leading UTF-8 code unit into a prefilter that isn't ASCII,
+            // since they can occur frequently. Instead, it would be better to use a
+ // continuation byte, but this requires more sophisticated analysis
+ // of the automaton and a richer prefilter API.
+ if b > 0x7F {
+ return None;
+ }
+ bytes[len] = b as u8;
+ len += 1;
+ }
+ match len {
+ 0 => None,
+ 1 => Some(PrefilterObj::new(StartBytesOne { byte1: bytes[0] })),
+ 2 => Some(PrefilterObj::new(StartBytesTwo {
+ byte1: bytes[0],
+ byte2: bytes[1],
+ })),
+ 3 => Some(PrefilterObj::new(StartBytesThree {
+ byte1: bytes[0],
+ byte2: bytes[1],
+ byte3: bytes[2],
+ })),
+ _ => unreachable!(),
+ }
+ }
+
+ /// Add a byte string to this builder.
+ ///
+ /// All patterns added to an Aho-Corasick automaton should be added to this
+ /// builder before attempting to construct the prefilter.
+ fn add(&mut self, bytes: &[u8]) {
+ if self.count > 3 {
+ return;
+ }
+ if let Some(&byte) = bytes.get(0) {
+ self.add_one_byte(byte);
+ if self.ascii_case_insensitive {
+ self.add_one_byte(opposite_ascii_case(byte));
+ }
+ }
+ }
+
+ fn add_one_byte(&mut self, byte: u8) {
+ if !self.byteset[byte as usize] {
+ self.byteset[byte as usize] = true;
+ self.count += 1;
+ self.rank_sum += freq_rank(byte) as u16;
+ }
+ }
+}
+
+/// A prefilter for scanning for a single starting byte.
+#[derive(Clone, Debug)]
+struct StartBytesOne {
+ byte1: u8,
+}
+
+impl Prefilter for StartBytesOne {
+ fn next_candidate(
+ &self,
+ _state: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ ) -> Candidate {
+ memchr(self.byte1, &haystack[at..])
+ .map(|i| at + i)
+ .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
+ }
+
+ fn clone_prefilter(&self) -> Box<dyn Prefilter> {
+ Box::new(self.clone())
+ }
+
+ fn heap_bytes(&self) -> usize {
+ 0
+ }
+}
+
+/// A prefilter for scanning for two starting bytes.
+#[derive(Clone, Debug)]
+struct StartBytesTwo {
+ byte1: u8,
+ byte2: u8,
+}
+
+impl Prefilter for StartBytesTwo {
+ fn next_candidate(
+ &self,
+ _state: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ ) -> Candidate {
+ memchr2(self.byte1, self.byte2, &haystack[at..])
+ .map(|i| at + i)
+ .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
+ }
+
+ fn clone_prefilter(&self) -> Box<dyn Prefilter> {
+ Box::new(self.clone())
+ }
+
+ fn heap_bytes(&self) -> usize {
+ 0
+ }
+}
+
+/// A prefilter for scanning for three starting bytes.
+#[derive(Clone, Debug)]
+struct StartBytesThree {
+ byte1: u8,
+ byte2: u8,
+ byte3: u8,
+}
+
+impl Prefilter for StartBytesThree {
+ fn next_candidate(
+ &self,
+ _state: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ ) -> Candidate {
+ memchr3(self.byte1, self.byte2, self.byte3, &haystack[at..])
+ .map(|i| at + i)
+ .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
+ }
+
+ fn clone_prefilter(&self) -> Box<dyn Prefilter> {
+ Box::new(self.clone())
+ }
+
+ fn heap_bytes(&self) -> usize {
+ 0
+ }
+}
+
+/// Return the next candidate reported by the given prefilter while
+/// simultaneously updating the given prestate.
+///
+/// The caller is responsible for checking the prestate before deciding whether
+/// to initiate a search.
+#[inline]
+pub fn next<P: Prefilter>(
+ prestate: &mut PrefilterState,
+ prefilter: P,
+ haystack: &[u8],
+ at: usize,
+) -> Candidate {
+ let cand = prefilter.next_candidate(prestate, haystack, at);
+ match cand {
+ Candidate::None => {
+ prestate.update_skipped_bytes(haystack.len() - at);
+ }
+ Candidate::Match(ref m) => {
+ prestate.update_skipped_bytes(m.start() - at);
+ }
+ Candidate::PossibleStartOfMatch(i) => {
+ prestate.update_skipped_bytes(i - at);
+ }
+ }
+ cand
+}
+
+/// If the given byte is an ASCII letter, then return it in the opposite case.
+/// e.g., Given `b'A'`, this returns `b'a'`, and given `b'a'`, this returns
+/// `b'A'`. If the given byte is not an ASCII letter, then it is returned
+/// unchanged.
+pub fn opposite_ascii_case(b: u8) -> u8 {
+ if b'A' <= b && b <= b'Z' {
+ b.to_ascii_lowercase()
+ } else if b'a' <= b && b <= b'z' {
+ b.to_ascii_uppercase()
+ } else {
+ b
+ }
+}
+
+/// Return the frequency rank of the given byte. The higher the rank, the more
+/// common the byte (heuristically speaking).
+fn freq_rank(b: u8) -> u8 {
+ use byte_frequencies::BYTE_FREQUENCIES;
+ BYTE_FREQUENCIES[b as usize]
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn scratch() {
+ let mut b = Builder::new(MatchKind::LeftmostFirst);
+ b.add(b"Sherlock");
+ b.add(b"locjaw");
+ // b.add(b"Sherlock");
+ // b.add(b"Holmes");
+ // b.add(b"Watson");
+ // b.add("Шерлок Холмс".as_bytes());
+ // b.add("Джон Уотсон".as_bytes());
+
+ let s = b.build().unwrap();
+ println!("{:?}", s);
+ }
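+
+    // A minimal illustrative check of `opposite_ascii_case`: ASCII letters
+    // flip case and every other byte is returned unchanged.
+    #[test]
+    fn opposite_ascii_case_flips_letters_only() {
+        assert_eq!(opposite_ascii_case(b'A'), b'a');
+        assert_eq!(opposite_ascii_case(b'a'), b'A');
+        assert_eq!(opposite_ascii_case(b'3'), b'3');
+    }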
+}
diff --git a/src/state_id.rs b/src/state_id.rs
new file mode 100644
index 0000000..22f6be2
--- /dev/null
+++ b/src/state_id.rs
@@ -0,0 +1,192 @@
+use std::fmt::Debug;
+use std::hash::Hash;
+
+use error::{Error, Result};
+
+// NOTE: Most of this code was copied from regex-automata, but without the
+// (de)serialization specific stuff.
+
+/// Check that the premultiplication of the given state identifier can
+/// fit into the representation indicated by `S`. If it cannot, or if it
+/// overflows `usize` itself, then an error is returned.
+pub fn premultiply_overflow_error<S: StateID>(
+ last_state: S,
+ alphabet_len: usize,
+) -> Result<()> {
+ let requested = match last_state.to_usize().checked_mul(alphabet_len) {
+ Some(requested) => requested,
+ None => return Err(Error::premultiply_overflow(0, 0)),
+ };
+ if requested > S::max_id() {
+ return Err(Error::premultiply_overflow(S::max_id(), requested));
+ }
+ Ok(())
+}
+
+/// Convert the given `usize` to the chosen state identifier
+/// representation. If the given value cannot fit in the chosen
+/// representation, then an error is returned.
+pub fn usize_to_state_id<S: StateID>(value: usize) -> Result<S> {
+ if value > S::max_id() {
+ Err(Error::state_id_overflow(S::max_id()))
+ } else {
+ Ok(S::from_usize(value))
+ }
+}
+
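+// A minimal sketch of the overflow check above: a value that does not fit in
+// the chosen representation is rejected rather than silently truncated.
+#[cfg(test)]
+mod state_id_sketch {
+    use super::usize_to_state_id;
+
+    #[test]
+    fn conversions_that_do_not_fit_are_errors() {
+        assert!(usize_to_state_id::<u8>(255).is_ok());
+        assert!(usize_to_state_id::<u8>(256).is_err());
+    }
+}
+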
+/// Return the unique identifier for an automaton's fail state in the chosen
+/// representation indicated by `S`.
+pub fn fail_id<S: StateID>() -> S {
+ S::from_usize(0)
+}
+
+/// Return the unique identifier for an automaton's dead state in the chosen
+/// representation indicated by `S`.
+pub fn dead_id<S: StateID>() -> S {
+ S::from_usize(1)
+}
+
+mod private {
+ /// Sealed stops crates other than aho-corasick from implementing any
+ /// traits that use it.
+ pub trait Sealed {}
+ impl Sealed for u8 {}
+ impl Sealed for u16 {}
+ impl Sealed for u32 {}
+ impl Sealed for u64 {}
+ impl Sealed for usize {}
+}
+
+/// A trait describing the representation of an automaton's state identifier.
+///
+/// The purpose of this trait is to safely express both the possible state
+/// identifier representations that can be used in an automaton and to convert
+/// between state identifier representations and types that can be used to
+/// efficiently index memory (such as `usize`).
+///
+/// In general, one should not need to implement this trait explicitly. Indeed,
+/// for now, this trait is sealed such that it cannot be implemented by any
+/// other type. In particular, this crate provides implementations for `u8`,
+/// `u16`, `u32`, `u64` and `usize`. (`u32` and `u64` are only provided for
+/// targets that can represent all corresponding values in a `usize`.)
+pub trait StateID:
+ private::Sealed
+ + Clone
+ + Copy
+ + Debug
+ + Eq
+ + Hash
+ + PartialEq
+ + PartialOrd
+ + Ord
+{
+ /// Convert from a `usize` to this implementation's representation.
+ ///
+ /// Implementors may assume that `n <= Self::max_id`. That is, implementors
+ /// do not need to check whether `n` can fit inside this implementation's
+ /// representation.
+ fn from_usize(n: usize) -> Self;
+
+ /// Convert this implementation's representation to a `usize`.
+ ///
+ /// Implementors must not return a `usize` value greater than
+ /// `Self::max_id` and must not permit overflow when converting between the
+ /// implementor's representation and `usize`. In general, the preferred
+ /// way for implementors to achieve this is to simply not provide
+ /// implementations of `StateID` that cannot fit into the target platform's
+ /// `usize`.
+ fn to_usize(self) -> usize;
+
+ /// Return the maximum state identifier supported by this representation.
+ ///
+ /// Implementors must return a correct bound. Doing otherwise may result
+ /// in unspecified behavior (but will not violate memory safety).
+ fn max_id() -> usize;
+}
+
+impl StateID for usize {
+ #[inline]
+ fn from_usize(n: usize) -> usize {
+ n
+ }
+
+ #[inline]
+ fn to_usize(self) -> usize {
+ self
+ }
+
+ #[inline]
+ fn max_id() -> usize {
+ ::std::usize::MAX
+ }
+}
+
+impl StateID for u8 {
+ #[inline]
+ fn from_usize(n: usize) -> u8 {
+ n as u8
+ }
+
+ #[inline]
+ fn to_usize(self) -> usize {
+ self as usize
+ }
+
+ #[inline]
+ fn max_id() -> usize {
+ ::std::u8::MAX as usize
+ }
+}
+
+impl StateID for u16 {
+ #[inline]
+ fn from_usize(n: usize) -> u16 {
+ n as u16
+ }
+
+ #[inline]
+ fn to_usize(self) -> usize {
+ self as usize
+ }
+
+ #[inline]
+ fn max_id() -> usize {
+ ::std::u16::MAX as usize
+ }
+}
+
+#[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
+impl StateID for u32 {
+ #[inline]
+ fn from_usize(n: usize) -> u32 {
+ n as u32
+ }
+
+ #[inline]
+ fn to_usize(self) -> usize {
+ self as usize
+ }
+
+ #[inline]
+ fn max_id() -> usize {
+ ::std::u32::MAX as usize
+ }
+}
+
+#[cfg(target_pointer_width = "64")]
+impl StateID for u64 {
+ #[inline]
+ fn from_usize(n: usize) -> u64 {
+ n as u64
+ }
+
+ #[inline]
+ fn to_usize(self) -> usize {
+ self as usize
+ }
+
+ #[inline]
+ fn max_id() -> usize {
+ ::std::u64::MAX as usize
+ }
+}
diff --git a/src/tests.rs b/src/tests.rs
new file mode 100644
index 0000000..0ae31f0
--- /dev/null
+++ b/src/tests.rs
@@ -0,0 +1,1152 @@
+use std::collections::HashMap;
+use std::io;
+use std::usize;
+
+use {AhoCorasickBuilder, Match, MatchKind};
+
+/// A description of a single test against an Aho-Corasick automaton.
+///
+/// A single test may not necessarily pass on every configuration of an
+/// Aho-Corasick automaton. The tests are categorized and grouped appropriately
+/// below.
+#[derive(Clone, Debug, Eq, PartialEq)]
+struct SearchTest {
+ /// The name of this test, for debugging.
+ name: &'static str,
+ /// The patterns to search for.
+ patterns: &'static [&'static str],
+ /// The text to search.
+ haystack: &'static str,
+ /// Each match is a triple of (pattern_index, start, end), where
+ /// pattern_index is an index into `patterns` and `start`/`end` are indices
+ /// into `haystack`.
+ matches: &'static [(usize, usize, usize)],
+}
+
+/// Short-hand constructor for SearchTest. We use it a lot below.
+macro_rules! t {
+ ($name:ident, $patterns:expr, $haystack:expr, $matches:expr) => {
+ SearchTest {
+ name: stringify!($name),
+ patterns: $patterns,
+ haystack: $haystack,
+ matches: $matches,
+ }
+ };
+}
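+// For example, `t!(basic010, &["a"], "a", &[(0, 0, 1)])` below expands to
+// `SearchTest { name: "basic010", patterns: &["a"], haystack: "a",
+// matches: &[(0, 0, 1)] }`.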
+
+/// A collection of test groups.
+type TestCollection = &'static [&'static [SearchTest]];
+
+// Define several collections corresponding to the different types of match
+// semantics supported by Aho-Corasick. These collections have some overlap,
+// but each collection should have some tests that no other collection has.
+
+/// Tests for Aho-Corasick's standard non-overlapping match semantics.
+const AC_STANDARD_NON_OVERLAPPING: TestCollection =
+ &[BASICS, NON_OVERLAPPING, STANDARD, REGRESSION];
+
+/// Tests for Aho-Corasick's anchored standard non-overlapping match semantics.
+const AC_STANDARD_ANCHORED_NON_OVERLAPPING: TestCollection =
+ &[ANCHORED_BASICS, ANCHORED_NON_OVERLAPPING, STANDARD_ANCHORED];
+
+/// Tests for Aho-Corasick's standard overlapping match semantics.
+const AC_STANDARD_OVERLAPPING: TestCollection =
+ &[BASICS, OVERLAPPING, REGRESSION];
+
+/// Tests for Aho-Corasick's anchored standard overlapping match semantics.
+const AC_STANDARD_ANCHORED_OVERLAPPING: TestCollection =
+ &[ANCHORED_BASICS, ANCHORED_OVERLAPPING];
+
+/// Tests for Aho-Corasick's leftmost-first match semantics.
+const AC_LEFTMOST_FIRST: TestCollection =
+ &[BASICS, NON_OVERLAPPING, LEFTMOST, LEFTMOST_FIRST, REGRESSION];
+
+/// Tests for Aho-Corasick's anchored leftmost-first match semantics.
+const AC_LEFTMOST_FIRST_ANCHORED: TestCollection = &[
+ ANCHORED_BASICS,
+ ANCHORED_NON_OVERLAPPING,
+ ANCHORED_LEFTMOST,
+ ANCHORED_LEFTMOST_FIRST,
+];
+
+/// Tests for Aho-Corasick's leftmost-longest match semantics.
+const AC_LEFTMOST_LONGEST: TestCollection =
+ &[BASICS, NON_OVERLAPPING, LEFTMOST, LEFTMOST_LONGEST, REGRESSION];
+
+/// Tests for Aho-Corasick's anchored leftmost-longest match semantics.
+const AC_LEFTMOST_LONGEST_ANCHORED: TestCollection = &[
+ ANCHORED_BASICS,
+ ANCHORED_NON_OVERLAPPING,
+ ANCHORED_LEFTMOST,
+ ANCHORED_LEFTMOST_LONGEST,
+];
+
+// Now define the individual tests that make up the collections above.
+
+/// A collection of tests for the Aho-Corasick algorithm that should always be
+/// true regardless of match semantics. That is, all combinations of
+/// leftmost-{shortest, first, longest} x {overlapping, non-overlapping}
+/// should produce the same answer.
+const BASICS: &'static [SearchTest] = &[
+ t!(basic000, &[], "", &[]),
+ t!(basic001, &["a"], "", &[]),
+ t!(basic010, &["a"], "a", &[(0, 0, 1)]),
+ t!(basic020, &["a"], "aa", &[(0, 0, 1), (0, 1, 2)]),
+ t!(basic030, &["a"], "aaa", &[(0, 0, 1), (0, 1, 2), (0, 2, 3)]),
+ t!(basic040, &["a"], "aba", &[(0, 0, 1), (0, 2, 3)]),
+ t!(basic050, &["a"], "bba", &[(0, 2, 3)]),
+ t!(basic060, &["a"], "bbb", &[]),
+ t!(basic070, &["a"], "bababbbba", &[(0, 1, 2), (0, 3, 4), (0, 8, 9)]),
+ t!(basic100, &["aa"], "", &[]),
+ t!(basic110, &["aa"], "aa", &[(0, 0, 2)]),
+ t!(basic120, &["aa"], "aabbaa", &[(0, 0, 2), (0, 4, 6)]),
+ t!(basic130, &["aa"], "abbab", &[]),
+ t!(basic140, &["aa"], "abbabaa", &[(0, 5, 7)]),
+ t!(basic200, &["abc"], "abc", &[(0, 0, 3)]),
+ t!(basic210, &["abc"], "zazabzabcz", &[(0, 6, 9)]),
+ t!(basic220, &["abc"], "zazabczabcz", &[(0, 3, 6), (0, 7, 10)]),
+ t!(basic300, &["a", "b"], "", &[]),
+ t!(basic310, &["a", "b"], "z", &[]),
+ t!(basic320, &["a", "b"], "b", &[(1, 0, 1)]),
+ t!(basic330, &["a", "b"], "a", &[(0, 0, 1)]),
+ t!(
+ basic340,
+ &["a", "b"],
+ "abba",
+ &[(0, 0, 1), (1, 1, 2), (1, 2, 3), (0, 3, 4),]
+ ),
+ t!(
+ basic350,
+ &["b", "a"],
+ "abba",
+ &[(1, 0, 1), (0, 1, 2), (0, 2, 3), (1, 3, 4),]
+ ),
+ t!(basic360, &["abc", "bc"], "xbc", &[(1, 1, 3),]),
+ t!(basic400, &["foo", "bar"], "", &[]),
+ t!(basic410, &["foo", "bar"], "foobar", &[(0, 0, 3), (1, 3, 6),]),
+ t!(basic420, &["foo", "bar"], "barfoo", &[(1, 0, 3), (0, 3, 6),]),
+ t!(basic430, &["foo", "bar"], "foofoo", &[(0, 0, 3), (0, 3, 6),]),
+ t!(basic440, &["foo", "bar"], "barbar", &[(1, 0, 3), (1, 3, 6),]),
+ t!(basic450, &["foo", "bar"], "bafofoo", &[(0, 4, 7),]),
+ t!(basic460, &["bar", "foo"], "bafofoo", &[(1, 4, 7),]),
+ t!(basic470, &["foo", "bar"], "fobabar", &[(1, 4, 7),]),
+ t!(basic480, &["bar", "foo"], "fobabar", &[(0, 4, 7),]),
+ t!(basic600, &[""], "", &[(0, 0, 0)]),
+ t!(basic610, &[""], "a", &[(0, 0, 0), (0, 1, 1)]),
+ t!(basic620, &[""], "abc", &[(0, 0, 0), (0, 1, 1), (0, 2, 2), (0, 3, 3)]),
+ t!(basic700, &["yabcdef", "abcdezghi"], "yabcdefghi", &[(0, 0, 7),]),
+ t!(basic710, &["yabcdef", "abcdezghi"], "yabcdezghi", &[(1, 1, 10),]),
+ t!(
+ basic720,
+ &["yabcdef", "bcdeyabc", "abcdezghi"],
+ "yabcdezghi",
+ &[(2, 1, 10),]
+ ),
+];
+
+/// A collection of *anchored* tests for the Aho-Corasick algorithm that should
+/// always be true regardless of match semantics. That is, all combinations of
+/// leftmost-{shortest, first, longest} x {overlapping, non-overlapping} should
+/// produce the same answer.
+const ANCHORED_BASICS: &'static [SearchTest] = &[
+ t!(abasic000, &[], "", &[]),
+ t!(abasic010, &[""], "", &[(0, 0, 0)]),
+ t!(abasic020, &[""], "a", &[(0, 0, 0)]),
+ t!(abasic030, &[""], "abc", &[(0, 0, 0)]),
+ t!(abasic100, &["a"], "a", &[(0, 0, 1)]),
+ t!(abasic110, &["a"], "aa", &[(0, 0, 1)]),
+ t!(abasic120, &["a", "b"], "ab", &[(0, 0, 1)]),
+ t!(abasic130, &["a", "b"], "ba", &[(1, 0, 1)]),
+ t!(abasic140, &["foo", "foofoo"], "foo", &[(0, 0, 3)]),
+ t!(abasic150, &["foofoo", "foo"], "foo", &[(1, 0, 3)]),
+];
+
+/// Tests for non-overlapping standard match semantics.
+///
+/// These tests generally shouldn't pass for leftmost-{first,longest}, although
+/// some do in order to write clearer tests. For example, standard000 will
+/// pass with leftmost-first semantics, but standard010 will not. We write
+/// both to emphasize how the match semantics work.
+const STANDARD: &'static [SearchTest] = &[
+ t!(standard000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]),
+ t!(standard010, &["abcd", "ab"], "abcd", &[(1, 0, 2)]),
+ t!(standard020, &["abcd", "ab", "abc"], "abcd", &[(1, 0, 2)]),
+ t!(standard030, &["abcd", "abc", "ab"], "abcd", &[(2, 0, 2)]),
+ t!(standard040, &["a", ""], "a", &[(1, 0, 0), (1, 1, 1)]),
+ t!(
+ standard400,
+ &["abcd", "bcd", "cd", "b"],
+ "abcd",
+ &[(3, 1, 2), (2, 2, 4),]
+ ),
+ t!(standard410, &["", "a"], "a", &[(0, 0, 0), (0, 1, 1),]),
+ t!(standard420, &["", "a"], "aa", &[(0, 0, 0), (0, 1, 1), (0, 2, 2),]),
+ t!(standard430, &["", "a", ""], "a", &[(0, 0, 0), (0, 1, 1),]),
+ t!(standard440, &["a", "", ""], "a", &[(1, 0, 0), (1, 1, 1),]),
+ t!(standard450, &["", "", "a"], "a", &[(0, 0, 0), (0, 1, 1),]),
+];
+
+/// Like STANDARD, but for anchored searches.
+const STANDARD_ANCHORED: &'static [SearchTest] = &[
+ t!(astandard000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]),
+ t!(astandard010, &["abcd", "ab"], "abcd", &[(1, 0, 2)]),
+ t!(astandard020, &["abcd", "ab", "abc"], "abcd", &[(1, 0, 2)]),
+ t!(astandard030, &["abcd", "abc", "ab"], "abcd", &[(2, 0, 2)]),
+ t!(astandard040, &["a", ""], "a", &[(1, 0, 0)]),
+ t!(astandard050, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4)]),
+ t!(astandard410, &["", "a"], "a", &[(0, 0, 0)]),
+ t!(astandard420, &["", "a"], "aa", &[(0, 0, 0)]),
+ t!(astandard430, &["", "a", ""], "a", &[(0, 0, 0)]),
+ t!(astandard440, &["a", "", ""], "a", &[(1, 0, 0)]),
+ t!(astandard450, &["", "", "a"], "a", &[(0, 0, 0)]),
+];
+
+/// Tests for non-overlapping leftmost match semantics. These should pass for
+/// both leftmost-first and leftmost-longest match kinds. Stated differently,
+/// among ambiguous matches, the longest match and the match that appeared
+/// first when constructing the automaton should always be the same.
+const LEFTMOST: &'static [SearchTest] = &[
+ t!(leftmost000, &["ab", "ab"], "abcd", &[(0, 0, 2)]),
+ t!(leftmost010, &["a", ""], "a", &[(0, 0, 1), (1, 1, 1)]),
+ t!(leftmost020, &["", ""], "a", &[(0, 0, 0), (0, 1, 1)]),
+ t!(leftmost030, &["a", "ab"], "aa", &[(0, 0, 1), (0, 1, 2)]),
+ t!(leftmost031, &["ab", "a"], "aa", &[(1, 0, 1), (1, 1, 2)]),
+ t!(leftmost032, &["ab", "a"], "xayabbbz", &[(1, 1, 2), (0, 3, 5)]),
+ t!(leftmost300, &["abcd", "bce", "b"], "abce", &[(1, 1, 4)]),
+ t!(leftmost310, &["abcd", "ce", "bc"], "abce", &[(2, 1, 3)]),
+ t!(leftmost320, &["abcd", "bce", "ce", "b"], "abce", &[(1, 1, 4)]),
+ t!(leftmost330, &["abcd", "bce", "cz", "bc"], "abcz", &[(3, 1, 3)]),
+ t!(leftmost340, &["bce", "cz", "bc"], "bcz", &[(2, 0, 2)]),
+ t!(leftmost350, &["abc", "bd", "ab"], "abd", &[(2, 0, 2)]),
+ t!(
+ leftmost360,
+ &["abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(2, 0, 8),]
+ ),
+ t!(
+ leftmost370,
+ &["abcdefghi", "cde", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8),]
+ ),
+ t!(
+ leftmost380,
+ &["abcdefghi", "hz", "abcdefgh", "a"],
+ "abcdefghz",
+ &[(2, 0, 8),]
+ ),
+ t!(
+ leftmost390,
+ &["b", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8),]
+ ),
+ t!(
+ leftmost400,
+ &["h", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8),]
+ ),
+ t!(
+ leftmost410,
+ &["z", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8), (0, 8, 9),]
+ ),
+];
+
+/// Like LEFTMOST, but for anchored searches.
+const ANCHORED_LEFTMOST: &'static [SearchTest] = &[
+ t!(aleftmost000, &["ab", "ab"], "abcd", &[(0, 0, 2)]),
+ t!(aleftmost010, &["a", ""], "a", &[(0, 0, 1)]),
+ t!(aleftmost020, &["", ""], "a", &[(0, 0, 0)]),
+ t!(aleftmost030, &["a", "ab"], "aa", &[(0, 0, 1)]),
+ t!(aleftmost031, &["ab", "a"], "aa", &[(1, 0, 1)]),
+ t!(aleftmost032, &["ab", "a"], "xayabbbz", &[]),
+ t!(aleftmost300, &["abcd", "bce", "b"], "abce", &[]),
+ t!(aleftmost310, &["abcd", "ce", "bc"], "abce", &[]),
+ t!(aleftmost320, &["abcd", "bce", "ce", "b"], "abce", &[]),
+ t!(aleftmost330, &["abcd", "bce", "cz", "bc"], "abcz", &[]),
+ t!(aleftmost340, &["bce", "cz", "bc"], "bcz", &[(2, 0, 2)]),
+ t!(aleftmost350, &["abc", "bd", "ab"], "abd", &[(2, 0, 2)]),
+ t!(
+ aleftmost360,
+ &["abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(2, 0, 8),]
+ ),
+ t!(
+ aleftmost370,
+ &["abcdefghi", "cde", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8),]
+ ),
+ t!(
+ aleftmost380,
+ &["abcdefghi", "hz", "abcdefgh", "a"],
+ "abcdefghz",
+ &[(2, 0, 8),]
+ ),
+ t!(
+ aleftmost390,
+ &["b", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8),]
+ ),
+ t!(
+ aleftmost400,
+ &["h", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8),]
+ ),
+ t!(
+ aleftmost410,
+ &["z", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8)]
+ ),
+];
+
+/// Tests for non-overlapping leftmost-first match semantics. These tests
+/// should generally be specific to leftmost-first, which means they should
+/// generally fail under leftmost-longest semantics.
+const LEFTMOST_FIRST: &'static [SearchTest] = &[
+ t!(leftfirst000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]),
+ t!(leftfirst010, &["", "a"], "a", &[(0, 0, 0), (0, 1, 1)]),
+ t!(leftfirst011, &["", "a", ""], "a", &[(0, 0, 0), (0, 1, 1),]),
+ t!(leftfirst012, &["a", "", ""], "a", &[(0, 0, 1), (1, 1, 1),]),
+ t!(leftfirst013, &["", "", "a"], "a", &[(0, 0, 0), (0, 1, 1),]),
+ t!(leftfirst020, &["abcd", "ab"], "abcd", &[(0, 0, 4)]),
+ t!(leftfirst030, &["ab", "ab"], "abcd", &[(0, 0, 2)]),
+ t!(leftfirst040, &["a", "ab"], "xayabbbz", &[(0, 1, 2), (0, 3, 4)]),
+ t!(leftfirst100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[(1, 1, 5)]),
+ t!(leftfirst110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[(1, 1, 6)]),
+ t!(leftfirst300, &["abcd", "b", "bce"], "abce", &[(1, 1, 2)]),
+ t!(
+ leftfirst310,
+ &["abcd", "b", "bce", "ce"],
+ "abce",
+ &[(1, 1, 2), (3, 2, 4),]
+ ),
+ t!(
+ leftfirst320,
+ &["a", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(0, 0, 1), (2, 7, 9),]
+ ),
+ t!(leftfirst330, &["a", "abab"], "abab", &[(0, 0, 1), (0, 2, 3)]),
+];
+
+/// Like LEFTMOST_FIRST, but for anchored searches.
+const ANCHORED_LEFTMOST_FIRST: &'static [SearchTest] = &[
+ t!(aleftfirst000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]),
+ t!(aleftfirst010, &["", "a"], "a", &[(0, 0, 0)]),
+ t!(aleftfirst011, &["", "a", ""], "a", &[(0, 0, 0)]),
+ t!(aleftfirst012, &["a", "", ""], "a", &[(0, 0, 1)]),
+ t!(aleftfirst013, &["", "", "a"], "a", &[(0, 0, 0)]),
+ t!(aleftfirst020, &["abcd", "ab"], "abcd", &[(0, 0, 4)]),
+ t!(aleftfirst030, &["ab", "ab"], "abcd", &[(0, 0, 2)]),
+ t!(aleftfirst040, &["a", "ab"], "xayabbbz", &[]),
+ t!(aleftfirst100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[]),
+ t!(aleftfirst110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[]),
+ t!(aleftfirst300, &["abcd", "b", "bce"], "abce", &[]),
+ t!(aleftfirst310, &["abcd", "b", "bce", "ce"], "abce", &[]),
+ t!(
+ aleftfirst320,
+ &["a", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(0, 0, 1)]
+ ),
+ t!(aleftfirst330, &["a", "abab"], "abab", &[(0, 0, 1)]),
+];
+
+/// Tests for non-overlapping leftmost-longest match semantics. These tests
+/// should generally be specific to leftmost-longest, which means they should
+/// generally fail under leftmost-first semantics.
+const LEFTMOST_LONGEST: &'static [SearchTest] = &[
+ t!(leftlong000, &["ab", "abcd"], "abcd", &[(1, 0, 4)]),
+ t!(leftlong010, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4),]),
+ t!(leftlong020, &["", "a"], "a", &[(1, 0, 1), (0, 1, 1),]),
+ t!(leftlong021, &["", "a", ""], "a", &[(1, 0, 1), (0, 1, 1),]),
+ t!(leftlong022, &["a", "", ""], "a", &[(0, 0, 1), (1, 1, 1),]),
+ t!(leftlong023, &["", "", "a"], "a", &[(2, 0, 1), (0, 1, 1),]),
+ t!(leftlong030, &["", "a"], "aa", &[(1, 0, 1), (1, 1, 2), (0, 2, 2),]),
+ t!(leftlong040, &["a", "ab"], "a", &[(0, 0, 1)]),
+ t!(leftlong050, &["a", "ab"], "ab", &[(1, 0, 2)]),
+ t!(leftlong060, &["ab", "a"], "a", &[(1, 0, 1)]),
+ t!(leftlong070, &["ab", "a"], "ab", &[(0, 0, 2)]),
+ t!(leftlong100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[(2, 1, 6)]),
+ t!(leftlong110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[(1, 1, 6)]),
+ t!(leftlong300, &["abcd", "b", "bce"], "abce", &[(2, 1, 4)]),
+ t!(
+ leftlong310,
+ &["a", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8),]
+ ),
+ t!(leftlong320, &["a", "abab"], "abab", &[(1, 0, 4)]),
+ t!(leftlong330, &["abcd", "b", "ce"], "abce", &[(1, 1, 2), (2, 2, 4),]),
+ t!(leftlong340, &["a", "ab"], "xayabbbz", &[(0, 1, 2), (1, 3, 5)]),
+];
+
+/// Like LEFTMOST_LONGEST, but for anchored searches.
+const ANCHORED_LEFTMOST_LONGEST: &'static [SearchTest] = &[
+ t!(aleftlong000, &["ab", "abcd"], "abcd", &[(1, 0, 4)]),
+ t!(aleftlong010, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4),]),
+ t!(aleftlong020, &["", "a"], "a", &[(1, 0, 1)]),
+ t!(aleftlong021, &["", "a", ""], "a", &[(1, 0, 1)]),
+ t!(aleftlong022, &["a", "", ""], "a", &[(0, 0, 1)]),
+ t!(aleftlong023, &["", "", "a"], "a", &[(2, 0, 1)]),
+ t!(aleftlong030, &["", "a"], "aa", &[(1, 0, 1)]),
+ t!(aleftlong040, &["a", "ab"], "a", &[(0, 0, 1)]),
+ t!(aleftlong050, &["a", "ab"], "ab", &[(1, 0, 2)]),
+ t!(aleftlong060, &["ab", "a"], "a", &[(1, 0, 1)]),
+ t!(aleftlong070, &["ab", "a"], "ab", &[(0, 0, 2)]),
+ t!(aleftlong100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[]),
+ t!(aleftlong110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[]),
+ t!(aleftlong300, &["abcd", "b", "bce"], "abce", &[]),
+ t!(
+ aleftlong310,
+ &["a", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8),]
+ ),
+ t!(aleftlong320, &["a", "abab"], "abab", &[(1, 0, 4)]),
+ t!(aleftlong330, &["abcd", "b", "ce"], "abce", &[]),
+ t!(aleftlong340, &["a", "ab"], "xayabbbz", &[]),
+];
+
+/// Tests for non-overlapping match semantics.
+///
+/// Generally these tests shouldn't pass when using overlapping semantics.
+/// These should pass for both standard and leftmost match semantics.
+const NON_OVERLAPPING: &'static [SearchTest] = &[
+ t!(nover010, &["abcd", "bcd", "cd"], "abcd", &[(0, 0, 4),]),
+ t!(nover020, &["bcd", "cd", "abcd"], "abcd", &[(2, 0, 4),]),
+ t!(nover030, &["abc", "bc"], "zazabcz", &[(0, 3, 6),]),
+ t!(
+ nover100,
+ &["ab", "ba"],
+ "abababa",
+ &[(0, 0, 2), (0, 2, 4), (0, 4, 6),]
+ ),
+ t!(nover200, &["foo", "foo"], "foobarfoo", &[(0, 0, 3), (0, 6, 9),]),
+ t!(nover300, &["", ""], "", &[(0, 0, 0),]),
+ t!(nover310, &["", ""], "a", &[(0, 0, 0), (0, 1, 1),]),
+];
+
+/// Like NON_OVERLAPPING, but for anchored searches.
+const ANCHORED_NON_OVERLAPPING: &'static [SearchTest] = &[
+ t!(anover010, &["abcd", "bcd", "cd"], "abcd", &[(0, 0, 4),]),
+ t!(anover020, &["bcd", "cd", "abcd"], "abcd", &[(2, 0, 4),]),
+ t!(anover030, &["abc", "bc"], "zazabcz", &[]),
+ t!(anover100, &["ab", "ba"], "abababa", &[(0, 0, 2)]),
+ t!(anover200, &["foo", "foo"], "foobarfoo", &[(0, 0, 3)]),
+ t!(anover300, &["", ""], "", &[(0, 0, 0),]),
+ t!(anover310, &["", ""], "a", &[(0, 0, 0)]),
+];
+
+/// Tests for overlapping match semantics.
+///
+/// This only supports standard match semantics, since leftmost-{first,longest}
+/// do not support overlapping matches.
+const OVERLAPPING: &'static [SearchTest] = &[
+ t!(
+ over000,
+ &["abcd", "bcd", "cd", "b"],
+ "abcd",
+ &[(3, 1, 2), (0, 0, 4), (1, 1, 4), (2, 2, 4),]
+ ),
+ t!(
+ over010,
+ &["bcd", "cd", "b", "abcd"],
+ "abcd",
+ &[(2, 1, 2), (3, 0, 4), (0, 1, 4), (1, 2, 4),]
+ ),
+ t!(
+ over020,
+ &["abcd", "bcd", "cd"],
+ "abcd",
+ &[(0, 0, 4), (1, 1, 4), (2, 2, 4),]
+ ),
+ t!(
+ over030,
+ &["bcd", "abcd", "cd"],
+ "abcd",
+ &[(1, 0, 4), (0, 1, 4), (2, 2, 4),]
+ ),
+ t!(
+ over040,
+ &["bcd", "cd", "abcd"],
+ "abcd",
+ &[(2, 0, 4), (0, 1, 4), (1, 2, 4),]
+ ),
+ t!(over050, &["abc", "bc"], "zazabcz", &[(0, 3, 6), (1, 4, 6),]),
+ t!(
+ over100,
+ &["ab", "ba"],
+ "abababa",
+ &[(0, 0, 2), (1, 1, 3), (0, 2, 4), (1, 3, 5), (0, 4, 6), (1, 5, 7),]
+ ),
+ t!(
+ over200,
+ &["foo", "foo"],
+ "foobarfoo",
+ &[(0, 0, 3), (1, 0, 3), (0, 6, 9), (1, 6, 9),]
+ ),
+ t!(over300, &["", ""], "", &[(0, 0, 0), (1, 0, 0),]),
+ t!(
+ over310,
+ &["", ""],
+ "a",
+ &[(0, 0, 0), (1, 0, 0), (0, 1, 1), (1, 1, 1),]
+ ),
+ t!(over320, &["", "a"], "a", &[(0, 0, 0), (1, 0, 1), (0, 1, 1),]),
+ t!(
+ over330,
+ &["", "a", ""],
+ "a",
+ &[(0, 0, 0), (2, 0, 0), (1, 0, 1), (0, 1, 1), (2, 1, 1),]
+ ),
+ t!(
+ over340,
+ &["a", "", ""],
+ "a",
+ &[(1, 0, 0), (2, 0, 0), (0, 0, 1), (1, 1, 1), (2, 1, 1),]
+ ),
+ t!(
+ over350,
+ &["", "", "a"],
+ "a",
+ &[(0, 0, 0), (1, 0, 0), (2, 0, 1), (0, 1, 1), (1, 1, 1),]
+ ),
+ t!(
+ over360,
+ &["foo", "foofoo"],
+ "foofoo",
+ &[(0, 0, 3), (1, 0, 6), (0, 3, 6)]
+ ),
+];
+
+/// Like OVERLAPPING, but for anchored searches.
+const ANCHORED_OVERLAPPING: &'static [SearchTest] = &[
+ t!(aover000, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4)]),
+ t!(aover010, &["bcd", "cd", "b", "abcd"], "abcd", &[(3, 0, 4)]),
+ t!(aover020, &["abcd", "bcd", "cd"], "abcd", &[(0, 0, 4)]),
+ t!(aover030, &["bcd", "abcd", "cd"], "abcd", &[(1, 0, 4)]),
+ t!(aover040, &["bcd", "cd", "abcd"], "abcd", &[(2, 0, 4)]),
+ t!(aover050, &["abc", "bc"], "zazabcz", &[]),
+ t!(aover100, &["ab", "ba"], "abababa", &[(0, 0, 2)]),
+ t!(aover200, &["foo", "foo"], "foobarfoo", &[(0, 0, 3), (1, 0, 3)]),
+ t!(aover300, &["", ""], "", &[(0, 0, 0), (1, 0, 0),]),
+ t!(aover310, &["", ""], "a", &[(0, 0, 0), (1, 0, 0)]),
+ t!(aover320, &["", "a"], "a", &[(0, 0, 0), (1, 0, 1)]),
+ t!(aover330, &["", "a", ""], "a", &[(0, 0, 0), (2, 0, 0), (1, 0, 1)]),
+ t!(aover340, &["a", "", ""], "a", &[(1, 0, 0), (2, 0, 0), (0, 0, 1)]),
+ t!(aover350, &["", "", "a"], "a", &[(0, 0, 0), (1, 0, 0), (2, 0, 1)]),
+ t!(aover360, &["foo", "foofoo"], "foofoo", &[(0, 0, 3), (1, 0, 6)]),
+];
+
+/// Tests for ASCII case insensitivity.
+///
+/// These tests should all have the same behavior regardless of match semantics
+/// or whether the search is overlapping.
+const ASCII_CASE_INSENSITIVE: &'static [SearchTest] = &[
+ t!(acasei000, &["a"], "A", &[(0, 0, 1)]),
+ t!(acasei010, &["Samwise"], "SAMWISE", &[(0, 0, 7)]),
+ t!(acasei011, &["Samwise"], "SAMWISE.abcd", &[(0, 0, 7)]),
+ t!(acasei020, &["fOoBaR"], "quux foobar baz", &[(0, 5, 11)]),
+];
+
+/// Like ASCII_CASE_INSENSITIVE, but specifically for non-overlapping tests.
+const ASCII_CASE_INSENSITIVE_NON_OVERLAPPING: &'static [SearchTest] = &[
+ t!(acasei000, &["foo", "FOO"], "fOo", &[(0, 0, 3)]),
+    t!(acasei001, &["FOO", "foo"], "fOo", &[(0, 0, 3)]),
+];
+
+/// Like ASCII_CASE_INSENSITIVE, but specifically for overlapping tests.
+const ASCII_CASE_INSENSITIVE_OVERLAPPING: &'static [SearchTest] = &[
+ t!(acasei000, &["foo", "FOO"], "fOo", &[(0, 0, 3), (1, 0, 3)]),
+ t!(acasei001, &["FOO", "foo"], "fOo", &[(0, 0, 3), (1, 0, 3)]),
+];
+
+/// Regression tests that are applied to all Aho-Corasick combinations.
+///
+/// If regression tests are needed for specific match semantics, then add them
+/// to the appropriate group above.
+const REGRESSION: &'static [SearchTest] = &[
+ t!(regression010, &["inf", "ind"], "infind", &[(0, 0, 3), (1, 3, 6),]),
+ t!(regression020, &["ind", "inf"], "infind", &[(1, 0, 3), (0, 3, 6),]),
+ t!(
+ regression030,
+ &["libcore/", "libstd/"],
+ "libcore/char/methods.rs",
+ &[(0, 0, 8),]
+ ),
+ t!(
+ regression040,
+ &["libstd/", "libcore/"],
+ "libcore/char/methods.rs",
+ &[(1, 0, 8),]
+ ),
+ t!(
+ regression050,
+ &["\x00\x00\x01", "\x00\x00\x00"],
+ "\x00\x00\x00",
+ &[(1, 0, 3),]
+ ),
+ t!(
+ regression060,
+ &["\x00\x00\x00", "\x00\x00\x01"],
+ "\x00\x00\x00",
+ &[(0, 0, 3),]
+ ),
+];
+
+// Now define a test for each combination of things above that we want to run.
+// Since there are a few different combinations for each collection of tests,
+// we define a couple of macros to avoid repetition drudgery. The testconfig
+// macro constructs the automaton from a given match kind, and runs the search
+// tests one-by-one over the given collection. The `with` parameter allows one
+// to configure the builder with additional parameters. The testcombo macro
+// invokes testconfig in precisely this way: it sets up several tests where
+// each one turns a different knob on AhoCorasickBuilder.
+
+macro_rules! testconfig {
+ (overlapping, $name:ident, $collection:expr, $kind:ident, $with:expr) => {
+ #[test]
+ fn $name() {
+ run_search_tests($collection, |test| {
+ let mut builder = AhoCorasickBuilder::new();
+ $with(&mut builder);
+ builder
+ .match_kind(MatchKind::$kind)
+ .build(test.patterns)
+ .find_overlapping_iter(test.haystack)
+ .collect()
+ });
+ }
+ };
+ (stream, $name:ident, $collection:expr, $kind:ident, $with:expr) => {
+ #[test]
+ fn $name() {
+ run_search_tests($collection, |test| {
+ let buf =
+ io::BufReader::with_capacity(1, test.haystack.as_bytes());
+ let mut builder = AhoCorasickBuilder::new();
+ $with(&mut builder);
+ builder
+ .match_kind(MatchKind::$kind)
+ .build(test.patterns)
+ .stream_find_iter(buf)
+ .map(|result| result.unwrap())
+ .collect()
+ });
+ }
+ };
+ ($name:ident, $collection:expr, $kind:ident, $with:expr) => {
+ #[test]
+ fn $name() {
+ run_search_tests($collection, |test| {
+ let mut builder = AhoCorasickBuilder::new();
+ $with(&mut builder);
+ builder
+ .match_kind(MatchKind::$kind)
+ .build(test.patterns)
+ .find_iter(test.haystack)
+ .collect()
+ });
+ }
+ };
+}
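+// For example, an invocation like
+// `testconfig!(nfa_default, AC_STANDARD_NON_OVERLAPPING, Standard, |_| ())`
+// (as generated by testcombo below) produces a `#[test] fn nfa_default()`
+// that builds a `MatchKind::Standard` automaton for each test's patterns and
+// collects the matches reported by `find_iter` over its haystack.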
+
+macro_rules! testcombo {
+ ($name:ident, $collection:expr, $kind:ident) => {
+ mod $name {
+ use super::*;
+
+ testconfig!(nfa_default, $collection, $kind, |_| ());
+ testconfig!(
+ nfa_no_prefilter,
+ $collection,
+ $kind,
+ |b: &mut AhoCorasickBuilder| {
+ b.prefilter(false);
+ }
+ );
+ testconfig!(
+ nfa_all_sparse,
+ $collection,
+ $kind,
+ |b: &mut AhoCorasickBuilder| {
+ b.dense_depth(0);
+ }
+ );
+ testconfig!(
+ nfa_all_dense,
+ $collection,
+ $kind,
+ |b: &mut AhoCorasickBuilder| {
+ b.dense_depth(usize::MAX);
+ }
+ );
+ testconfig!(
+ dfa_default,
+ $collection,
+ $kind,
+ |b: &mut AhoCorasickBuilder| {
+ b.dfa(true);
+ }
+ );
+ testconfig!(
+ dfa_no_prefilter,
+ $collection,
+ $kind,
+ |b: &mut AhoCorasickBuilder| {
+ b.dfa(true).prefilter(false);
+ }
+ );
+ testconfig!(
+ dfa_all_sparse,
+ $collection,
+ $kind,
+ |b: &mut AhoCorasickBuilder| {
+ b.dfa(true).dense_depth(0);
+ }
+ );
+ testconfig!(
+ dfa_all_dense,
+ $collection,
+ $kind,
+ |b: &mut AhoCorasickBuilder| {
+ b.dfa(true).dense_depth(usize::MAX);
+ }
+ );
+ testconfig!(
+ dfa_no_byte_class,
+ $collection,
+ $kind,
+ |b: &mut AhoCorasickBuilder| {
+ b.dfa(true).byte_classes(false);
+ }
+ );
+ testconfig!(
+ dfa_no_premultiply,
+ $collection,
+ $kind,
+ |b: &mut AhoCorasickBuilder| {
+ b.dfa(true).premultiply(false);
+ }
+ );
+ testconfig!(
+ dfa_no_byte_class_no_premultiply,
+ $collection,
+ $kind,
+ |b: &mut AhoCorasickBuilder| {
+ b.dfa(true).byte_classes(false).premultiply(false);
+ }
+ );
+ }
+ };
+}
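+// For example, `testcombo!(search_leftmost_first, AC_LEFTMOST_FIRST,
+// LeftmostFirst)` below produces a `search_leftmost_first` module with one
+// `#[test]` per builder configuration listed above, from `nfa_default`
+// through `dfa_no_byte_class_no_premultiply`.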
+
+// Write out the combinations.
+testcombo!(search_leftmost_longest, AC_LEFTMOST_LONGEST, LeftmostLongest);
+testcombo!(search_leftmost_first, AC_LEFTMOST_FIRST, LeftmostFirst);
+testcombo!(
+ search_standard_nonoverlapping,
+ AC_STANDARD_NON_OVERLAPPING,
+ Standard
+);
+
+// Write out the overlapping combo by hand since there is only one of them.
+testconfig!(
+ overlapping,
+ search_standard_overlapping_nfa_default,
+ AC_STANDARD_OVERLAPPING,
+ Standard,
+ |_| ()
+);
+testconfig!(
+ overlapping,
+ search_standard_overlapping_nfa_all_sparse,
+ AC_STANDARD_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.dense_depth(0);
+ }
+);
+testconfig!(
+ overlapping,
+ search_standard_overlapping_nfa_all_dense,
+ AC_STANDARD_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.dense_depth(usize::MAX);
+ }
+);
+testconfig!(
+ overlapping,
+ search_standard_overlapping_dfa_default,
+ AC_STANDARD_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.dfa(true);
+ }
+);
+testconfig!(
+ overlapping,
+ search_standard_overlapping_dfa_all_sparse,
+ AC_STANDARD_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.dfa(true).dense_depth(0);
+ }
+);
+testconfig!(
+ overlapping,
+ search_standard_overlapping_dfa_all_dense,
+ AC_STANDARD_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.dfa(true).dense_depth(usize::MAX);
+ }
+);
+testconfig!(
+ overlapping,
+ search_standard_overlapping_dfa_no_byte_class,
+ AC_STANDARD_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.dfa(true).byte_classes(false);
+ }
+);
+testconfig!(
+ overlapping,
+ search_standard_overlapping_dfa_no_premultiply,
+ AC_STANDARD_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.dfa(true).premultiply(false);
+ }
+);
+testconfig!(
+ overlapping,
+ search_standard_overlapping_dfa_no_byte_class_no_premultiply,
+ AC_STANDARD_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.dfa(true).byte_classes(false).premultiply(false);
+ }
+);
+
+// Also write out tests manually for streams, since we only test the standard
+// match semantics. We also don't bother testing different automaton
+// configurations, since those are well covered by tests above.
+testconfig!(
+ stream,
+ search_standard_stream_nfa_default,
+ AC_STANDARD_NON_OVERLAPPING,
+ Standard,
+ |_| ()
+);
+testconfig!(
+ stream,
+ search_standard_stream_dfa_default,
+ AC_STANDARD_NON_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.dfa(true);
+ }
+);
+
+// Same thing for anchored searches. Write them out manually.
+testconfig!(
+ search_standard_anchored_nfa_default,
+ AC_STANDARD_ANCHORED_NON_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.anchored(true);
+ }
+);
+testconfig!(
+ search_standard_anchored_dfa_default,
+ AC_STANDARD_ANCHORED_NON_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.anchored(true).dfa(true);
+ }
+);
+testconfig!(
+ overlapping,
+ search_standard_anchored_overlapping_nfa_default,
+ AC_STANDARD_ANCHORED_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.anchored(true);
+ }
+);
+testconfig!(
+ overlapping,
+ search_standard_anchored_overlapping_dfa_default,
+ AC_STANDARD_ANCHORED_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.anchored(true).dfa(true);
+ }
+);
+testconfig!(
+ search_leftmost_first_anchored_nfa_default,
+ AC_LEFTMOST_FIRST_ANCHORED,
+ LeftmostFirst,
+ |b: &mut AhoCorasickBuilder| {
+ b.anchored(true);
+ }
+);
+testconfig!(
+ search_leftmost_first_anchored_dfa_default,
+ AC_LEFTMOST_FIRST_ANCHORED,
+ LeftmostFirst,
+ |b: &mut AhoCorasickBuilder| {
+ b.anchored(true).dfa(true);
+ }
+);
+testconfig!(
+ search_leftmost_longest_anchored_nfa_default,
+ AC_LEFTMOST_LONGEST_ANCHORED,
+ LeftmostLongest,
+ |b: &mut AhoCorasickBuilder| {
+ b.anchored(true);
+ }
+);
+testconfig!(
+ search_leftmost_longest_anchored_dfa_default,
+ AC_LEFTMOST_LONGEST_ANCHORED,
+ LeftmostLongest,
+ |b: &mut AhoCorasickBuilder| {
+ b.anchored(true).dfa(true);
+ }
+);
+
+// And also write out the test combinations for ASCII case insensitivity.
+testconfig!(
+ acasei_standard_nfa_default,
+ &[ASCII_CASE_INSENSITIVE],
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.prefilter(false).ascii_case_insensitive(true);
+ }
+);
+testconfig!(
+ acasei_standard_dfa_default,
+ &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING],
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.ascii_case_insensitive(true).dfa(true);
+ }
+);
+testconfig!(
+ overlapping,
+ acasei_standard_overlapping_nfa_default,
+ &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_OVERLAPPING],
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.ascii_case_insensitive(true);
+ }
+);
+testconfig!(
+ overlapping,
+ acasei_standard_overlapping_dfa_default,
+ &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_OVERLAPPING],
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.ascii_case_insensitive(true).dfa(true);
+ }
+);
+testconfig!(
+ acasei_leftmost_first_nfa_default,
+ &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING],
+ LeftmostFirst,
+ |b: &mut AhoCorasickBuilder| {
+ b.ascii_case_insensitive(true);
+ }
+);
+testconfig!(
+ acasei_leftmost_first_dfa_default,
+ &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING],
+ LeftmostFirst,
+ |b: &mut AhoCorasickBuilder| {
+ b.ascii_case_insensitive(true).dfa(true);
+ }
+);
+testconfig!(
+ acasei_leftmost_longest_nfa_default,
+ &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING],
+ LeftmostLongest,
+ |b: &mut AhoCorasickBuilder| {
+ b.ascii_case_insensitive(true);
+ }
+);
+testconfig!(
+ acasei_leftmost_longest_dfa_default,
+ &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING],
+ LeftmostLongest,
+ |b: &mut AhoCorasickBuilder| {
+ b.ascii_case_insensitive(true).dfa(true);
+ }
+);
+
+#[test]
+fn search_tests_have_unique_names() {
+ let assert = |constname, tests: &[SearchTest]| {
+ let mut seen = HashMap::new(); // map from test name to position
+ for (i, test) in tests.iter().enumerate() {
+ if !seen.contains_key(test.name) {
+ seen.insert(test.name, i);
+ } else {
+ let last = seen[test.name];
+ panic!(
+ "{} tests have duplicate names at positions {} and {}",
+ constname, last, i
+ );
+ }
+ }
+ };
+ assert("BASICS", BASICS);
+ assert("STANDARD", STANDARD);
+ assert("LEFTMOST", LEFTMOST);
+ assert("LEFTMOST_FIRST", LEFTMOST_FIRST);
+ assert("LEFTMOST_LONGEST", LEFTMOST_LONGEST);
+ assert("NON_OVERLAPPING", NON_OVERLAPPING);
+ assert("OVERLAPPING", OVERLAPPING);
+ assert("REGRESSION", REGRESSION);
+}
+
+#[test]
+#[should_panic]
+fn stream_not_allowed_leftmost_first() {
+ let fsm = AhoCorasickBuilder::new()
+ .match_kind(MatchKind::LeftmostFirst)
+ .build(None::<String>);
+ assert_eq!(fsm.stream_find_iter(&b""[..]).count(), 0);
+}
+
+#[test]
+#[should_panic]
+fn stream_not_allowed_leftmost_longest() {
+ let fsm = AhoCorasickBuilder::new()
+ .match_kind(MatchKind::LeftmostLongest)
+ .build(None::<String>);
+ assert_eq!(fsm.stream_find_iter(&b""[..]).count(), 0);
+}
+
+#[test]
+#[should_panic]
+fn overlapping_not_allowed_leftmost_first() {
+ let fsm = AhoCorasickBuilder::new()
+ .match_kind(MatchKind::LeftmostFirst)
+ .build(None::<String>);
+ assert_eq!(fsm.find_overlapping_iter("").count(), 0);
+}
+
+#[test]
+#[should_panic]
+fn overlapping_not_allowed_leftmost_longest() {
+ let fsm = AhoCorasickBuilder::new()
+ .match_kind(MatchKind::LeftmostLongest)
+ .build(None::<String>);
+ assert_eq!(fsm.find_overlapping_iter("").count(), 0);
+}
+
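+// The loops below generate 25 * 25 * 25 = 15_625 three-byte patterns; the
+// resulting automaton needs far more than `u8::MAX` states, so building with
+// `u8` state identifiers must fail.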
+#[test]
+fn state_id_too_small() {
+ let mut patterns = vec![];
+ for c1 in (b'a'..b'z').map(|b| b as char) {
+ for c2 in (b'a'..b'z').map(|b| b as char) {
+ for c3 in (b'a'..b'z').map(|b| b as char) {
+ patterns.push(format!("{}{}{}", c1, c2, c3));
+ }
+ }
+ }
+ let result =
+ AhoCorasickBuilder::new().build_with_size::<u8, _, _>(&patterns);
+ assert!(result.is_err());
+}
+
+// See: https://github.com/BurntSushi/aho-corasick/issues/44
+//
+// In short, this test ensures that enabling ASCII case insensitivity does not
+// visit an exponential number of states when filling in failure transitions.
+#[test]
+fn regression_ascii_case_insensitive_no_exponential() {
+ let ac = AhoCorasickBuilder::new()
+ .ascii_case_insensitive(true)
+ .build(&["Tsubaki House-Triple Shot Vol01校花三姐妹"]);
+ assert!(ac.find("").is_none());
+}
+
+// See: https://github.com/BurntSushi/aho-corasick/issues/53
+//
+// This test ensures that the rare byte prefilter works in a particular corner
+// case. In particular, the shift offset detected for '/' in the patterns below
+// was incorrect, leading to a false negative.
+#[test]
+fn regression_rare_byte_prefilter() {
+ use AhoCorasick;
+
+ let ac = AhoCorasick::new_auto_configured(&["ab/j/", "x/"]);
+ assert!(ac.is_match("ab/j/"));
+}
+
+#[test]
+fn regression_case_insensitive_prefilter() {
+ use AhoCorasickBuilder;
+
+ for c in b'a'..b'z' {
+ for c2 in b'a'..b'z' {
+ let c = c as char;
+ let c2 = c2 as char;
+ let needle = format!("{}{}", c, c2).to_lowercase();
+ let haystack = needle.to_uppercase();
+ let ac = AhoCorasickBuilder::new()
+ .ascii_case_insensitive(true)
+ .prefilter(true)
+ .build(&[&needle]);
+ assert_eq!(
+ 1,
+ ac.find_iter(&haystack).count(),
+ "failed to find {:?} in {:?}\n\nautomaton:\n{:?}",
+ needle,
+ haystack,
+ ac,
+ );
+ }
+ }
+}
+
+fn run_search_tests<F: FnMut(&SearchTest) -> Vec<Match>>(
+ which: TestCollection,
+ mut f: F,
+) {
+ let get_match_triples =
+ |matches: Vec<Match>| -> Vec<(usize, usize, usize)> {
+ matches
+ .into_iter()
+ .map(|m| (m.pattern(), m.start(), m.end()))
+ .collect()
+ };
+ for &tests in which {
+ for test in tests {
+ assert_eq!(
+ test.matches,
+ get_match_triples(f(&test)).as_slice(),
+ "test: {}, patterns: {:?}, haystack: {:?}",
+ test.name,
+ test.patterns,
+ test.haystack
+ );
+ }
+ }
+}