author     Chih-Hung Hsieh <chh@google.com>  2020-04-17 01:35:30 +0000
committer  Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>  2020-04-17 01:35:30 +0000
commit     7b456f1d9de795231795b2541692205e00e93133 (patch)
tree       7820a857da0555fc089addcce3710b33363592ea
parent     15f90c2d64d844022e06bbb33cafdb56fae4989c (diff)
parent     70d61f6af2e52b51edfff374d4f8dfd4940018c9 (diff)
download   aho-corasick-7b456f1d9de795231795b2541692205e00e93133.tar.gz
Import 'aho-corasick' package version 0.7.10 am: 0a0edd505c am: 92b76d957f am: 70d61f6af2
Change-Id: I3485dbbb61c0610a1cd6ee32c9557f04c5b66c5d
-rw-r--r--  .cargo_vcs_info.json        |    5
-rw-r--r--  .github/workflows/ci.yml    |  111
-rw-r--r--  .gitignore                  |   12
-rw-r--r--  COPYING                     |    3
-rw-r--r--  Cargo.toml                  |   42
-rw-r--r--  Cargo.toml.orig             |   44
-rw-r--r--  DESIGN.md                   |  483
l---------  LICENSE                     |    1
-rw-r--r--  LICENSE-MIT                 |   21
-rw-r--r--  METADATA                    |   19
-rw-r--r--  MODULE_LICENSE_MIT          |    0
-rw-r--r--  OWNERS                      |    1
-rw-r--r--  README.md                   |  186
-rw-r--r--  UNLICENSE                   |   24
-rw-r--r--  rustfmt.toml                |    2
-rw-r--r--  src/ahocorasick.rs          | 2087
-rw-r--r--  src/automaton.rs            |  573
-rw-r--r--  src/buffer.rs               |  130
-rw-r--r--  src/byte_frequencies.rs     |  258
-rw-r--r--  src/classes.rs              |  238
-rw-r--r--  src/dfa.rs                  |  709
-rw-r--r--  src/error.rs                |  101
-rw-r--r--  src/lib.rs                  |  297
-rw-r--r--  src/nfa.rs                  | 1363
-rw-r--r--  src/packed/api.rs           |  632
-rw-r--r--  src/packed/mod.rs           |  117
-rw-r--r--  src/packed/pattern.rs       |  318
-rw-r--r--  src/packed/rabinkarp.rs     |  185
-rw-r--r--  src/packed/teddy/README.md  |  386
-rw-r--r--  src/packed/teddy/compile.rs |  414
-rw-r--r--  src/packed/teddy/mod.rs     |   62
-rw-r--r--  src/packed/teddy/runtime.rs | 1204
-rw-r--r--  src/packed/tests.rs         |  568
-rw-r--r--  src/packed/vector.rs        |  181
-rw-r--r--  src/prefilter.rs            |  997
-rw-r--r--  src/state_id.rs             |  192
-rw-r--r--  src/tests.rs                | 1152
37 files changed, 13118 insertions(+), 0 deletions(-)
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json
new file mode 100644
index 0000000..06dfa3d
--- /dev/null
+++ b/.cargo_vcs_info.json
@@ -0,0 +1,5 @@
+{
+ "git": {
+ "sha1": "36de9d383aeaf925c7425ed53eee91e61cb9b61c"
+ }
+}
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..06dcdd5
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,111 @@
+name: ci
+on:
+ pull_request:
+ push:
+ branches:
+ - master
+ schedule:
+ - cron: '00 01 * * *'
+jobs:
+ test:
+ name: test
+ env:
+ # For some builds, we use cross to test on 32-bit and big-endian
+ # systems.
+ CARGO: cargo
+ # When CARGO is set to CROSS, TARGET is set to `--target matrix.target`.
+ TARGET:
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ build:
+ - pinned
+ - stable
+ - stable-32
+ - stable-mips
+ - beta
+ - nightly
+ - macos
+ - win-msvc
+ - win-gnu
+ include:
+ - build: pinned
+ os: ubuntu-18.04
+ rust: 1.28.0
+ - build: stable
+ os: ubuntu-18.04
+ rust: stable
+ - build: stable-32
+ os: ubuntu-18.04
+ rust: stable
+ target: i686-unknown-linux-gnu
+ - build: stable-mips
+ os: ubuntu-18.04
+ rust: stable
+ target: mips64-unknown-linux-gnuabi64
+ - build: beta
+ os: ubuntu-18.04
+ rust: beta
+ - build: nightly
+ os: ubuntu-18.04
+ rust: nightly
+ - build: macos
+ os: macos-latest
+ rust: stable
+ - build: win-msvc
+ os: windows-2019
+ rust: stable
+ - build: win-gnu
+ os: windows-2019
+ rust: stable-x86_64-gnu
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v1
+ with:
+ fetch-depth: 1
+ - name: Install Rust
+ uses: actions-rs/toolchain@v1
+ with:
+ toolchain: ${{ matrix.rust }}
+ profile: minimal
+ override: true
+ - name: Use Cross
+ if: matrix.target != ''
+ run: |
+ # FIXME: to work around bugs in latest cross release, install master.
+ # See: https://github.com/rust-embedded/cross/issues/357
+ cargo install --git https://github.com/rust-embedded/cross
+ echo "::set-env name=CARGO::cross"
+ echo "::set-env name=TARGET::--target ${{ matrix.target }}"
+ - name: Show command used for Cargo
+ run: |
+ echo "cargo command is: ${{ env.CARGO }}"
+ echo "target flag is: ${{ env.TARGET }}"
+ - name: Show CPU info for debugging
+ if: matrix.os == 'ubuntu-18.04'
+ run: lscpu
+ - run: ${{ env.CARGO }} build --verbose
+ - run: ${{ env.CARGO }} doc --verbose
+ - run: ${{ env.CARGO }} test --verbose
+ - if: matrix.build == 'nightly'
+ run: ${{ env.CARGO }} build --manifest-path aho-corasick-debug/Cargo.toml
+ - if: matrix.build == 'nightly'
+ run: ${{ env.CARGO }} bench --verbose --manifest-path bench/Cargo.toml -- --test
+
+ rustfmt:
+ name: rustfmt
+ runs-on: ubuntu-18.04
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v1
+ with:
+ fetch-depth: 1
+ - name: Install Rust
+ uses: actions-rs/toolchain@v1
+ with:
+ toolchain: stable
+ profile: minimal
+ components: rustfmt
+ - name: Check formatting
+ run: |
+ cargo fmt --all -- --check
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f1a4d65
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,12 @@
+.*.swp
+doc
+tags
+examples/ss10pusa.csv
+build
+target
+/Cargo.lock
+scratch*
+bench_large/huge
+BREADCRUMBS
+/tmp
+/aho-corasick-debug/Cargo.lock
diff --git a/COPYING b/COPYING
new file mode 100644
index 0000000..bb9c20a
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,3 @@
+This project is dual-licensed under the Unlicense and MIT licenses.
+
+You may use this code under the terms of either license.
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..b240ec3
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,42 @@
+# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
+#
+# When uploading crates to the registry Cargo will automatically
+# "normalize" Cargo.toml files for maximal compatibility
+# with all versions of Cargo and also rewrite `path` dependencies
+# to registry (e.g., crates.io) dependencies
+#
+# If you believe there's an error in this file please file an
+# issue against the rust-lang/cargo repository. If you're
+# editing this file be aware that the upstream Cargo.toml
+# will likely look very different (and much more reasonable)
+
+[package]
+name = "aho-corasick"
+version = "0.7.10"
+authors = ["Andrew Gallant <jamslam@gmail.com>"]
+exclude = ["/aho-corasick-debug", "/ci/*", "/.travis.yml", "/appveyor.yml"]
+autotests = false
+description = "Fast multiple substring searching."
+homepage = "https://github.com/BurntSushi/aho-corasick"
+readme = "README.md"
+keywords = ["string", "search", "text", "aho", "multi"]
+categories = ["text-processing"]
+license = "Unlicense/MIT"
+repository = "https://github.com/BurntSushi/aho-corasick"
+[profile.bench]
+debug = true
+
+[profile.release]
+debug = true
+
+[lib]
+name = "aho_corasick"
+[dependencies.memchr]
+version = "2.2.0"
+default-features = false
+[dev-dependencies.doc-comment]
+version = "0.3.1"
+
+[features]
+default = ["std"]
+std = ["memchr/use_std"]
diff --git a/Cargo.toml.orig b/Cargo.toml.orig
new file mode 100644
index 0000000..3166f9b
--- /dev/null
+++ b/Cargo.toml.orig
@@ -0,0 +1,44 @@
+[package]
+name = "aho-corasick"
+version = "0.7.10" #:version
+authors = ["Andrew Gallant <jamslam@gmail.com>"]
+description = "Fast multiple substring searching."
+homepage = "https://github.com/BurntSushi/aho-corasick"
+repository = "https://github.com/BurntSushi/aho-corasick"
+readme = "README.md"
+keywords = ["string", "search", "text", "aho", "multi"]
+license = "Unlicense/MIT"
+categories = ["text-processing"]
+autotests = false
+exclude = [
+ "/aho-corasick-debug", "/ci/*", "/.travis.yml", "/appveyor.yml",
+]
+
+[workspace]
+members = ["bench"]
+# We'd ideally not do this, but since the debug tool uses Rust 2018, older
+# versions of Rust (such as 1.28) fail to parse the manifest because it treats
+# `edition = "2018"` as an unstable feature.
+#
+# When we move our MSRV to Rust 2018, then we should be able to add this back
+# to the workspace.
+exclude = ["aho-corasick-debug"]
+
+[lib]
+name = "aho_corasick"
+
+[features]
+default = ["std"]
+std = ["memchr/use_std"]
+
+[dependencies]
+memchr = { version = "2.2.0", default-features = false }
+
+[dev-dependencies]
+doc-comment = "0.3.1"
+
+[profile.release]
+debug = true
+
+[profile.bench]
+debug = true
diff --git a/DESIGN.md b/DESIGN.md
new file mode 100644
index 0000000..367e203
--- /dev/null
+++ b/DESIGN.md
@@ -0,0 +1,483 @@
+This document describes the internal design of this crate, which is an object
+lesson in what happens when you take a fairly simple old algorithm like
+Aho-Corasick and make it fast and production ready.
+
+The target audience of this document is Rust programmers who have some
+familiarity with string searching; however, one does not need to know the
+Aho-Corasick algorithm in order to read this (it is explained below). One
+should, however, know what a trie is. (If you don't, go read its Wikipedia
+article.)
+
+The centerpiece of this crate is an implementation of Aho-Corasick. On its
+own, Aho-Corasick isn't that complicated. The complex pieces come from the
+different variants of Aho-Corasick implemented in this crate. Specifically,
+they are:
+
+* Aho-Corasick as an NFA, using dense transitions near the root with sparse
+ transitions elsewhere.
+* Aho-Corasick as a DFA. (An NFA is slower to search, but cheaper to construct
+ and uses less memory.)
+ * A DFA with pre-multiplied state identifiers. This saves a multiplication
+ instruction in the core search loop.
+ * A DFA with equivalence classes of bytes as the alphabet, instead of the
+ traditional 256-byte alphabet. This shrinks the size of the DFA in memory,
+ but adds an extra lookup in the core search loop to map the input byte to
+  an equivalence class.
+* The option to choose how state identifiers are represented, via one of
+ u8, u16, u32, u64 or usize. This permits creating compact automatons when
+ matching a small number of patterns.
+* Supporting "standard" match semantics, along with its overlapping variant,
+ in addition to leftmost-first and leftmost-longest semantics. The "standard"
+ semantics are typically what you see in a textbook description of
+ Aho-Corasick. However, Aho-Corasick is also useful as an optimization in
+ regex engines, which often use leftmost-first or leftmost-longest semantics.
+ Thus, it is useful to implement those semantics here. The "standard" and
+ "leftmost" search algorithms are subtly different, and also require slightly
+ different construction algorithms.
+* Support for ASCII case insensitive matching.
+* Support for accelerating searches when the patterns all start with a small
+ number of fixed bytes. Or alternatively, when the patterns all contain a
+ small number of rare bytes. (Searching for these bytes uses SIMD vectorized
+ code courtesy of `memchr`.)
+* Transparent support for alternative SIMD vectorized search routines for
+  a smaller number of literals, such as the Teddy algorithm. We call these
+ "packed" search routines because they use SIMD. They can often be an order of
+ magnitude faster than just Aho-Corasick, but don't scale as well.
+* Support for searching streams. This can reuse most of the underlying code,
+ but does require careful buffering support.
+* Support for anchored searches, which permit efficient `is_prefix` checks for
+ a large number of patterns.
+
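+Most of these variants and features are surfaced through the crate's
+`AhoCorasickBuilder`. As a rough (and deliberately non-exhaustive) sketch of
+what that looks like from the caller's perspective:
+
+    use aho_corasick::{AhoCorasickBuilder, MatchKind};
+
+    let ac = AhoCorasickBuilder::new()
+        .match_kind(MatchKind::LeftmostFirst) // leftmost-first semantics
+        .ascii_case_insensitive(true) // ASCII case insensitivity
+        .dfa(true) // build a DFA instead of an NFA
+        .build(&["Sherlock", "Moriarty", "Watson"]);
+    assert!(ac.is_match("dr. watson, i presume"));
+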
+When you combine all of this together along with trying to make everything as
+fast as possible, what you end up with is entirely too much code with too much
+`unsafe`. Alas, I was not smart enough to figure out how to reduce it. Instead,
+we will explain it.
+
+
+# Basics
+
+The fundamental problem this crate is trying to solve is to determine the
+occurrences of possibly many patterns in a haystack. The naive way to solve
+this is to look for a match for each pattern at each position in the haystack:
+
+ for i in 0..haystack.len():
+ for p in patterns.iter():
+ if haystack[i..].starts_with(p.bytes()):
+ return Match(p.id(), i, i + p.bytes().len())
+
+Those four lines are effectively all this crate does. The problem with those
+four lines is that they are very slow, especially when you're searching for a
+large number of patterns.
+
+While there are many different algorithms available to solve this, a popular
+one is Aho-Corasick. It's a common solution because it's not too hard to
+implement, scales quite well even when searching for thousands of patterns and
+is generally pretty fast. Aho-Corasick does well here because, regardless of
+the number of patterns you're searching for, it always visits each byte in the
+haystack exactly once. This means, generally speaking, adding more patterns to
+an Aho-Corasick automaton does not make it slower. (Strictly speaking, however,
+this is not true, since a larger automaton will make less effective use of the
+CPU's cache.)
+
+Aho-Corasick can be succinctly described as a trie with state transitions
+between some of the nodes that efficiently instruct the search algorithm to
+try matching alternative keys in the automaton. The trick is that these state
+transitions are arranged such that each byte of input needs to be inspected
+only once. These state transitions are typically called "failure transitions,"
+because they instruct the searcher (the thing traversing the automaton while
+reading from the haystack) what to do when a byte in the haystack does not
+correspond to a valid transition in the current state of the trie.
+
+More formally, a failure transition points to a state in the automaton that may
+lead to a match whose prefix is a proper suffix of the path traversed through
+the trie so far. (If no such proper suffix exists, then the failure transition
+points back to the start state of the trie, effectively restarting the search.)
+This is perhaps simpler to explain pictorially. For example, let's say we built
+an Aho-Corasick automaton with the following patterns: 'abcd' and 'cef'. The
+trie looks like this:
+
+ a - S1 - b - S2 - c - S3 - d - S4*
+ /
+ S0 - c - S5 - e - S6 - f - S7*
+
+where states marked with a `*` are match states (meaning, the search algorithm
+should stop and report a match to the caller).
+
+So given this trie, it should be somewhat straightforward to see how it can
+be used to determine whether any particular haystack *starts* with either
+`abcd` or `cef`. It's easy to express this in code:
+
+ fn has_prefix(trie: &Trie, haystack: &[u8]) -> bool {
+ let mut state_id = trie.start();
+ // If the empty pattern is in trie, then state_id is a match state.
+ if trie.is_match(state_id) {
+ return true;
+ }
+ for (i, &b) in haystack.iter().enumerate() {
+ state_id = match trie.next_state(state_id, b) {
+ Some(id) => id,
+ // If there was no transition for this state and byte, then we know
+ // the haystack does not start with one of the patterns in our trie.
+ None => return false,
+ };
+ if trie.is_match(state_id) {
+ return true;
+ }
+ }
+ false
+ }
+
+And that's pretty much it. All we do is move through the trie starting with the
+bytes at the beginning of the haystack. If we find ourselves in a position
+where we can't move, or if we've looked through the entire haystack without
+seeing a match state, then we know the haystack does not start with any of the
+patterns in the trie.
+
+The meat of the Aho-Corasick algorithm is in how we add failure transitions to
+our trie to keep searching efficient. Specifically, it permits us to not only
+check whether a haystack *starts* with any one of a number of patterns, but
+rather, whether the haystack contains any of a number of patterns *anywhere* in
+the haystack.
+
+As mentioned before, failure transitions connect a proper suffix of the path
+traversed through the trie before, with a path that leads to a match that has a
+prefix corresponding to that proper suffix. So in our case, for patterns `abcd`
+and `cef`, with a haystack `abcef`, we want to transition to state `S5` (from
+the diagram above) from `S3` upon seeing that the byte following `c` is not
+`d`. Namely, the proper suffix in this example is `c`, which is a prefix of
+`cef`. So the modified diagram looks like this:
+
+
+ a - S1 - b - S2 - c - S3 - d - S4*
+ / /
+ / ----------------
+ / /
+ S0 - c - S5 - e - S6 - f - S7*
+
+One thing that isn't shown in this diagram is that *all* states have a failure
+transition, but only `S3` has a *non-trivial* failure transition. That is, all
+other states have a failure transition back to the start state. So if our
+haystack was `abzabcd`, then the searcher would transition back to `S0` after
+seeing `z`, which effectively restarts the search. (Because there is no pattern
+in our trie that has a prefix of `bz` or `z`.)
+
+The code for traversing this *automaton* or *finite state machine* (it is no
+longer just a trie) is not that much different from the `has_prefix` code
+above:
+
+ fn contains(fsm: &FiniteStateMachine, haystack: &[u8]) -> bool {
+ let mut state_id = fsm.start();
+ // If the empty pattern is in fsm, then state_id is a match state.
+ if fsm.is_match(state_id) {
+ return true;
+ }
+ for (i, &b) in haystack.iter().enumerate() {
+ // While the diagram above doesn't show this, we may wind up needing
+ // to follow multiple failure transitions before we land on a state
+ // in which we can advance. Therefore, when searching for the next
+ // state, we need to loop until we don't see a failure transition.
+ //
+ // This loop terminates because the start state has no empty
+ // transitions. Every transition from the start state either points to
+ // another state, or loops back to the start state.
+ loop {
+ match fsm.next_state(state_id, b) {
+ Some(id) => {
+ state_id = id;
+ break;
+ }
+ // Unlike our code above, if there was no transition for this
+ // state, then we don't quit. Instead, we look for this state's
+ // failure transition and follow that instead.
+ None => {
+ state_id = fsm.next_fail_state(state_id);
+ }
+ };
+ }
+ if fsm.is_match(state_id) {
+ return true;
+ }
+ }
+ false
+ }
+
+Other than the complication around traversing failure transitions, this code
+is still roughly "traverse the automaton with bytes from the haystack, and quit
+when a match is seen."
+
+And that concludes our section on the basics. While we didn't go deep into
+how the automaton is built (see `src/nfa.rs`, which has detailed comments about
+that), the basic structure of Aho-Corasick should be reasonably clear.
+
+
+# NFAs and DFAs
+
+There are generally two types of finite automata: non-deterministic finite
+automata (NFA) and deterministic finite automata (DFA). The difference between
+them is, principally, that an NFA can be in multiple states at once. This is
+typically accomplished by things called _epsilon_ transitions, where one could
+move to a new state without consuming any bytes from the input. (The other
+mechanism by which NFAs can be in more than one state is where the same byte in
+a particular state transitions to multiple distinct states.) In contrast, a DFA
+can only ever be in one state at a time. A DFA has no epsilon transitions, and
+for any given state, a byte transitions to at most one other state.
+
+By this formulation, the Aho-Corasick automaton described in the previous
+section is an NFA. This is because failure transitions are, effectively,
+epsilon transitions. That is, whenever the automaton is in state `S`, it is
+actually in the set of states that are reachable by recursively following
+failure transitions from `S`. (This means that, for example, the start state
+is always active since the start state is reachable via failure transitions
+from any state in the automaton.)
+
+NFAs have a lot of nice properties. They tend to be easier to construct, and
+also tend to use less memory. However, their primary downside is that they are
+typically slower to execute. For example, the code above showing how to search
+with an Aho-Corasick automaton needs to potentially iterate through many
+failure transitions for every byte of input. While this is a fairly small
+amount of overhead, this can add up, especially if the automaton has a lot of
+overlapping patterns with a lot of failure transitions.
+
+A DFA's search code, by contrast, looks like this:
+
+ fn contains(dfa: &DFA, haystack: &[u8]) -> bool {
+ let mut state_id = dfa.start();
+ // If the empty pattern is in dfa, then state_id is a match state.
+ if dfa.is_match(state_id) {
+ return true;
+ }
+ for (i, &b) in haystack.iter().enumerate() {
+ // An Aho-Corasick DFA *never* has a missing state that requires
+ // failure transitions to be followed. One byte of input advances the
+ // automaton by one state. Always.
+        state_id = dfa.next_state(state_id, b);
+        if dfa.is_match(state_id) {
+ return true;
+ }
+ }
+ false
+ }
+
+The search logic here is much simpler than for the NFA, and this tends to
+translate into significant performance benefits as well, since there's a lot
+less work being done for each byte in the haystack. How is this accomplished?
+It's done by pre-following all failure transitions for all states for all bytes
+in the alphabet, and then building a single state transition table. Building
+this DFA can be much more costly than building the NFA, and use much more
+memory, but the better performance can be worth it.
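+
+In pseudo-code, that pre-following step looks roughly like this (the names
+here are illustrative, not the crate's actual internal API):
+
+    for state in all_states:
+        for byte in 0..256:
+            # Walk failure transitions from `state` until a state with a
+            # transition on `byte` is found. The start state always has such
+            # a transition, so this walk terminates.
+            dfa.transitions[state][byte] = follow_failures(nfa, state, byte)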
+
+Users of this crate can actually choose between using an NFA or a DFA. By
+default, an NFA is used, because it typically strikes the best balance between
+space usage and search performance. But the DFA option is available for cases
+where a little extra memory and upfront time building the automaton is okay.
+For example, the `AhoCorasick::auto_configure` and
+`AhoCorasickBuilder::auto_configure` methods will enable the DFA setting if
+there are a small number of patterns.
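+
+For example, opting into the DFA explicitly looks like this (a minimal sketch
+using the public builder API):
+
+    use aho_corasick::AhoCorasickBuilder;
+
+    let ac = AhoCorasickBuilder::new()
+        .dfa(true) // trade more memory and build time for faster searches
+        .build(&["foo", "bar", "baz"]);
+    assert!(ac.is_match("xxx bar xxx"));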
+
+
+# More DFA tricks
+
+As described in the previous section, one of the downsides of using a DFA
+is that it uses more memory and can take longer to build. One small way of
+mitigating these concerns is to map the alphabet used by the automaton into
+a smaller space. Typically, the alphabet of a DFA has 256 elements in it:
+one element for each possible value that fits into a byte. However, in many
+cases, one does not need the full alphabet. For example, if all patterns in an
+Aho-Corasick automaton are ASCII letters, then this only uses up 52 distinct
+bytes. As far as the automaton is concerned, the remaining 204 bytes are
+indistinguishable from one another: they will never discriminate between a
+match and a non-match. Therefore, in cases like that, the alphabet can be
+shrunk
+to just 53 elements. One for each ASCII letter, and then another to serve as a
+placeholder for every other unused byte.
+
+In practice, this library doesn't quite compute the optimal set of equivalence
+classes, but it's close enough in most cases. The key idea is that this then
+allows the transition table for the DFA to be potentially much smaller. The
+downside of doing this, however, is that since the transition table is defined
+in terms of this smaller alphabet space, every byte in the haystack must be
+re-mapped to this smaller space. This requires an additional 256-byte table.
+In practice, this can lead to a small search time hit, but it can be difficult
+to measure. Moreover, it can sometimes lead to faster search times for bigger
+automata, since it could be the difference between more parts of the automaton
+staying in the CPU cache or not.
+
+One other trick for DFAs employed by this crate is the notion of premultiplying
+state identifiers. Specifically, the normal way to compute the next transition
+in a DFA is via the following (assuming that the transition table is laid out
+sequentially in memory, in row-major order, where the rows are states):
+
+ next_state_id = dfa.transitions[current_state_id * 256 + current_byte]
+
+However, since the value `256` is a fixed constant, we can actually premultiply
+the state identifiers in the table when we build the table initially. Then, the
+next transition computation simply becomes:
+
+ next_state_id = dfa.transitions[current_state_id + current_byte]
+
+This doesn't seem like much, but when this is being executed for every byte of
+input that you're searching, saving that extra multiplication instruction can
+add up.
+
+The same optimization works even when equivalence classes are enabled, as
+described above. The only difference is that the premultiplication is by the
+total number of equivalence classes instead of 256.
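+
+Concretely, when both byte classes and premultiplication are enabled, the
+transition lookup sketched above becomes (with `byte_to_class` standing in for
+the 256-byte mapping table described earlier):
+
+    class = byte_to_class[current_byte]
+    next_state_id = dfa.transitions[current_state_id + class]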
+
+There isn't much downside to premultiplying state identifiers, other than the
+fact that you may need to choose a bigger integer representation than you would
+otherwise. For example, if you don't premultiply state identifiers, then an
+automaton that uses `u8` as a state identifier can hold up to 256 states.
+However, if they are premultiplied, then it can only hold up to
+`floor(256 / len(alphabet))` states. Thus premultiplication impacts how compact
+your DFA can be. In practice, it's pretty rare to use `u8` as a state
+identifier, so premultiplication is usually a good thing to do.
+
+Both equivalence classes and premultiplication are tuneable parameters via the
+`AhoCorasickBuilder` type, and both are enabled by default.
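+
+For example, disabling both looks like this (a minimal sketch; both options
+only have an effect when the DFA is enabled):
+
+    use aho_corasick::AhoCorasickBuilder;
+
+    let ac = AhoCorasickBuilder::new()
+        .dfa(true)
+        .byte_classes(false) // use the full 256-byte alphabet
+        .premultiply(false) // keep unmultiplied state identifiers
+        .build(&["foo", "bar"]);
+    assert!(ac.is_match("xxx bar xxx"));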
+
+
+# Match semantics
+
+One of the more interesting things about this implementation of Aho-Corasick
+that (as far as this author knows) separates it from other implementations is
+that it natively supports leftmost-first and leftmost-longest match semantics.
+Briefly, match semantics refer to the decision procedure by which searching
+will disambiguate matches when there are multiple to choose from:
+
+* **standard** match semantics emits matches as soon as they are detected by
+ the automaton. This is typically equivalent to the textbook non-overlapping
+ formulation of Aho-Corasick.
+* **leftmost-first** match semantics means that 1) the next match is the match
+ starting at the leftmost position and 2) among multiple matches starting at
+ the same leftmost position, the match corresponding to the pattern provided
+ first by the caller is reported.
+* **leftmost-longest** is like leftmost-first, except when there are multiple
+ matches starting at the same leftmost position, the pattern corresponding to
+ the longest match is returned.
+
+(The crate API documentation discusses these differences, with examples, in
+more depth on the `MatchKind` type.)
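+
+As a quick illustration of the difference (mirroring the examples in the
+README):
+
+    use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
+
+    let patterns = &["Samwise", "Sam"];
+    let haystack = "Samwise";
+
+    // Standard semantics report a match as soon as one is detected.
+    let standard = AhoCorasick::new(patterns);
+    let mat = standard.find(haystack).unwrap();
+    assert_eq!("Sam", &haystack[mat.start()..mat.end()]);
+
+    // Leftmost-first semantics prefer the pattern provided first.
+    let leftmost = AhoCorasickBuilder::new()
+        .match_kind(MatchKind::LeftmostFirst)
+        .build(patterns);
+    let mat = leftmost.find(haystack).unwrap();
+    assert_eq!("Samwise", &haystack[mat.start()..mat.end()]);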
+
+The reason why supporting these match semantics is important is because it
+gives the user more control over the match procedure. For example,
+leftmost-first permits users to implement match priority by simply putting the
+higher priority patterns first. Leftmost-longest, on the other hand, permits
+finding the longest possible match, which might be useful when trying to find
+words matching a dictionary. Additionally, regex engines often want to use
+Aho-Corasick as an optimization when searching for an alternation of literals.
+In order to preserve correct match semantics, regex engines typically can't use
+the standard textbook definition directly, since regex engines will implement
+either leftmost-first (Perl-like) or leftmost-longest (POSIX) match semantics.
+
+Supporting leftmost semantics requires a couple key changes:
+
+* Constructing the Aho-Corasick automaton changes a bit in both how the trie is
+ constructed and how failure transitions are found. Namely, only a subset of
+ the failure transitions are added. Specifically, only the failure transitions
+ that either do not occur after a match or do occur after a match but preserve
+ that match are kept. (More details on this can be found in `src/nfa.rs`.)
+* The search algorithm changes slightly. Since we are looking for the leftmost
+ match, we cannot quit as soon as a match is detected. Instead, after a match
+ is detected, we must keep searching until either the end of the input or
+ until a dead state is seen. (Dead states are not used for standard match
+ semantics. Dead states mean that searching should stop after a match has been
+ found.)
+
+Other implementations of Aho-Corasick do support leftmost match semantics, but
+they do it with more overhead at search time, or even worse, with a queue of
+matches and sophisticated hijinks to disambiguate the matches. While our
+construction algorithm becomes a bit more complicated, the correct match
+semantics fall out from the structure of the automaton itself.
+
+
+# Overlapping matches
+
+One of the nice properties of an Aho-Corasick automaton is that it can report
+all possible matches, even when they overlap with one another. In this mode,
+the match semantics don't matter, since all possible matches are reported.
+Overlapping searches work just like regular searches, except the state
+identifier at which the previous search left off is carried over to the next
+search, so that it can pick up where it left off. If there are additional
+matches at that state, then they are reported before resuming the search.
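+
+Through the public API, this looks like the following (taken from the crate
+documentation):
+
+    use aho_corasick::AhoCorasick;
+
+    let patterns = &["append", "appendage", "app"];
+    let haystack = "append the app to the appendage";
+
+    let ac = AhoCorasick::new(patterns);
+    let matches: Vec<usize> = ac
+        .find_overlapping_iter(haystack)
+        .map(|mat| mat.pattern())
+        .collect();
+    assert_eq!(vec![2, 0, 2, 2, 0, 1], matches);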
+
+Enabling leftmost-first or leftmost-longest match semantics causes the
+automaton to use a subset of all failure transitions, which means that
+overlapping searches cannot be used. Therefore, if leftmost match semantics are
+used, attempting to do an overlapping search will panic. Thus, to get
+overlapping searches, the caller must use the default standard match semantics.
+This behavior was chosen because there are only two alternatives, which were
+deemed worse:
+
+* Compile two automatons internally, one for standard semantics and one for
+ the semantics requested by the caller (if not standard).
+* Create a new type, distinct from the `AhoCorasick` type, which has different
+ capabilities based on the configuration options.
+
+The first is untenable because of the amount of memory used by the automaton.
+The second increases the complexity of the API too much by adding too many
+types that do similar things. It is conceptually much simpler to keep all
+searching isolated to a single type. Callers may query whether the automaton
+supports overlapping searches via the `AhoCorasick::supports_overlapping`
+method.
+
+
+# Stream searching
+
+Since Aho-Corasick is an automaton, it is possible to run a partial search on
+one piece of the haystack and then resume that search on subsequent pieces.
+This is useful when the haystack you're trying to search is not stored
+contiguously in memory, or if one does not want to read the entire haystack
+into memory at once.
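+
+The public entry points for this are the `stream_*` methods. For example,
+search and replace over any `io::Read`/`io::Write` pair (mirroring the README
+example, with a byte slice and a `Vec` standing in for files):
+
+    use aho_corasick::AhoCorasick;
+
+    let patterns = &["fox", "brown", "quick"];
+    let replace_with = &["sloth", "grey", "slow"];
+
+    let rdr = "The quick brown fox.";
+    let mut wtr = vec![];
+
+    let ac = AhoCorasick::new(patterns);
+    ac.stream_replace_all(rdr.as_bytes(), &mut wtr, replace_with).unwrap();
+    assert_eq!(b"The slow grey sloth.".to_vec(), wtr);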
+
+Currently, only standard semantics are supported for stream searching. This is
+some of the more complicated code in this crate, and is something I would very
+much like to improve. In particular, it currently has the restriction that it
+must buffer at least enough of the haystack in memory in order to fit the
+longest possible match. The difficulty in getting stream searching right is
+that the implementation choices (such as the buffer size) often impact what the
+API looks like and what it's allowed to do.
+
+
+# Prefilters
+
+In some cases, Aho-Corasick is not the fastest way to find occurrences of
+multiple patterns. Sometimes, the search can be accelerated using highly
+optimized SIMD routines. For example, consider searching the following
+patterns:
+
+ Sherlock
+ Moriarty
+ Watson
+
+It is plausible that it would be much faster to quickly look for occurrences of
+the leading bytes, `S`, `M` or `W`, before trying to start searching via the
+automaton. Indeed, this is exactly what this crate will do.
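+
+A minimal sketch of the idea (not the crate's actual prefilter code) using
+`memchr` to skip ahead to candidate starting positions:
+
+    use memchr::memchr3;
+
+    // Hypothetical helper: every pattern starts with 'S', 'M' or 'W', so any
+    // match must begin at one of these positions.
+    fn next_candidate(haystack: &[u8], at: usize) -> Option<usize> {
+        memchr3(b'S', b'M', b'W', &haystack[at..]).map(|i| at + i)
+    }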
+
+When there are more than three distinct starting bytes, this crate will
+look for three distinct bytes occurring at any position in the patterns, while
+preferring bytes that are heuristically determined to be rare over others. For
+example:
+
+ Abuzz
+ Sanchez
+ Vasquez
+ Topaz
+ Waltz
+
+Here, we have more than 3 distinct starting bytes, but all of the patterns
+contain `z`, which is typically a rare byte. In this case, the prefilter will
+scan for `z`, back up a bit, and then execute the Aho-Corasick automaton.
+
+If all of that fails, then a packed multiple substring algorithm will be
+attempted. Currently, the only algorithm available for this is Teddy, but more
+may be added in the future. Teddy is unlike the above prefilters in that it
+confirms its own matches, so when Teddy is active, it might not be necessary
+for Aho-Corasick to run at all. (See `Automaton::leftmost_find_at_no_state_imp`
+in `src/automaton.rs`.) However, the current Teddy implementation only works
+on `x86_64` and when SSSE3 or AVX2 are available, and moreover, only works
+_well_ when there are a small number of patterns (say, less than 100). Teddy
+also requires the haystack to be of a certain length (more than 16-34 bytes).
+When the haystack is shorter than that, Rabin-Karp is used instead. (See
+`src/packed/rabinkarp.rs`.)
+
+There is a more thorough description of Teddy at
+[`src/packed/teddy/README.md`](src/packed/teddy/README.md).
diff --git a/LICENSE b/LICENSE
new file mode 120000
index 0000000..7f9a88e
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1 @@
+LICENSE-MIT
\ No newline at end of file
diff --git a/LICENSE-MIT b/LICENSE-MIT
new file mode 100644
index 0000000..3b0a5dc
--- /dev/null
+++ b/LICENSE-MIT
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2015 Andrew Gallant
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/METADATA b/METADATA
new file mode 100644
index 0000000..a6b685d
--- /dev/null
+++ b/METADATA
@@ -0,0 +1,19 @@
+name: "aho-corasick"
+description: "A library for finding occurrences of many patterns at once with SIMD acceleration in some cases. This library provides multiple pattern search principally through an implementation of the Aho-Corasick algorithm, which builds a finite state machine for executing searches in linear time. Features include case insensitive matching, overlapping matches and search & replace in streams."
+third_party {
+ url {
+ type: HOMEPAGE
+ value: "https://crates.io/crates/aho-corasick"
+ }
+ url {
+ type: GIT
+ value: "https://github.com/BurntSushi/aho-corasick"
+ }
+ version: "0.7.10"
+ license_type: NOTICE
+ last_upgrade_date {
+ year: 2020
+ month: 3
+ day: 31
+ }
+}
diff --git a/MODULE_LICENSE_MIT b/MODULE_LICENSE_MIT
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/MODULE_LICENSE_MIT
diff --git a/OWNERS b/OWNERS
new file mode 100644
index 0000000..46fc303
--- /dev/null
+++ b/OWNERS
@@ -0,0 +1 @@
+include platform/prebuilts/rust:/OWNERS
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..9ae3427
--- /dev/null
+++ b/README.md
@@ -0,0 +1,186 @@
+aho-corasick
+============
+A library for finding occurrences of many patterns at once with SIMD
+acceleration in some cases. This library provides multiple pattern
+search principally through an implementation of the
+[Aho-Corasick algorithm](https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm),
+which builds a finite state machine for executing searches in linear time.
+Features include case insensitive matching, overlapping matches and search &
+replace in streams.
+
+[![Build status](https://github.com/BurntSushi/aho-corasick/workflows/ci/badge.svg)](https://github.com/BurntSushi/aho-corasick/actions)
+[![](http://meritbadge.herokuapp.com/aho-corasick)](https://crates.io/crates/aho-corasick)
+
+Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org).
+
+
+### Documentation
+
+https://docs.rs/aho-corasick
+
+
+### Usage
+
+Add this to your `Cargo.toml`:
+
+```toml
+[dependencies]
+aho-corasick = "0.7"
+```
+
+and this to your crate root (if you're using Rust 2015):
+
+```rust
+extern crate aho_corasick;
+```
+
+
+### Example: basic searching
+
+This example shows how to search for occurrences of multiple patterns
+simultaneously. Each match includes the pattern that matched along with the
+byte offsets of the match.
+
+```rust
+use aho_corasick::AhoCorasick;
+
+let patterns = &["apple", "maple", "Snapple"];
+let haystack = "Nobody likes maple in their apple flavored Snapple.";
+
+let ac = AhoCorasick::new(patterns);
+let mut matches = vec![];
+for mat in ac.find_iter(haystack) {
+ matches.push((mat.pattern(), mat.start(), mat.end()));
+}
+assert_eq!(matches, vec![
+ (1, 13, 18),
+ (0, 28, 33),
+ (2, 43, 50),
+]);
+```
+
+
+### Example: case insensitivity
+
+This is like the previous example, but matches `Snapple` case insensitively
+using `AhoCorasickBuilder`:
+
+```rust
+use aho_corasick::AhoCorasickBuilder;
+
+let patterns = &["apple", "maple", "snapple"];
+let haystack = "Nobody likes maple in their apple flavored Snapple.";
+
+let ac = AhoCorasickBuilder::new()
+ .ascii_case_insensitive(true)
+ .build(patterns);
+let mut matches = vec![];
+for mat in ac.find_iter(haystack) {
+ matches.push((mat.pattern(), mat.start(), mat.end()));
+}
+assert_eq!(matches, vec![
+ (1, 13, 18),
+ (0, 28, 33),
+ (2, 43, 50),
+]);
+```
+
+
+### Example: replacing matches in a stream
+
+This example shows how to execute a search and replace on a stream without
+loading the entire stream into memory first.
+
+```rust
+use aho_corasick::AhoCorasick;
+
+let patterns = &["fox", "brown", "quick"];
+let replace_with = &["sloth", "grey", "slow"];
+
+// In a real example, these might be `std::fs::File`s instead. All you need to
+// do is supply a pair of `std::io::Read` and `std::io::Write` implementations.
+let rdr = "The quick brown fox.";
+let mut wtr = vec![];
+
+let ac = AhoCorasick::new(patterns);
+ac.stream_replace_all(rdr.as_bytes(), &mut wtr, replace_with)?;
+assert_eq!(b"The slow grey sloth.".to_vec(), wtr);
+```
+
+
+### Example: finding the leftmost first match
+
+In the textbook description of Aho-Corasick, its formulation is typically
+structured such that it reports all possible matches, even when they overlap
+with one another. In many cases, overlapping matches may not be desired; for
+example, you may instead want all successive non-overlapping matches, as you
+would with a standard regular expression.
+
+Unfortunately the "obvious" way to modify the Aho-Corasick algorithm to do
+this doesn't always work in the expected way, since it will report matches as
+soon as they are seen. For example, consider matching the regex `Samwise|Sam`
+against the text `Samwise`. Most regex engines (that are Perl-like, or
+non-POSIX) will report `Samwise` as a match, but the standard Aho-Corasick
+algorithm modified for reporting non-overlapping matches will report `Sam`.
+
+A novel contribution of this library is the ability to change the match
+semantics of Aho-Corasick (without additional search time overhead) such that
+`Samwise` is reported instead. For example, here's the standard approach:
+
+```rust
+use aho_corasick::AhoCorasick;
+
+let patterns = &["Samwise", "Sam"];
+let haystack = "Samwise";
+
+let ac = AhoCorasick::new(patterns);
+let mat = ac.find(haystack).expect("should have a match");
+assert_eq!("Sam", &haystack[mat.start()..mat.end()]);
+```
+
+And now here's the leftmost-first version, which matches how a Perl-like
+regex will work:
+
+```rust
+use aho_corasick::{AhoCorasickBuilder, MatchKind};
+
+let patterns = &["Samwise", "Sam"];
+let haystack = "Samwise";
+
+let ac = AhoCorasickBuilder::new()
+ .match_kind(MatchKind::LeftmostFirst)
+ .build(patterns);
+let mat = ac.find(haystack).expect("should have a match");
+assert_eq!("Samwise", &haystack[mat.start()..mat.end()]);
+```
+
+In addition to leftmost-first semantics, this library also supports
+leftmost-longest semantics, which match the POSIX behavior of a regular
+expression alternation. See `MatchKind` in the docs for more details.
+
+
+### Minimum Rust version policy
+
+This crate's minimum supported `rustc` version is `1.28.0`.
+
+The current policy is that the minimum Rust version required to use this crate
+can be increased in minor version updates. For example, if `crate 1.0` requires
+Rust 1.20.0, then `crate 1.0.z` for all values of `z` will also require Rust
+1.20.0 or newer. However, `crate 1.y` for `y > 0` may require a newer minimum
+version of Rust.
+
+In general, this crate will be conservative with respect to the minimum
+supported version of Rust.
+
+
+### Future work
+
+Here are some plans for the future:
+
+* Assuming the current API is sufficient, I'd like to commit to it and release
+ a `1.0` version of this crate some time in the next 6-12 months.
+* Support stream searching with leftmost match semantics. Currently, only
+ standard match semantics are supported. Getting this right seems possible,
+ but is tricky since the match state needs to be propagated through multiple
+ searches. (With standard semantics, as soon as a match is seen the search
+ ends.)
diff --git a/UNLICENSE b/UNLICENSE
new file mode 100644
index 0000000..68a49da
--- /dev/null
+++ b/UNLICENSE
@@ -0,0 +1,24 @@
+This is free and unencumbered software released into the public domain.
+
+Anyone is free to copy, modify, publish, use, compile, sell, or
+distribute this software, either in source code form or as a compiled
+binary, for any purpose, commercial or non-commercial, and by any
+means.
+
+In jurisdictions that recognize copyright laws, the author or authors
+of this software dedicate any and all copyright interest in the
+software to the public domain. We make this dedication for the benefit
+of the public at large and to the detriment of our heirs and
+successors. We intend this dedication to be an overt act of
+relinquishment in perpetuity of all present and future rights to this
+software under copyright law.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
+For more information, please refer to <http://unlicense.org/>
diff --git a/rustfmt.toml b/rustfmt.toml
new file mode 100644
index 0000000..aa37a21
--- /dev/null
+++ b/rustfmt.toml
@@ -0,0 +1,2 @@
+max_width = 79
+use_small_heuristics = "max"
diff --git a/src/ahocorasick.rs b/src/ahocorasick.rs
new file mode 100644
index 0000000..9b7d9e7
--- /dev/null
+++ b/src/ahocorasick.rs
@@ -0,0 +1,2087 @@
+use std::io;
+
+use automaton::Automaton;
+use buffer::Buffer;
+use dfa::{self, DFA};
+use error::Result;
+use nfa::{self, NFA};
+use packed;
+use prefilter::PrefilterState;
+use state_id::StateID;
+use Match;
+
+/// An automaton for searching multiple strings in linear time.
+///
+/// The `AhoCorasick` type supports a few basic ways of constructing an
+/// automaton, including
+/// [`AhoCorasick::new`](struct.AhoCorasick.html#method.new)
+/// and
+/// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured).
+/// However, there are a fair number of configurable options that can be set
+/// by using
+/// [`AhoCorasickBuilder`](struct.AhoCorasickBuilder.html)
+/// instead. Such options include, but are not limited to, how matches are
+/// determined, simple case insensitivity, whether to use a DFA or not, and
+/// various knobs for controlling the space-vs-time trade-offs taken when
+/// building the automaton.
+///
+/// If you aren't sure where to start, try beginning with
+/// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured).
+///
+/// # Resource usage
+///
+/// Aho-Corasick automatons are always constructed in `O(p)` time, where `p`
+/// is the combined length of all patterns being searched. With that said,
+/// building an automaton can be fairly costly because of high constant
+/// factors, particularly when enabling the
+/// [DFA](struct.AhoCorasickBuilder.html#method.dfa)
+/// option (which is disabled by default). For this reason, it's generally a
+/// good idea to build an automaton once and reuse it as much as possible.
+///
+/// Aho-Corasick automatons can also use a fair bit of memory. To get a
+/// concrete idea of how much memory is being used, try using the
+/// [`AhoCorasick::heap_bytes`](struct.AhoCorasick.html#method.heap_bytes)
+/// method.
+///
+/// # Examples
+///
+/// This example shows how to search for occurrences of multiple patterns
+/// simultaneously in a case insensitive fashion. Each match includes the
+/// pattern that matched along with the byte offsets of the match.
+///
+/// ```
+/// use aho_corasick::AhoCorasickBuilder;
+///
+/// let patterns = &["apple", "maple", "snapple"];
+/// let haystack = "Nobody likes maple in their apple flavored Snapple.";
+///
+/// let ac = AhoCorasickBuilder::new()
+/// .ascii_case_insensitive(true)
+/// .build(patterns);
+/// let mut matches = vec![];
+/// for mat in ac.find_iter(haystack) {
+/// matches.push((mat.pattern(), mat.start(), mat.end()));
+/// }
+/// assert_eq!(matches, vec![
+/// (1, 13, 18),
+/// (0, 28, 33),
+/// (2, 43, 50),
+/// ]);
+/// ```
+///
+/// This example shows how to replace matches with some other string:
+///
+/// ```
+/// use aho_corasick::AhoCorasick;
+///
+/// let patterns = &["fox", "brown", "quick"];
+/// let haystack = "The quick brown fox.";
+/// let replace_with = &["sloth", "grey", "slow"];
+///
+/// let ac = AhoCorasick::new(patterns);
+/// let result = ac.replace_all(haystack, replace_with);
+/// assert_eq!(result, "The slow grey sloth.");
+/// ```
+#[derive(Clone, Debug)]
+pub struct AhoCorasick<S: StateID = usize> {
+ imp: Imp<S>,
+ match_kind: MatchKind,
+}
+
+impl AhoCorasick {
+ /// Create a new Aho-Corasick automaton using the default configuration.
+ ///
+ /// The default configuration optimizes for less space usage, but at the
+ /// expense of longer search times. To change the configuration, use
+ /// [`AhoCorasickBuilder`](struct.AhoCorasickBuilder.html)
+ /// for fine-grained control, or
+ /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured)
+ /// for automatic configuration if you aren't sure which settings to pick.
+ ///
+ /// This uses the default
+ /// [`MatchKind::Standard`](enum.MatchKind.html#variant.Standard)
+ /// match semantics, which reports a match as soon as it is found. This
+ /// corresponds to the standard match semantics supported by textbook
+ /// descriptions of the Aho-Corasick algorithm.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasick;
+ ///
+ /// let ac = AhoCorasick::new(&[
+ /// "foo", "bar", "baz",
+ /// ]);
+ /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
+ /// ```
+ pub fn new<I, P>(patterns: I) -> AhoCorasick
+ where
+ I: IntoIterator<Item = P>,
+ P: AsRef<[u8]>,
+ {
+ AhoCorasickBuilder::new().build(patterns)
+ }
+
+ /// Build an Aho-Corasick automaton with an automatically determined
+ /// configuration.
+ ///
+ /// Specifically, this requires a slice of patterns instead of an iterator
+ /// since the configuration is determined by looking at the patterns before
+ /// constructing the automaton. The idea here is to balance space and time
+ /// automatically. That is, when searching a small number of patterns, this
+ /// will attempt to use the fastest possible configuration since the total
+ /// space required will be small anyway. As the number of patterns grows,
+ /// this will fall back to slower configurations that use less space.
+ ///
+ /// If you want auto configuration but with match semantics different from
+ /// the default `MatchKind::Standard`, then use
+ /// [`AhoCorasickBuilder::auto_configure`](struct.AhoCorasickBuilder.html#method.auto_configure).
+ ///
+ /// # Examples
+ ///
+ /// Basic usage is just like `new`, except you must provide a slice:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasick;
+ ///
+ /// let ac = AhoCorasick::new_auto_configured(&[
+ /// "foo", "bar", "baz",
+ /// ]);
+ /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
+ /// ```
+ pub fn new_auto_configured<B>(patterns: &[B]) -> AhoCorasick
+ where
+ B: AsRef<[u8]>,
+ {
+ AhoCorasickBuilder::new().auto_configure(patterns).build(patterns)
+ }
+}
+
+impl<S: StateID> AhoCorasick<S> {
+ /// Returns true if and only if this automaton matches the haystack at any
+ /// position.
+ ///
+ /// `haystack` may be any type that is cheaply convertible to a `&[u8]`.
+ /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and
+ /// `&[u8]` itself.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasick;
+ ///
+ /// let ac = AhoCorasick::new(&[
+ /// "foo", "bar", "quux", "baz",
+ /// ]);
+ /// assert!(ac.is_match("xxx bar xxx"));
+ /// assert!(!ac.is_match("xxx qux xxx"));
+ /// ```
+ pub fn is_match<B: AsRef<[u8]>>(&self, haystack: B) -> bool {
+ self.earliest_find(haystack).is_some()
+ }
+
+ /// Returns the location of the first detected match in `haystack`.
+ ///
+ /// This method has the same behavior regardless of the
+ /// [`MatchKind`](enum.MatchKind.html)
+ /// of this automaton.
+ ///
+ /// `haystack` may be any type that is cheaply convertible to a `&[u8]`.
+ /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and
+ /// `&[u8]` itself.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasick;
+ ///
+ /// let ac = AhoCorasick::new(&[
+ /// "abc", "b",
+ /// ]);
+ /// let mat = ac.earliest_find("abcd").expect("should have match");
+ /// assert_eq!(1, mat.pattern());
+ /// assert_eq!((1, 2), (mat.start(), mat.end()));
+ /// ```
+ pub fn earliest_find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match> {
+ let mut prestate = PrefilterState::new(self.max_pattern_len());
+ let mut start = self.imp.start_state();
+ self.imp.earliest_find_at(
+ &mut prestate,
+ haystack.as_ref(),
+ 0,
+ &mut start,
+ )
+ }
+
+ /// Returns the location of the first match according to the match
+ /// semantics that this automaton was constructed with.
+ ///
+ /// When using `MatchKind::Standard`, this corresponds precisely to the
+ /// same behavior as
+ /// [`earliest_find`](struct.AhoCorasick.html#method.earliest_find).
+ /// Otherwise, match semantics correspond to either
+ /// [leftmost-first](enum.MatchKind.html#variant.LeftmostFirst)
+ /// or
+ /// [leftmost-longest](enum.MatchKind.html#variant.LeftmostLongest).
+ ///
+ /// `haystack` may be any type that is cheaply convertible to a `&[u8]`.
+ /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and
+ /// `&[u8]` itself.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage, with standard semantics:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
+ ///
+ /// let patterns = &["b", "abc", "abcd"];
+ /// let haystack = "abcd";
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .match_kind(MatchKind::Standard) // default, not necessary
+ /// .build(patterns);
+ /// let mat = ac.find(haystack).expect("should have a match");
+ /// assert_eq!("b", &haystack[mat.start()..mat.end()]);
+ /// ```
+ ///
+ /// Now with leftmost-first semantics:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
+ ///
+ /// let patterns = &["b", "abc", "abcd"];
+ /// let haystack = "abcd";
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(patterns);
+ /// let mat = ac.find(haystack).expect("should have a match");
+ /// assert_eq!("abc", &haystack[mat.start()..mat.end()]);
+ /// ```
+ ///
+ /// And finally, leftmost-longest semantics:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
+ ///
+ /// let patterns = &["b", "abc", "abcd"];
+ /// let haystack = "abcd";
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .match_kind(MatchKind::LeftmostLongest)
+ /// .build(patterns);
+ /// let mat = ac.find(haystack).expect("should have a match");
+ /// assert_eq!("abcd", &haystack[mat.start()..mat.end()]);
+ /// ```
+ pub fn find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match> {
+ let mut prestate = PrefilterState::new(self.max_pattern_len());
+ self.imp.find_at_no_state(&mut prestate, haystack.as_ref(), 0)
+ }
+
+ /// Returns an iterator of non-overlapping matches, using the match
+ /// semantics that this automaton was constructed with.
+ ///
+ /// `haystack` may be any type that is cheaply convertible to a `&[u8]`.
+ /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and
+ /// `&[u8]` itself.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage, with standard semantics:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
+ ///
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = "append the app to the appendage";
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .match_kind(MatchKind::Standard) // default, not necessary
+ /// .build(patterns);
+ /// let matches: Vec<usize> = ac
+ /// .find_iter(haystack)
+ /// .map(|mat| mat.pattern())
+ /// .collect();
+ /// assert_eq!(vec![2, 2, 2], matches);
+ /// ```
+ ///
+ /// Now with leftmost-first semantics:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
+ ///
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = "append the app to the appendage";
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(patterns);
+ /// let matches: Vec<usize> = ac
+ /// .find_iter(haystack)
+ /// .map(|mat| mat.pattern())
+ /// .collect();
+ /// assert_eq!(vec![0, 2, 0], matches);
+ /// ```
+ ///
+ /// And finally, leftmost-longest semantics:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
+ ///
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = "append the app to the appendage";
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .match_kind(MatchKind::LeftmostLongest)
+ /// .build(patterns);
+ /// let matches: Vec<usize> = ac
+ /// .find_iter(haystack)
+ /// .map(|mat| mat.pattern())
+ /// .collect();
+ /// assert_eq!(vec![0, 2, 1], matches);
+ /// ```
+ pub fn find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>(
+ &'a self,
+ haystack: &'b B,
+ ) -> FindIter<'a, 'b, S> {
+ FindIter::new(self, haystack.as_ref())
+ }
+
+ /// Returns an iterator of overlapping matches in the given `haystack`.
+ ///
+ /// Overlapping matches can _only_ be detected using
+ /// `MatchKind::Standard` semantics. If this automaton was constructed with
+ /// leftmost semantics, then this method will panic. To determine whether
+ /// this will panic at runtime, use the
+ /// [`AhoCorasick::supports_overlapping`](struct.AhoCorasick.html#method.supports_overlapping)
+ /// method.
+ ///
+ /// `haystack` may be any type that is cheaply convertible to a `&[u8]`.
+ /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and
+ /// `&[u8]` itself.
+ ///
+ /// # Panics
+ ///
+ /// This panics when `AhoCorasick::supports_overlapping` returns `false`.
+ /// That is, this panics when this automaton's match semantics are not
+ /// `MatchKind::Standard`.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage, with standard semantics:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasick;
+ ///
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = "append the app to the appendage";
+ ///
+ /// let ac = AhoCorasick::new(patterns);
+ /// let matches: Vec<usize> = ac
+ /// .find_overlapping_iter(haystack)
+ /// .map(|mat| mat.pattern())
+ /// .collect();
+ /// assert_eq!(vec![2, 0, 2, 2, 0, 1], matches);
+ /// ```
+ pub fn find_overlapping_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>(
+ &'a self,
+ haystack: &'b B,
+ ) -> FindOverlappingIter<'a, 'b, S> {
+ FindOverlappingIter::new(self, haystack.as_ref())
+ }
+
+ /// Replace all matches with a corresponding value in the `replace_with`
+ /// slice given. Matches correspond to the same matches as reported by
+ /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
+ ///
+ /// Replacements are determined by the index of the matching pattern.
+ /// For example, if the pattern with index `2` is found, then it is
+ /// replaced by `replace_with[2]`.
+ ///
+ /// # Panics
+ ///
+ /// This panics when `replace_with.len()` does not equal the total number
+ /// of patterns that are matched by this automaton.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
+ ///
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = "append the app to the appendage";
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(patterns);
+ /// let result = ac.replace_all(haystack, &["x", "y", "z"]);
+ /// assert_eq!("x the z to the xage", result);
+ /// ```
+ pub fn replace_all<B>(&self, haystack: &str, replace_with: &[B]) -> String
+ where
+ B: AsRef<str>,
+ {
+ assert_eq!(
+ replace_with.len(),
+ self.pattern_count(),
+ "replace_all requires a replacement for every pattern \
+ in the automaton"
+ );
+ let mut dst = String::with_capacity(haystack.len());
+ self.replace_all_with(haystack, &mut dst, |mat, _, dst| {
+ dst.push_str(replace_with[mat.pattern()].as_ref());
+ true
+ });
+ dst
+ }
+
+ /// Replace all matches using raw bytes with a corresponding value in the
+ /// `replace_with` slice given. Matches correspond to the same matches as
+ /// reported by [`find_iter`](struct.AhoCorasick.html#method.find_iter).
+ ///
+ /// Replacements are determined by the index of the matching pattern.
+ /// For example, if the pattern with index `2` is found, then it is
+ /// replaced by `replace_with[2]`.
+ ///
+ /// # Panics
+ ///
+ /// This panics when `replace_with.len()` does not equal the total number
+ /// of patterns that are matched by this automaton.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
+ ///
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = b"append the app to the appendage";
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(patterns);
+ /// let result = ac.replace_all_bytes(haystack, &["x", "y", "z"]);
+ /// assert_eq!(b"x the z to the xage".to_vec(), result);
+ /// ```
+ pub fn replace_all_bytes<B>(
+ &self,
+ haystack: &[u8],
+ replace_with: &[B],
+ ) -> Vec<u8>
+ where
+ B: AsRef<[u8]>,
+ {
+ assert_eq!(
+ replace_with.len(),
+ self.pattern_count(),
+ "replace_all_bytes requires a replacement for every pattern \
+ in the automaton"
+ );
+ let mut dst = Vec::with_capacity(haystack.len());
+ self.replace_all_with_bytes(haystack, &mut dst, |mat, _, dst| {
+ dst.extend(replace_with[mat.pattern()].as_ref());
+ true
+ });
+ dst
+ }
+
+ /// Replace all matches using a closure called on each match.
+ /// Matches correspond to the same matches as reported by
+ /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
+ ///
+ /// The closure accepts three parameters: the match found, the text of
+ /// the match and a string buffer with which to write the replaced text
+ /// (if any). If the closure returns `true`, then it continues to the next
+ /// match. If the closure returns `false`, then searching is stopped.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
+ ///
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = "append the app to the appendage";
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(patterns);
+ /// let mut result = String::new();
+ /// ac.replace_all_with(haystack, &mut result, |mat, _, dst| {
+ /// dst.push_str(&mat.pattern().to_string());
+ /// true
+ /// });
+ /// assert_eq!("0 the 2 to the 0age", result);
+ /// ```
+ pub fn replace_all_with<F>(
+ &self,
+ haystack: &str,
+ dst: &mut String,
+ mut replace_with: F,
+ ) where
+ F: FnMut(&Match, &str, &mut String) -> bool,
+ {
+ let mut last_match = 0;
+ for mat in self.find_iter(haystack) {
+ dst.push_str(&haystack[last_match..mat.start()]);
+ last_match = mat.end();
+ // Honor the documented contract: if the closure returns `false`,
+ // stop searching and copy the rest of the haystack unchanged.
+ if !replace_with(&mat, &haystack[mat.start()..mat.end()], dst) {
+ break;
+ }
+ }
+ dst.push_str(&haystack[last_match..]);
+ }
+
+ /// Replace all matches using raw bytes with a closure called on each
+ /// match. Matches correspond to the same matches as reported by
+ /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
+ ///
+ /// The closure accepts three parameters: the match found, the text of
+ /// the match and a byte buffer with which to write the replaced text
+ /// (if any). If the closure returns `true`, then it continues to the next
+ /// match. If the closure returns `false`, then searching is stopped.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
+ ///
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = b"append the app to the appendage";
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(patterns);
+ /// let mut result = vec![];
+ /// ac.replace_all_with_bytes(haystack, &mut result, |mat, _, dst| {
+ /// dst.extend(mat.pattern().to_string().bytes());
+ /// true
+ /// });
+ /// assert_eq!(b"0 the 2 to the 0age".to_vec(), result);
+ /// ```
+ pub fn replace_all_with_bytes<F>(
+ &self,
+ haystack: &[u8],
+ dst: &mut Vec<u8>,
+ mut replace_with: F,
+ ) where
+ F: FnMut(&Match, &[u8], &mut Vec<u8>) -> bool,
+ {
+ let mut last_match = 0;
+ for mat in self.find_iter(haystack) {
+ dst.extend(&haystack[last_match..mat.start()]);
+ last_match = mat.end();
+ // Honor the documented contract: if the closure returns `false`,
+ // stop searching and copy the rest of the haystack unchanged.
+ if !replace_with(&mat, &haystack[mat.start()..mat.end()], dst) {
+ break;
+ }
+ }
+ dst.extend(&haystack[last_match..]);
+ }
+
+ /// Returns an iterator of non-overlapping matches in the given
+ /// stream. Matches correspond to the same matches as reported by
+ /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
+ ///
+ /// The matches yielded by this iterator use absolute position offsets in
+ /// the stream given, where the first byte has index `0`. Matches are
+ /// yielded until the stream is exhausted.
+ ///
+ /// Each item yielded by the iterator is an `io::Result<Match>`, where an
+ /// error is yielded if there was a problem reading from the reader given.
+ ///
+ /// When searching a stream, an internal buffer is used. Therefore, callers
+ /// should avoid providing a buffered reader, if possible.
+ ///
+ /// Searching a stream requires that the automaton was built with
+ /// `MatchKind::Standard` semantics. If this automaton was constructed
+ /// with leftmost semantics, then this method will panic. To determine
+ /// whether this will panic at runtime, use the
+ /// [`AhoCorasick::supports_stream`](struct.AhoCorasick.html#method.supports_stream)
+ /// method.
+ ///
+ /// # Memory usage
+ ///
+ /// In general, searching streams will use a constant amount of memory for
+ /// its internal buffer. The one requirement is that the internal buffer
+ /// must be at least the size of the longest possible match. In most use
+ /// cases, the default buffer size will be much larger than any individual
+ /// match.
+ ///
+ /// # Panics
+ ///
+ /// This panics when `AhoCorasick::supports_stream` returns `false`.
+ /// That is, this panics when this automaton's match semantics are not
+ /// `MatchKind::Standard`. This restriction may be lifted in the future.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasick;
+ ///
+ /// # fn example() -> Result<(), ::std::io::Error> {
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = "append the app to the appendage";
+ ///
+ /// let ac = AhoCorasick::new(patterns);
+ /// let mut matches = vec![];
+ /// for result in ac.stream_find_iter(haystack.as_bytes()) {
+ /// let mat = result?;
+ /// matches.push(mat.pattern());
+ /// }
+ /// assert_eq!(vec![2, 2, 2], matches);
+ /// # Ok(()) }; example().unwrap()
+ /// ```
+ pub fn stream_find_iter<'a, R: io::Read>(
+ &'a self,
+ rdr: R,
+ ) -> StreamFindIter<'a, R, S> {
+ StreamFindIter::new(self, rdr)
+ }
+
+ /// Search for and replace all matches of this automaton in
+ /// the given reader, and write the replacements to the given
+ /// writer. Matches correspond to the same matches as reported by
+ /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
+ ///
+ /// Replacements are determined by the index of the matching pattern.
+ /// For example, if the pattern with index `2` is found, then it is
+ /// replaced by `replace_with[2]`.
+ ///
+ /// After all matches are replaced, the writer is _not_ flushed.
+ ///
+ /// If there was a problem reading from the given reader or writing to the
+ /// given writer, then the corresponding `io::Error` is returned and all
+ /// replacement is stopped.
+ ///
+ /// When searching a stream, an internal buffer is used. Therefore, callers
+ /// should avoid providing a buffered reader, if possible. However,
+ /// callers may want to provide a buffered writer.
+ ///
+ /// Searching a stream requires that the automaton was built with
+ /// `MatchKind::Standard` semantics. If this automaton was constructed
+ /// with leftmost semantics, then this method will panic. To determine
+ /// whether this will panic at runtime, use the
+ /// [`AhoCorasick::supports_stream`](struct.AhoCorasick.html#method.supports_stream)
+ /// method.
+ ///
+ /// # Memory usage
+ ///
+ /// In general, searching streams will use a constant amount of memory for
+ /// its internal buffer. The one requirement is that the internal buffer
+ /// must be at least the size of the longest possible match. In most use
+ /// cases, the default buffer size will be much larger than any individual
+ /// match.
+ ///
+ /// # Panics
+ ///
+ /// This panics when `AhoCorasick::supports_stream` returns `false`.
+ /// That is, this panics when this automaton's match semantics are not
+ /// `MatchKind::Standard`. This restriction may be lifted in the future.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasick;
+ ///
+ /// # fn example() -> Result<(), ::std::io::Error> {
+ /// let patterns = &["fox", "brown", "quick"];
+ /// let haystack = "The quick brown fox.";
+ /// let replace_with = &["sloth", "grey", "slow"];
+ ///
+ /// let ac = AhoCorasick::new(patterns);
+ /// let mut result = vec![];
+ /// ac.stream_replace_all(haystack.as_bytes(), &mut result, replace_with)?;
+ /// assert_eq!(b"The slow grey sloth.".to_vec(), result);
+ /// # Ok(()) }; example().unwrap()
+ /// ```
+ pub fn stream_replace_all<R, W, B>(
+ &self,
+ rdr: R,
+ wtr: W,
+ replace_with: &[B],
+ ) -> io::Result<()>
+ where
+ R: io::Read,
+ W: io::Write,
+ B: AsRef<[u8]>,
+ {
+ assert_eq!(
+ replace_with.len(),
+ self.pattern_count(),
+ "stream_replace_all requires a replacement for every pattern \
+ in the automaton"
+ );
+ self.stream_replace_all_with(rdr, wtr, |mat, _, wtr| {
+ wtr.write_all(replace_with[mat.pattern()].as_ref())
+ })
+ }
+
+ /// Search the given reader and replace all matches of this automaton
+ /// using the given closure. The result is written to the given
+ /// writer. Matches correspond to the same matches as reported by
+ /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
+ ///
+ /// The closure accepts three parameters: the match found, the text of
+ /// the match and the writer with which to write the replaced text
+ /// (if any). If the closure returns `true`, then it continues to the next
+ /// match. If the closure returns `false`, then searching is stopped.
+ ///
+ /// After all matches are replaced, the writer is _not_ flushed.
+ ///
+ /// If there was a problem reading from the given reader or writing to the
+ /// given writer, then the corresponding `io::Error` is returned and all
+ /// replacement is stopped.
+ ///
+ /// When searching a stream, an internal buffer is used. Therefore, callers
+ /// should avoid providing a buffered reader, if possible. However,
+ /// callers may want to provide a buffered writer.
+ ///
+ /// Searching a stream requires that the automaton was built with
+ /// `MatchKind::Standard` semantics. If this automaton was constructed
+ /// with leftmost semantics, then this method will panic. To determine
+ /// whether this will panic at runtime, use the
+ /// [`AhoCorasick::supports_stream`](struct.AhoCorasick.html#method.supports_stream)
+ /// method.
+ ///
+ /// # Memory usage
+ ///
+ /// In general, searching streams will use a constant amount of memory for
+ /// its internal buffer. The one requirement is that the internal buffer
+ /// must be at least the size of the longest possible match. In most use
+ /// cases, the default buffer size will be much larger than any individual
+ /// match.
+ ///
+ /// # Panics
+ ///
+ /// This panics when `AhoCorasick::supports_stream` returns `false`.
+ /// That is, this panics when this automaton's match semantics are not
+ /// `MatchKind::Standard`. This restriction may be lifted in the future.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use std::io::Write;
+ /// use aho_corasick::AhoCorasick;
+ ///
+ /// # fn example() -> Result<(), ::std::io::Error> {
+ /// let patterns = &["fox", "brown", "quick"];
+ /// let haystack = "The quick brown fox.";
+ ///
+ /// let ac = AhoCorasick::new(patterns);
+ /// let mut result = vec![];
+ /// ac.stream_replace_all_with(
+ /// haystack.as_bytes(),
+ /// &mut result,
+ /// |mat, _, wtr| {
+ /// wtr.write_all(mat.pattern().to_string().as_bytes())
+ /// },
+ /// )?;
+ /// assert_eq!(b"The 2 1 0.".to_vec(), result);
+ /// # Ok(()) }; example().unwrap()
+ /// ```
+ pub fn stream_replace_all_with<R, W, F>(
+ &self,
+ rdr: R,
+ mut wtr: W,
+ mut replace_with: F,
+ ) -> io::Result<()>
+ where
+ R: io::Read,
+ W: io::Write,
+ F: FnMut(&Match, &[u8], &mut W) -> io::Result<()>,
+ {
+ let mut it = StreamChunkIter::new(self, rdr);
+ while let Some(result) = it.next() {
+ let chunk = result?;
+ match chunk {
+ StreamChunk::NonMatch { bytes, .. } => {
+ wtr.write_all(bytes)?;
+ }
+ StreamChunk::Match { bytes, mat } => {
+ replace_with(&mat, bytes, &mut wtr)?;
+ }
+ }
+ }
+ Ok(())
+ }
+
+ /// Returns the match kind used by this automaton.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, MatchKind};
+ ///
+ /// let ac = AhoCorasick::new(&[
+ /// "foo", "bar", "quux", "baz",
+ /// ]);
+ /// assert_eq!(&MatchKind::Standard, ac.match_kind());
+ /// ```
+ pub fn match_kind(&self) -> &MatchKind {
+ self.imp.match_kind()
+ }
+
+ /// Returns the length of the longest pattern matched by this automaton.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasick;
+ ///
+ /// let ac = AhoCorasick::new(&[
+ /// "foo", "bar", "quux", "baz",
+ /// ]);
+ /// assert_eq!(4, ac.max_pattern_len());
+ /// ```
+ pub fn max_pattern_len(&self) -> usize {
+ self.imp.max_pattern_len()
+ }
+
+ /// Return the total number of patterns matched by this automaton.
+ ///
+ /// This includes patterns that may never participate in a match. For
+ /// example, if
+ /// [`MatchKind::LeftmostFirst`](enum.MatchKind.html#variant.LeftmostFirst)
+ /// match semantics are used, and the patterns `Sam` and `Samwise` were
+ /// used to build the automaton, then `Samwise` can never participate in a
+ /// match because `Sam` will always take priority.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasick;
+ ///
+ /// let ac = AhoCorasick::new(&[
+ /// "foo", "bar", "baz",
+ /// ]);
+ /// assert_eq!(3, ac.pattern_count());
+ /// ```
+ pub fn pattern_count(&self) -> usize {
+ self.imp.pattern_count()
+ }
+
+ /// Returns true if and only if this automaton supports reporting
+ /// overlapping matches.
+ ///
+ /// If this returns false and overlapping matches are requested, then it
+ /// will result in a panic.
+ ///
+ /// Since leftmost matching is inherently incompatible with overlapping
+ /// matches, only
+ /// [`MatchKind::Standard`](enum.MatchKind.html#variant.Standard)
+ /// supports overlapping matches. This is unlikely to change in the future.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .match_kind(MatchKind::Standard)
+ /// .build(&["foo", "bar", "baz"]);
+ /// assert!(ac.supports_overlapping());
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(&["foo", "bar", "baz"]);
+ /// assert!(!ac.supports_overlapping());
+ /// ```
+ pub fn supports_overlapping(&self) -> bool {
+ self.match_kind.supports_overlapping()
+ }
+
+ /// Returns true if and only if this automaton supports stream searching.
+ ///
+ /// If this returns false and stream searching (or replacing) is attempted,
+ /// then it will result in a panic.
+ ///
+ /// Currently, only
+ /// [`MatchKind::Standard`](enum.MatchKind.html#variant.Standard)
+ /// supports streaming. This may be expanded in the future.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .match_kind(MatchKind::Standard)
+ /// .build(&["foo", "bar", "baz"]);
+ /// assert!(ac.supports_stream());
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(&["foo", "bar", "baz"]);
+ /// assert!(!ac.supports_stream());
+ /// ```
+ pub fn supports_stream(&self) -> bool {
+ self.match_kind.supports_stream()
+ }
+
+ /// Returns the approximate total amount of heap used by this automaton, in
+ /// units of bytes.
+ ///
+ /// # Examples
+ ///
+ /// This example shows the difference in heap usage between a few
+ /// configurations:
+ ///
+ /// ```ignore
+ /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .dfa(false) // default
+ /// .build(&["foo", "bar", "baz"]);
+ /// assert_eq!(10_336, ac.heap_bytes());
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .dfa(false) // default
+ /// .ascii_case_insensitive(true)
+ /// .build(&["foo", "bar", "baz"]);
+ /// assert_eq!(10_384, ac.heap_bytes());
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .dfa(true)
+ /// .byte_classes(false)
+ /// .build(&["foo", "bar", "baz"]);
+ /// assert_eq!(20_768, ac.heap_bytes());
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .dfa(true)
+ /// .byte_classes(true) // default
+ /// .build(&["foo", "bar", "baz"]);
+ /// assert_eq!(1_248, ac.heap_bytes());
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .dfa(true)
+ /// .ascii_case_insensitive(true)
+ /// .build(&["foo", "bar", "baz"]);
+ /// assert_eq!(1_248, ac.heap_bytes());
+ /// ```
+ pub fn heap_bytes(&self) -> usize {
+ match self.imp {
+ Imp::NFA(ref nfa) => nfa.heap_bytes(),
+ Imp::DFA(ref dfa) => dfa.heap_bytes(),
+ }
+ }
+}
+
+/// The internal implementation of Aho-Corasick, which is either an NFA or
+/// a DFA. The NFA is slower but uses less memory. The DFA is faster but uses
+/// more memory.
+#[derive(Clone, Debug)]
+enum Imp<S: StateID> {
+ NFA(NFA<S>),
+ DFA(DFA<S>),
+}
+
+impl<S: StateID> Imp<S> {
+ /// Returns the type of match semantics implemented by this automaton.
+ fn match_kind(&self) -> &MatchKind {
+ match *self {
+ Imp::NFA(ref nfa) => nfa.match_kind(),
+ Imp::DFA(ref dfa) => dfa.match_kind(),
+ }
+ }
+
+ /// Returns the identifier of the start state.
+ fn start_state(&self) -> S {
+ match *self {
+ Imp::NFA(ref nfa) => nfa.start_state(),
+ Imp::DFA(ref dfa) => dfa.start_state(),
+ }
+ }
+
+ /// The length, in bytes, of the longest pattern in this automaton. This
+ /// information is useful for maintaining correct buffer sizes when
+ /// searching on streams.
+ fn max_pattern_len(&self) -> usize {
+ match *self {
+ Imp::NFA(ref nfa) => nfa.max_pattern_len(),
+ Imp::DFA(ref dfa) => dfa.max_pattern_len(),
+ }
+ }
+
+ /// The total number of patterns added to this automaton. This includes
+ /// patterns that may never match. The maximum pattern identifier that can
+ /// be reported in a match is exactly one less than this number.
+ fn pattern_count(&self) -> usize {
+ match *self {
+ Imp::NFA(ref nfa) => nfa.pattern_count(),
+ Imp::DFA(ref dfa) => dfa.pattern_count(),
+ }
+ }
+
+ #[inline(always)]
+ fn overlapping_find_at(
+ &self,
+ prestate: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ state_id: &mut S,
+ match_index: &mut usize,
+ ) -> Option<Match> {
+ match *self {
+ Imp::NFA(ref nfa) => nfa.overlapping_find_at(
+ prestate,
+ haystack,
+ at,
+ state_id,
+ match_index,
+ ),
+ Imp::DFA(ref dfa) => dfa.overlapping_find_at(
+ prestate,
+ haystack,
+ at,
+ state_id,
+ match_index,
+ ),
+ }
+ }
+
+ #[inline(always)]
+ fn earliest_find_at(
+ &self,
+ prestate: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ state_id: &mut S,
+ ) -> Option<Match> {
+ match *self {
+ Imp::NFA(ref nfa) => {
+ nfa.earliest_find_at(prestate, haystack, at, state_id)
+ }
+ Imp::DFA(ref dfa) => {
+ dfa.earliest_find_at(prestate, haystack, at, state_id)
+ }
+ }
+ }
+
+ #[inline(always)]
+ fn find_at_no_state(
+ &self,
+ prestate: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ ) -> Option<Match> {
+ match *self {
+ Imp::NFA(ref nfa) => nfa.find_at_no_state(prestate, haystack, at),
+ Imp::DFA(ref dfa) => dfa.find_at_no_state(prestate, haystack, at),
+ }
+ }
+}
+
+/// An iterator of non-overlapping matches in a particular haystack.
+///
+/// This iterator yields matches according to the
+/// [`MatchKind`](enum.MatchKind.html)
+/// used by this automaton.
+///
+/// This iterator is constructed via the
+/// [`AhoCorasick::find_iter`](struct.AhoCorasick.html#method.find_iter)
+/// method.
+///
+/// The type variable `S` refers to the representation used for state
+/// identifiers. (By default, this is `usize`.)
+///
+/// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton.
+///
+/// The lifetime `'b` refers to the lifetime of the haystack being searched.
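+///
+/// # Examples
+///
+/// An illustrative sketch (the patterns and haystack below are arbitrary)
+/// showing how match offsets can be collected from this iterator:
+///
+/// ```
+/// use aho_corasick::AhoCorasick;
+///
+/// let ac = AhoCorasick::new(&["b", "cd"]);
+/// let spans: Vec<(usize, usize)> = ac
+/// .find_iter("abcde")
+/// .map(|mat| (mat.start(), mat.end()))
+/// .collect();
+/// assert_eq!(vec![(1, 2), (2, 4)], spans);
+/// ```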
+#[derive(Debug)]
+pub struct FindIter<'a, 'b, S: 'a + StateID> {
+ fsm: &'a Imp<S>,
+ prestate: PrefilterState,
+ haystack: &'b [u8],
+ pos: usize,
+}
+
+impl<'a, 'b, S: StateID> FindIter<'a, 'b, S> {
+ fn new(ac: &'a AhoCorasick<S>, haystack: &'b [u8]) -> FindIter<'a, 'b, S> {
+ let prestate = PrefilterState::new(ac.max_pattern_len());
+ FindIter { fsm: &ac.imp, prestate, haystack, pos: 0 }
+ }
+}
+
+impl<'a, 'b, S: StateID> Iterator for FindIter<'a, 'b, S> {
+ type Item = Match;
+
+ fn next(&mut self) -> Option<Match> {
+ if self.pos > self.haystack.len() {
+ return None;
+ }
+ let result = self.fsm.find_at_no_state(
+ &mut self.prestate,
+ self.haystack,
+ self.pos,
+ );
+ let mat = match result {
+ None => return None,
+ Some(mat) => mat,
+ };
+ if mat.end() == self.pos {
+ // If the automaton can match the empty string and if we found an
+ // empty match, then we need to forcefully move the position.
+ self.pos += 1;
+ } else {
+ self.pos = mat.end();
+ }
+ Some(mat)
+ }
+}
+
+/// An iterator of overlapping matches in a particular haystack.
+///
+/// This iterator will report all possible matches in a particular haystack,
+/// even when the matches overlap.
+///
+/// This iterator is constructed via the
+/// [`AhoCorasick::find_overlapping_iter`](struct.AhoCorasick.html#method.find_overlapping_iter)
+/// method.
+///
+/// The type variable `S` refers to the representation used for state
+/// identifiers. (By default, this is `usize`.)
+///
+/// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton.
+///
+/// The lifetime `'b` refers to the lifetime of the haystack being searched.
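+///
+/// # Examples
+///
+/// An illustrative sketch (patterns and haystack chosen arbitrarily) showing
+/// that overlapping matches over the same region are all reported:
+///
+/// ```
+/// use aho_corasick::AhoCorasick;
+///
+/// let ac = AhoCorasick::new(&["app", "append"]);
+/// let spans: Vec<(usize, usize)> = ac
+/// .find_overlapping_iter("append")
+/// .map(|mat| (mat.start(), mat.end()))
+/// .collect();
+/// assert_eq!(vec![(0, 3), (0, 6)], spans);
+/// ```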
+#[derive(Debug)]
+pub struct FindOverlappingIter<'a, 'b, S: 'a + StateID> {
+ fsm: &'a Imp<S>,
+ prestate: PrefilterState,
+ haystack: &'b [u8],
+ pos: usize,
+ last_match_end: usize,
+ state_id: S,
+ match_index: usize,
+}
+
+impl<'a, 'b, S: StateID> FindOverlappingIter<'a, 'b, S> {
+ fn new(
+ ac: &'a AhoCorasick<S>,
+ haystack: &'b [u8],
+ ) -> FindOverlappingIter<'a, 'b, S> {
+ assert!(
+ ac.supports_overlapping(),
+ "automaton does not support overlapping searches"
+ );
+ let prestate = PrefilterState::new(ac.max_pattern_len());
+ FindOverlappingIter {
+ fsm: &ac.imp,
+ prestate,
+ haystack,
+ pos: 0,
+ last_match_end: 0,
+ state_id: ac.imp.start_state(),
+ match_index: 0,
+ }
+ }
+}
+
+impl<'a, 'b, S: StateID> Iterator for FindOverlappingIter<'a, 'b, S> {
+ type Item = Match;
+
+ fn next(&mut self) -> Option<Match> {
+ let result = self.fsm.overlapping_find_at(
+ &mut self.prestate,
+ self.haystack,
+ self.pos,
+ &mut self.state_id,
+ &mut self.match_index,
+ );
+ match result {
+ None => return None,
+ Some(m) => {
+ self.pos = m.end();
+ Some(m)
+ }
+ }
+ }
+}
+
+/// An iterator that reports Aho-Corasick matches in a stream.
+///
+/// This iterator yields elements of type `io::Result<Match>`, where an error
+/// is reported if there was a problem reading from the underlying stream.
+/// The iterator terminates only when the underlying stream reaches `EOF`.
+///
+/// This iterator is constructed via the
+/// [`AhoCorasick::stream_find_iter`](struct.AhoCorasick.html#method.stream_find_iter)
+/// method.
+///
+/// The type variable `R` refers to the `io::Read` stream that is being read
+/// from.
+///
+/// The type variable `S` refers to the representation used for state
+/// identifiers. (By default, this is `usize`.)
+///
+/// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton.
+#[derive(Debug)]
+pub struct StreamFindIter<'a, R, S: 'a + StateID> {
+ it: StreamChunkIter<'a, R, S>,
+}
+
+impl<'a, R: io::Read, S: StateID> StreamFindIter<'a, R, S> {
+ fn new(ac: &'a AhoCorasick<S>, rdr: R) -> StreamFindIter<'a, R, S> {
+ StreamFindIter { it: StreamChunkIter::new(ac, rdr) }
+ }
+}
+
+impl<'a, R: io::Read, S: StateID> Iterator for StreamFindIter<'a, R, S> {
+ type Item = io::Result<Match>;
+
+ fn next(&mut self) -> Option<io::Result<Match>> {
+ loop {
+ match self.it.next() {
+ None => return None,
+ Some(Err(err)) => return Some(Err(err)),
+ Some(Ok(StreamChunk::NonMatch { .. })) => {}
+ Some(Ok(StreamChunk::Match { mat, .. })) => {
+ return Some(Ok(mat));
+ }
+ }
+ }
+ }
+}
+
+/// An iterator over chunks in an underlying reader. Each chunk either
+/// corresponds to non-matching bytes or matching bytes, but all bytes from
+/// the underlying reader are reported in sequence. There may be an arbitrary
+/// number of non-matching chunks before seeing a matching chunk.
+///
+/// N.B. This does not actually implement Iterator because we need to borrow
+/// from the underlying reader. But conceptually, it's still an iterator.
+#[derive(Debug)]
+struct StreamChunkIter<'a, R, S: 'a + StateID> {
+ /// The AC automaton.
+ fsm: &'a Imp<S>,
+ /// State associated with this automaton's prefilter. It is a heuristic
+ /// for stopping the prefilter if it's deemed ineffective.
+ prestate: PrefilterState,
+ /// The source of bytes we read from.
+ rdr: R,
+ /// A fixed size buffer. This is what we actually search. There are some
+ /// invariants around the buffer's size, namely, it must be big enough to
+ /// contain the longest possible match.
+ buf: Buffer,
+ /// The ID of the FSM state we're currently in.
+ state_id: S,
+ /// The current position at which to start the next search in `buf`.
+ search_pos: usize,
+ /// The absolute position of `search_pos`, where `0` corresponds to the
+ /// position of the first byte read from `rdr`.
+ absolute_pos: usize,
+ /// The ending position of the last StreamChunk that was returned to the
+ /// caller. This position is used to determine whether we need to emit
+ /// non-matching bytes before emitting a match.
+ report_pos: usize,
+ /// A match that should be reported on the next call.
+ pending_match: Option<Match>,
+ /// Enabled only when the automaton can match the empty string. When
+ /// enabled, we need to execute one final search after consuming the
+ /// reader to find the trailing empty match.
+ has_empty_match_at_end: bool,
+}
+
+/// A single chunk yielded by the stream chunk iterator.
+///
+/// The `'r` lifetime refers to the lifetime of the stream chunk iterator.
+#[derive(Debug)]
+enum StreamChunk<'r> {
+ /// A chunk that does not contain any matches.
+ NonMatch { bytes: &'r [u8], start: usize },
+ /// A chunk that precisely contains a match.
+ Match { bytes: &'r [u8], mat: Match },
+}
+
+impl<'a, R: io::Read, S: StateID> StreamChunkIter<'a, R, S> {
+ fn new(ac: &'a AhoCorasick<S>, rdr: R) -> StreamChunkIter<'a, R, S> {
+ assert!(
+ ac.supports_stream(),
+ "stream searching is only supported for Standard match semantics"
+ );
+
+ let prestate = PrefilterState::new(ac.max_pattern_len());
+ let buf = Buffer::new(ac.imp.max_pattern_len());
+ let state_id = ac.imp.start_state();
+ StreamChunkIter {
+ fsm: &ac.imp,
+ prestate,
+ rdr,
+ buf,
+ state_id,
+ absolute_pos: 0,
+ report_pos: 0,
+ search_pos: 0,
+ pending_match: None,
+ has_empty_match_at_end: ac.is_match(""),
+ }
+ }
+
+ fn next<'r>(&'r mut self) -> Option<io::Result<StreamChunk<'r>>> {
+ loop {
+ if let Some(mut mat) = self.pending_match.take() {
+ let bytes = &self.buf.buffer()[mat.start()..mat.end()];
+ self.report_pos = mat.end();
+ mat = mat.increment(self.absolute_pos);
+ return Some(Ok(StreamChunk::Match { bytes, mat }));
+ }
+ if self.search_pos >= self.buf.len() {
+ if let Some(end) = self.unreported() {
+ let bytes = &self.buf.buffer()[self.report_pos..end];
+ let start = self.absolute_pos + self.report_pos;
+ self.report_pos = end;
+ return Some(Ok(StreamChunk::NonMatch { bytes, start }));
+ }
+ if self.buf.len() >= self.buf.min_buffer_len() {
+ // This is the point at which we roll our buffer, which we
+ // only do if our buffer has at least the minimum amount of
+ // bytes in it. Before rolling, we update our various
+ // positions to be consistent with the buffer after it has
+ // been rolled.
+
+ self.report_pos -=
+ self.buf.len() - self.buf.min_buffer_len();
+ self.absolute_pos +=
+ self.search_pos - self.buf.min_buffer_len();
+ self.search_pos = self.buf.min_buffer_len();
+ self.buf.roll();
+ }
+ match self.buf.fill(&mut self.rdr) {
+ Err(err) => return Some(Err(err)),
+ Ok(false) => {
+ // We've hit EOF, but if there are still some
+ // unreported bytes remaining, return them now.
+ if self.report_pos < self.buf.len() {
+ let bytes = &self.buf.buffer()[self.report_pos..];
+ let start = self.absolute_pos + self.report_pos;
+ self.report_pos = self.buf.len();
+
+ let chunk = StreamChunk::NonMatch { bytes, start };
+ return Some(Ok(chunk));
+ } else {
+ // We've reported everything, but there might still
+ // be a match at the very last position.
+ if !self.has_empty_match_at_end {
+ return None;
+ }
+ // fallthrough for another search to get trailing
+ // empty matches
+ self.has_empty_match_at_end = false;
+ }
+ }
+ Ok(true) => {}
+ }
+ }
+ let result = self.fsm.earliest_find_at(
+ &mut self.prestate,
+ self.buf.buffer(),
+ self.search_pos,
+ &mut self.state_id,
+ );
+ match result {
+ None => {
+ self.search_pos = self.buf.len();
+ }
+ Some(mat) => {
+ self.state_id = self.fsm.start_state();
+ if mat.end() == self.search_pos {
+ // If the automaton can match the empty string and if
+ // we found an empty match, then we need to forcefully
+ // move the position.
+ self.search_pos += 1;
+ } else {
+ self.search_pos = mat.end();
+ }
+ self.pending_match = Some(mat.clone());
+ if self.report_pos < mat.start() {
+ let bytes =
+ &self.buf.buffer()[self.report_pos..mat.start()];
+ let start = self.absolute_pos + self.report_pos;
+ self.report_pos = mat.start();
+
+ let chunk = StreamChunk::NonMatch { bytes, start };
+ return Some(Ok(chunk));
+ }
+ }
+ }
+ }
+ }
+
+ fn unreported(&self) -> Option<usize> {
+ let end = self.search_pos.saturating_sub(self.buf.min_buffer_len());
+ if self.report_pos < end {
+ Some(end)
+ } else {
+ None
+ }
+ }
+}
+
+/// A builder for configuring an Aho-Corasick automaton.
+#[derive(Clone, Debug)]
+pub struct AhoCorasickBuilder {
+ nfa_builder: nfa::Builder,
+ dfa_builder: dfa::Builder,
+ dfa: bool,
+}
+
+impl Default for AhoCorasickBuilder {
+ fn default() -> AhoCorasickBuilder {
+ AhoCorasickBuilder::new()
+ }
+}
+
+impl AhoCorasickBuilder {
+ /// Create a new builder for configuring an Aho-Corasick automaton.
+ ///
+ /// If you don't need fine grained configuration or aren't sure which knobs
+ /// to set, try using
+ /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured)
+ /// instead.
+ pub fn new() -> AhoCorasickBuilder {
+ AhoCorasickBuilder {
+ nfa_builder: nfa::Builder::new(),
+ dfa_builder: dfa::Builder::new(),
+ dfa: false,
+ }
+ }
+
+ /// Build an Aho-Corasick automaton using the configuration set on this
+ /// builder.
+ ///
+ /// A builder may be reused to create more automatons.
+ ///
+ /// This method will use the default for representing internal state
+ /// identifiers, which is `usize`. This guarantees that building the
+ /// automaton will succeed and is generally a good default, but can make
+ /// the size of the automaton 2-8 times bigger than it needs to be,
+ /// depending on your target platform.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasickBuilder;
+ ///
+ /// let patterns = &["foo", "bar", "baz"];
+ /// let ac = AhoCorasickBuilder::new()
+ /// .build(patterns);
+ /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
+ /// ```
+ pub fn build<I, P>(&self, patterns: I) -> AhoCorasick
+ where
+ I: IntoIterator<Item = P>,
+ P: AsRef<[u8]>,
+ {
+ // The builder only returns an error if the chosen state ID
+ // representation is too small to fit all of the given patterns. In
+ // this case, since we fix the representation to usize, it will always
+ // work because it's impossible to overflow usize since the underlying
+ // storage would OOM long before that happens.
+ self.build_with_size::<usize, I, P>(patterns)
+ .expect("usize state ID type should always work")
+ }
+
+ /// Build an Aho-Corasick automaton using the configuration set on this
+ /// builder with a specific state identifier representation. This only has
+ /// an effect when the `dfa` option is enabled.
+ ///
+ /// Generally, the choices for a state identifier representation are
+ /// `u8`, `u16`, `u32`, `u64` or `usize`, with `usize` being the default.
+ /// The advantage of choosing a smaller state identifier representation
+ /// is that the automaton produced will be smaller. This might be
+ /// beneficial for just generally using less space, or might even allow it
+ /// to fit more of the automaton in your CPU's cache, leading to overall
+ /// better search performance.
+ ///
+ /// Unlike the standard `build` method, this can report an error if the
+ /// state identifier representation cannot support the size of the
+ /// automaton.
+ ///
+ /// Note that the state identifier representation is determined by the
+ /// `S` type variable. This requires a type hint of some sort, either
+ /// by specifying the return type or using the turbofish, e.g.,
+ /// `build_with_size::<u16, _, _>(...)`.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, AhoCorasickBuilder};
+ ///
+ /// # fn example() -> Result<(), ::aho_corasick::Error> {
+ /// let patterns = &["foo", "bar", "baz"];
+ /// let ac: AhoCorasick<u8> = AhoCorasickBuilder::new()
+ /// .build_with_size(patterns)?;
+ /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
+ /// # Ok(()) }; example().unwrap()
+ /// ```
+ ///
+ /// Or alternatively, with turbofish:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasickBuilder;
+ ///
+ /// # fn example() -> Result<(), ::aho_corasick::Error> {
+ /// let patterns = &["foo", "bar", "baz"];
+ /// let ac = AhoCorasickBuilder::new()
+ /// .build_with_size::<u8, _, _>(patterns)?;
+ /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
+ /// # Ok(()) }; example().unwrap()
+ /// ```
+ pub fn build_with_size<S, I, P>(
+ &self,
+ patterns: I,
+ ) -> Result<AhoCorasick<S>>
+ where
+ S: StateID,
+ I: IntoIterator<Item = P>,
+ P: AsRef<[u8]>,
+ {
+ let nfa = self.nfa_builder.build(patterns)?;
+ let match_kind = nfa.match_kind().clone();
+ let imp = if self.dfa {
+ let dfa = self.dfa_builder.build(&nfa)?;
+ Imp::DFA(dfa)
+ } else {
+ Imp::NFA(nfa)
+ };
+ Ok(AhoCorasick { imp, match_kind })
+ }
+
+ /// Automatically configure the settings on this builder according to the
+ /// patterns that will be used to construct the automaton.
+ ///
+ /// The idea here is to balance space and time automatically. That is, when
+ /// searching a small number of patterns, this will attempt to use the
+ /// fastest possible configuration since the total space required will be
+ /// small anyway. As the number of patterns grows, this will fall back to
+ /// slower configurations that use less space.
+ ///
+ /// This is guaranteed to never set `match_kind`, but any other option may
+ /// be overridden.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasickBuilder;
+ ///
+ /// let patterns = &["foo", "bar", "baz"];
+ /// let ac = AhoCorasickBuilder::new()
+ /// .auto_configure(patterns)
+ /// .build(patterns);
+ /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
+ /// ```
+ pub fn auto_configure<B: AsRef<[u8]>>(
+ &mut self,
+ patterns: &[B],
+ ) -> &mut AhoCorasickBuilder {
+ // N.B. Currently we only use the length of `patterns` to make a
+ // decision here, and could therefore ask for an `ExactSizeIterator`
+ // instead. But it's conceivable that we might adapt this to look at
+ // the total number of bytes, which would require a second pass.
+ //
+ // The logic here is fairly rudimentary at the moment, but probably
+ // OK. The idea here is to use the fastest thing possible for a small
+ // number of patterns. That is, a DFA with no byte classes, since byte
+ // classes require an extra indirection for every byte searched. With a
+ // moderate number of patterns, we still want a DFA, but save on both
+ // space and compilation time by enabling byte classes. Finally, fall
+ // back to the slower but smaller NFA.
+ if patterns.len() <= 100 {
+ // N.B. Using byte classes can actually be faster by improving
+ // locality, but this only really applies for multi-megabyte
+ // automata (i.e., automata that don't fit in your CPU's cache).
+ self.dfa(true).byte_classes(false);
+ } else if patterns.len() <= 5000 {
+ self.dfa(true);
+ }
+ self
+ }
+
+ /// Set the desired match semantics.
+ ///
+ /// The default is `MatchKind::Standard`, which corresponds to the match
+ /// semantics supported by the standard textbook description of the
+ /// Aho-Corasick algorithm. Namely, matches are reported as soon as they
+ /// are found. Moreover, this is the only way to get overlapping matches
+ /// or do stream searching.
+ ///
+ /// The other kinds of match semantics that are supported are
+ /// `MatchKind::LeftmostFirst` and `MatchKind::LeftmostLongest`. The former
+ /// corresponds to the match you would get if you were to try to match
+ /// each pattern at each position in the haystack in the same order that
+ /// you give to the automaton. That is, it returns the leftmost match
+ /// corresponding to the earliest pattern given to the automaton. The latter
+ /// corresponds to finding the longest possible match among all leftmost
+ /// matches.
+ ///
+ /// For more details on match semantics, see the
+ /// [documentation for `MatchKind`](enum.MatchKind.html).
+ ///
+ /// # Examples
+ ///
+ /// In these examples, we demonstrate the differences between match
+ /// semantics for a particular set of patterns in a specific order:
+ /// `b`, `abc`, `abcd`.
+ ///
+ /// Standard semantics:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
+ ///
+ /// let patterns = &["b", "abc", "abcd"];
+ /// let haystack = "abcd";
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .match_kind(MatchKind::Standard) // default, not necessary
+ /// .build(patterns);
+ /// let mat = ac.find(haystack).expect("should have a match");
+ /// assert_eq!("b", &haystack[mat.start()..mat.end()]);
+ /// ```
+ ///
+ /// Leftmost-first semantics:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
+ ///
+ /// let patterns = &["b", "abc", "abcd"];
+ /// let haystack = "abcd";
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(patterns);
+ /// let mat = ac.find(haystack).expect("should have a match");
+ /// assert_eq!("abc", &haystack[mat.start()..mat.end()]);
+ /// ```
+ ///
+ /// Leftmost-longest semantics:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
+ ///
+ /// let patterns = &["b", "abc", "abcd"];
+ /// let haystack = "abcd";
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .match_kind(MatchKind::LeftmostLongest)
+ /// .build(patterns);
+ /// let mat = ac.find(haystack).expect("should have a match");
+ /// assert_eq!("abcd", &haystack[mat.start()..mat.end()]);
+ /// ```
+ pub fn match_kind(&mut self, kind: MatchKind) -> &mut AhoCorasickBuilder {
+ self.nfa_builder.match_kind(kind);
+ self
+ }
+
+ /// Enable anchored mode, which requires all matches to start at the
+ /// first position in a haystack.
+ ///
+ /// This option is disabled by default.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasickBuilder;
+ ///
+ /// let patterns = &["foo", "bar"];
+ /// let haystack = "foobar";
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .anchored(true)
+ /// .build(patterns);
+ /// assert_eq!(1, ac.find_iter(haystack).count());
+ /// ```
+ ///
+ /// When searching for overlapping matches, all matches that start at
+ /// the beginning of a haystack will be reported:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasickBuilder;
+ ///
+ /// let patterns = &["foo", "foofoo"];
+ /// let haystack = "foofoo";
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .anchored(true)
+ /// .build(patterns);
+ /// assert_eq!(2, ac.find_overlapping_iter(haystack).count());
+ /// // A non-anchored search would return 3 matches.
+ /// ```
+ pub fn anchored(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
+ self.nfa_builder.anchored(yes);
+ self
+ }
+
+ /// Enable ASCII-aware case insensitive matching.
+ ///
+ /// When this option is enabled, searching will be performed without
+ /// respect to case for ASCII letters (`a-z` and `A-Z`) only.
+ ///
+ /// Enabling this option does not change the search algorithm, but it may
+ /// increase the size of the automaton.
+ ///
+ /// **NOTE:** In the future, support for full Unicode case insensitivity
+ /// may be added, but ASCII case insensitivity is comparatively much
+ /// simpler to add.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasickBuilder;
+ ///
+ /// let patterns = &["FOO", "bAr", "BaZ"];
+ /// let haystack = "foo bar baz";
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .ascii_case_insensitive(true)
+ /// .build(patterns);
+ /// assert_eq!(3, ac.find_iter(haystack).count());
+ /// ```
+ pub fn ascii_case_insensitive(
+ &mut self,
+ yes: bool,
+ ) -> &mut AhoCorasickBuilder {
+ self.nfa_builder.ascii_case_insensitive(yes);
+ self
+ }
+
+ /// Set the limit on how many NFA states use a dense representation for
+ /// their transitions.
+ ///
+ /// A dense representation uses more space, but supports faster access to
+ /// transitions at search time. Thus, this setting permits control over a
+ /// space versus time trade-off when using the NFA variant of Aho-Corasick.
+ ///
+ /// This limit is expressed in terms of the depth of a state, i.e., the
+ /// number of transitions from the starting state of the NFA. The idea is
+ /// that most of the time searching will be spent near the starting state
+ /// of the automaton, so states near the start state should use a dense
+ /// representation. States further away from the start state would then use
+ /// a sparse representation, which uses less space but is slower to access
+ /// transitions at search time.
+ ///
+ /// By default, this is set to a low but non-zero number.
+ ///
+ /// This setting has no effect if the `dfa` option is enabled.
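+ ///
+ /// # Examples
+ ///
+ /// An illustrative sketch; the depth value below is arbitrary rather than
+ /// a tuned recommendation:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasickBuilder;
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .dense_depth(2) // arbitrary illustrative value
+ /// .build(&["foo", "bar", "baz"]);
+ /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
+ /// ```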
+ pub fn dense_depth(&mut self, depth: usize) -> &mut AhoCorasickBuilder {
+ self.nfa_builder.dense_depth(depth);
+ self
+ }
+
+ /// Compile the standard Aho-Corasick automaton into a deterministic finite
+ /// automaton (DFA).
+ ///
+ /// When this is disabled (which is the default), then a non-deterministic
+ /// finite automaton (NFA) is used instead.
+ ///
+ /// The main benefit to a DFA is that it can execute searches more quickly
+ /// than an NFA (perhaps 2-4 times as fast). The main drawback is that the
+ /// DFA uses more space and can take much longer to build.
+ ///
+ /// Enabling this option does not change the time complexity for
+ /// constructing the Aho-Corasick automaton (which is `O(p)` where
+ /// `p` is the total number of patterns being compiled). Enabling this
+ /// option does however reduce the time complexity of non-overlapping
+ /// searches from `O(n + p)` to `O(n)`, where `n` is the length of the
+ /// haystack.
+ ///
+ /// In general, it's a good idea to enable this if you're searching a
+ /// small number of fairly short patterns (~1000), or if you want the
+ /// fastest possible search without regard to compilation time or space
+ /// usage.
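+ ///
+ /// # Examples
+ ///
+ /// An illustrative sketch; the DFA-backed automaton reports the same
+ /// matches as the default NFA, only the internal representation differs:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasickBuilder;
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .dfa(true)
+ /// .build(&["foo", "bar", "baz"]);
+ /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
+ /// ```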
+ pub fn dfa(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
+ self.dfa = yes;
+ self
+ }
+
+ /// Enable heuristic prefilter optimizations.
+ ///
+ /// When enabled, searching will attempt to quickly skip to match
+ /// candidates using specialized literal search routines. A prefilter
+ /// cannot always be used, and is generally treated as a heuristic. It
+ /// can be useful to disable this if the prefilter is observed to be
+ /// sub-optimal for a particular workload.
+ ///
+ /// This is enabled by default.
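+ ///
+ /// # Examples
+ ///
+ /// An illustrative sketch; disabling the prefilter does not change which
+ /// matches are reported, only how candidates are located:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasickBuilder;
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .prefilter(false)
+ /// .build(&["foo", "bar", "baz"]);
+ /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
+ /// ```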
+ pub fn prefilter(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
+ self.nfa_builder.prefilter(yes);
+ self
+ }
+
+ /// Shrink the size of the transition alphabet by mapping bytes to their
+ /// equivalence classes. This only has an effect when the `dfa` option is
+ /// enabled.
+ ///
+ /// When enabled, the DFA will use a map from all possible bytes
+ /// to their corresponding equivalence class. Each equivalence class
+ /// represents a set of bytes that does not discriminate between a match
+ /// and a non-match in the DFA. For example, the patterns `bar` and `baz`
+ /// have at least five equivalence classes: singleton sets of `b`, `a`, `r`
+ /// and `z`, and a final set that contains every other byte.
+ ///
+ /// The advantage of this map is that the size of the transition table can
+ /// be reduced drastically from `#states * 256 * sizeof(id)` to
+ /// `#states * k * sizeof(id)` where `k` is the number of equivalence
+ /// classes. As a result, total space usage can decrease substantially.
+ /// Moreover, since a smaller alphabet is used, compilation becomes faster
+ /// as well.
+ ///
+ /// The disadvantage of this map is that every byte searched must be
+ /// passed through this map before it can be used to determine the next
+ /// transition. This has a small match time performance cost. However, if
+ /// the DFA is otherwise very large without byte classes, then using byte
+ /// classes can greatly improve memory locality and thus lead to better
+ /// overall performance.
+ ///
+ /// This option is enabled by default.
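+ ///
+ /// # Examples
+ ///
+ /// An illustrative sketch; byte classes only affect the DFA's internal
+ /// representation, not the matches reported:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasickBuilder;
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .dfa(true)
+ /// .byte_classes(false)
+ /// .build(&["foo", "bar", "baz"]);
+ /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
+ /// ```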
+ pub fn byte_classes(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
+ self.dfa_builder.byte_classes(yes);
+ self
+ }
+
+ /// Premultiply state identifiers in the transition table. This only has
+ /// an effect when the `dfa` option is enabled.
+ ///
+ /// When enabled, state identifiers are premultiplied to point to their
+ /// corresponding row in the transition table. That is, given the `i`th
+ /// state, its corresponding premultiplied identifier is `i * k` where `k`
+ /// is the alphabet size of the automaton. (The alphabet size is at most
+ /// 256, but is in practice smaller if byte classes is enabled.)
+ ///
+ /// When state identifiers are not premultiplied, then the identifier of
+ /// the `i`th state is `i`.
+ ///
+ /// The advantage of premultiplying state identifiers is that it saves a
+ /// multiplication instruction per byte when searching with a DFA. This has
+ /// been observed to lead to a 20% performance benefit in micro-benchmarks.
+ ///
+ /// The primary disadvantage of premultiplying state identifiers is
+ /// that they require a larger integer size to represent. For example,
+ /// if the DFA has 200 states, then its premultiplied form requires 16
+ /// bits to represent every possible state identifier, whereas its
+ /// non-premultiplied form only requires 8 bits.
+ ///
+ /// This option is enabled by default.
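+ ///
+ /// # Examples
+ ///
+ /// An illustrative sketch; premultiplication is an internal representation
+ /// detail and does not change the matches reported:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasickBuilder;
+ ///
+ /// let ac = AhoCorasickBuilder::new()
+ /// .dfa(true)
+ /// .premultiply(false)
+ /// .build(&["foo", "bar", "baz"]);
+ /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
+ /// ```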
+ pub fn premultiply(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
+ self.dfa_builder.premultiply(yes);
+ self
+ }
+}
+
+/// A knob for controlling the match semantics of an Aho-Corasick automaton.
+///
+/// There are two generally different ways that Aho-Corasick automatons can
+/// report matches. The first way is the "standard" approach that results from
+/// implementing most textbook explanations of Aho-Corasick. The second way is
+/// to report only the leftmost non-overlapping matches. The leftmost approach
+/// is in turn split into two different ways of resolving ambiguous matches:
+/// leftmost-first and leftmost-longest.
+///
+/// The `Standard` match kind is the default and is the only one that supports
+/// overlapping matches and stream searching. (Trying to find overlapping
+/// or streaming matches using leftmost match semantics will result in a
+/// panic.) The `Standard` match kind will report matches as they are seen.
+/// When searching for overlapping matches, then all possible matches are
+/// reported. When searching for non-overlapping matches, the first match seen
+/// is reported. For example, for non-overlapping matches, given the patterns
+/// `abcd` and `b` and the subject string `abcdef`, only a match for `b` is
+/// reported since it is detected first. The `abcd` match is never reported
+/// since it overlaps with the `b` match.
+///
+/// In contrast, the leftmost match kind always prefers the leftmost match
+/// among all possible matches. Given the same example as above with `abcd` and
+/// `b` as patterns and `abcdef` as the subject string, the leftmost match is
+/// `abcd` since it begins before the `b` match, even though the `b` match is
+/// detected before the `abcd` match. In this case, the `b` match is not
+/// reported at all since it overlaps with the `abcd` match.
+///
+/// The difference between leftmost-first and leftmost-longest is in how they
+/// resolve ambiguous matches when there are multiple leftmost matches to
+/// choose from. Leftmost-first always chooses the pattern that was provided
+/// earliest, whereas leftmost-longest always chooses the longest matching
+/// pattern. For example, given the patterns `a` and `ab` and the subject
+/// string `ab`, the leftmost-first match is `a` but the leftmost-longest match
+/// is `ab`. Conversely, if the patterns were given in reverse order, i.e.,
+/// `ab` and `a`, then both the leftmost-first and leftmost-longest matches
+/// would be `ab`. Stated differently, the leftmost-first match depends on the
+/// order in which the patterns were given to the Aho-Corasick automaton.
+/// Because of that, when leftmost-first matching is used, if a pattern `A`
+/// that appears before a pattern `B` is a prefix of `B`, then it is impossible
+/// to ever observe a match of `B`.
+///
+/// If you're not sure which match kind to pick, then stick with the standard
+/// kind, which is the default. In particular, if you need overlapping or
+/// streaming matches, then you _must_ use the standard kind. The leftmost
+/// kinds are useful in specific circumstances. For example, leftmost-first can
+/// be very useful as a way to implement match priority based on the order of
+/// patterns given and leftmost-longest can be useful for dictionary searching
+/// such that only the longest matching words are reported.
+///
+/// # Relationship with regular expression alternations
+///
+/// Understanding match semantics can be a little tricky, and one easy way
+/// to conceptualize non-overlapping matches from an Aho-Corasick automaton
+/// is to think about them as a simple alternation of literals in a regular
+/// expression. For example, let's say we wanted to match the strings
+/// `Sam` and `Samwise`, which would turn into the regex `Sam|Samwise`. It
+/// turns out that regular expression engines have two different ways of
+/// matching this alternation. The first way, leftmost-longest, is commonly
+/// found in POSIX compatible implementations of regular expressions (such as
+/// `grep`). The second way, leftmost-first, is commonly found in backtracking
+/// implementations such as Perl. (Some regex engines, such as RE2 and Rust's
+/// regex engine, do not use backtracking, but still implement leftmost-first
+/// semantics in an effort to match the behavior of dominant backtracking
+/// regex engines such as those found in Perl, Ruby, Python, Javascript and
+/// PHP.)
+///
+/// That is, when matching `Sam|Samwise` against `Samwise`, a POSIX regex
+/// will match `Samwise` because it is the longest possible match, but a
+/// Perl-like regex will match `Sam` since it appears earlier in the
+/// alternation. Indeed, the regex `Sam|Samwise` in a Perl-like regex engine
+/// will never match `Samwise` since `Sam` will always have higher priority.
+/// Conversely, matching the regex `Samwise|Sam` against `Samwise` will lead to
+/// a match of `Samwise` in both POSIX and Perl-like regexes since `Samwise` is
+/// still the longest match, but it also appears earlier than `Sam`.
+///
+/// The "standard" match semantics of Aho-Corasick generally don't correspond
+/// to the match semantics of any large group of regex implementations, so
+/// there's no direct analogy that can be made here. Standard match semantics
+/// are generally useful for overlapping matches, or if you just want to see
+/// matches as they are detected.
+///
+/// The main conclusion to draw from this section is that the match semantics
+/// can be tweaked to precisely match either Perl-like regex alternations or
+/// POSIX regex alternations.
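+///
+/// # Examples
+///
+/// An illustrative sketch of the `Sam`/`Samwise` example above, contrasting
+/// leftmost-first and leftmost-longest semantics:
+///
+/// ```
+/// use aho_corasick::{AhoCorasickBuilder, MatchKind};
+///
+/// let haystack = "Samwise";
+///
+/// let ac = AhoCorasickBuilder::new()
+/// .match_kind(MatchKind::LeftmostFirst)
+/// .build(&["Sam", "Samwise"]);
+/// let mat = ac.find(haystack).expect("should have a match");
+/// assert_eq!("Sam", &haystack[mat.start()..mat.end()]);
+///
+/// let ac = AhoCorasickBuilder::new()
+/// .match_kind(MatchKind::LeftmostLongest)
+/// .build(&["Sam", "Samwise"]);
+/// let mat = ac.find(haystack).expect("should have a match");
+/// assert_eq!("Samwise", &haystack[mat.start()..mat.end()]);
+/// ```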
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum MatchKind {
+ /// Use standard match semantics, which support overlapping matches. When
+ /// used with non-overlapping matches, matches are reported as they are
+ /// seen.
+ Standard,
+ /// Use leftmost-first match semantics, which reports leftmost matches.
+ /// When there are multiple possible leftmost matches, the match
+ /// corresponding to the pattern that appeared earlier when constructing
+ /// the automaton is reported.
+ ///
+ /// This does **not** support overlapping matches or stream searching. If
+ /// this match kind is used, attempting to find overlapping matches or
+ /// stream matches will panic.
+ LeftmostFirst,
+ /// Use leftmost-longest match semantics, which reports leftmost matches.
+ /// When there are multiple possible leftmost matches, the longest match
+ /// is chosen.
+ ///
+ /// This does **not** support overlapping matches or stream searching. If
+ /// this match kind is used, attempting to find overlapping matches or
+ /// stream matches will panic.
+ LeftmostLongest,
+ /// Hints that destructuring should not be exhaustive.
+ ///
+ /// This enum may grow additional variants, so this makes sure clients
+ /// don't count on exhaustive matching. (Otherwise, adding a new variant
+ /// could break existing code.)
+ #[doc(hidden)]
+ __Nonexhaustive,
+}
+
+/// The default match kind is `MatchKind::Standard`.
+impl Default for MatchKind {
+ fn default() -> MatchKind {
+ MatchKind::Standard
+ }
+}
+
+impl MatchKind {
+ fn supports_overlapping(&self) -> bool {
+ self.is_standard()
+ }
+
+ fn supports_stream(&self) -> bool {
+ // TODO: It may be possible to support this. It's hard.
+ //
+ // See: https://github.com/rust-lang/regex/issues/425#issuecomment-471367838
+ self.is_standard()
+ }
+
+ pub(crate) fn is_standard(&self) -> bool {
+ *self == MatchKind::Standard
+ }
+
+ pub(crate) fn is_leftmost(&self) -> bool {
+ *self == MatchKind::LeftmostFirst
+ || *self == MatchKind::LeftmostLongest
+ }
+
+ pub(crate) fn is_leftmost_first(&self) -> bool {
+ *self == MatchKind::LeftmostFirst
+ }
+
+ /// Convert this match kind into a packed match kind. If this match kind
+ /// corresponds to standard semantics, then this returns None, since
+ /// packed searching does not support standard semantics.
+ pub(crate) fn as_packed(&self) -> Option<packed::MatchKind> {
+ match *self {
+ MatchKind::Standard => None,
+ MatchKind::LeftmostFirst => Some(packed::MatchKind::LeftmostFirst),
+ MatchKind::LeftmostLongest => {
+ Some(packed::MatchKind::LeftmostLongest)
+ }
+ MatchKind::__Nonexhaustive => unreachable!(),
+ }
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn oibits() {
+ use std::panic::{RefUnwindSafe, UnwindSafe};
+
+ fn assert_send<T: Send>() {}
+ fn assert_sync<T: Sync>() {}
+ fn assert_unwind_safe<T: RefUnwindSafe + UnwindSafe>() {}
+
+ assert_send::<AhoCorasick>();
+ assert_sync::<AhoCorasick>();
+ assert_unwind_safe::<AhoCorasick>();
+ assert_send::<AhoCorasickBuilder>();
+ assert_sync::<AhoCorasickBuilder>();
+ assert_unwind_safe::<AhoCorasickBuilder>();
+ }
+}
diff --git a/src/automaton.rs b/src/automaton.rs
new file mode 100644
index 0000000..2ada1a0
--- /dev/null
+++ b/src/automaton.rs
@@ -0,0 +1,573 @@
+use ahocorasick::MatchKind;
+use prefilter::{self, Candidate, Prefilter, PrefilterState};
+use state_id::{dead_id, fail_id, StateID};
+use Match;
+
+// NOTE: This trait essentially started as a copy of the same trait from
+// regex-automata, with some wording changed since we use this trait for
+// NFAs in addition to DFAs in this crate. Additionally, we do not export
+// this trait. It's only used internally to reduce code duplication. The
+// regex-automata crate needs to expose it because its Regex type is generic
+// over implementations of this trait. In this crate, we encapsulate everything
+// behind the AhoCorasick type.
+//
+// This trait is a bit of a mess, but it's not quite clear how to fix it.
+// Basically, there are several competing concerns:
+//
+// * We need performance, so everything effectively needs to get monomorphized.
+// * There are several variations on searching Aho-Corasick automatons:
+// overlapping, standard and leftmost. Overlapping and standard are somewhat
+// combined together below, but there is no real way to combine standard with
+// leftmost. Namely, leftmost requires continuing a search even after a match
+// is found, in order to correctly disambiguate a match.
+// * On top of that, *sometimes* callers want to know which state the automaton
+// is in after searching. This is principally useful for overlapping and
+// stream searches. However, when callers don't care about this, we really
+// do not want to be forced to compute it, since it sometimes requires extra
+// work. Thus, there are effectively two copies of leftmost searching: one
+// that tracks the state ID and one that doesn't. We should ideally do the
+// same for standard searching, but my sanity stopped me.
+
+// SAFETY RATIONALE: Previously, the code below went to some length to remove
+// all bounds checks. This generally produced tighter assembly and led to
+// 20-50% improvements in micro-benchmarks on corpora made up of random
+// characters. This somewhat makes sense, since the branch predictor is going
+// to be at its worst on random text.
+//
+// However, using the aho-corasick-debug tool and manually benchmarking
+// different inputs, the code *with* bounds checks actually wound up being
+// slightly faster:
+//
+// $ cat input
+// Sherlock Holmes
+// John Watson
+// Professor Moriarty
+// Irene Adler
+// Mary Watson
+//
+// $ aho-corasick-debug-safe \
+// input OpenSubtitles2018.raw.sample.en --kind leftmost-first --dfa
+// pattern read time: 32.824µs
+// automaton build time: 444.687µs
+// automaton heap usage: 72392 bytes
+// match count: 639
+// count time: 1.809961702s
+//
+// $ aho-corasick-debug-master \
+// input OpenSubtitles2018.raw.sample.en --kind leftmost-first --dfa
+// pattern read time: 31.425µs
+// automaton build time: 317.434µs
+// automaton heap usage: 72392 bytes
+// match count: 639
+// count time: 2.059157705s
+//
+// I was able to reproduce this result on two different machines (an i5 and
+// an i7). Therefore, we go the route of safe code for now.
+
+/// A trait describing the interface of an Aho-Corasick finite state machine.
+///
+/// Every automaton has exactly one fail state, one dead state and exactly one
+/// start state. Generally, these correspond to the first, second and third
+/// states, respectively. The failure state is always treated as a sentinel.
+/// That is, no correct Aho-Corasick automaton will ever transition into the
+/// fail state. The dead state, however, can be transitioned into, but only
+/// when leftmost-first or leftmost-longest match semantics are enabled and
+/// only when at least one match has been observed.
+///
+/// Every automaton also has one or more match states, such that
+/// `Automaton::is_match_state(id)` returns `true` if and only if `id`
+/// corresponds to a match state.
+pub trait Automaton {
+ /// The representation used for state identifiers in this automaton.
+ ///
+ /// Typically, this is one of `u8`, `u16`, `u32`, `u64` or `usize`.
+ type ID: StateID;
+
+ /// The type of matching that should be done.
+ fn match_kind(&self) -> &MatchKind;
+
+ /// Returns true if and only if this automaton uses anchored searches.
+ fn anchored(&self) -> bool;
+
+ /// An optional prefilter for quickly skipping to the next candidate match.
+ /// A prefilter must report at least every match, although it may report
+ /// positions that do not correspond to a match. That is, it must not allow
+ /// false negatives, but can allow false positives.
+ ///
+ /// Currently, a prefilter only runs when the automaton is in the start
+ /// state. That is, the position reported by a prefilter should always
+ /// correspond to the start of a potential match.
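+ ///
+ /// As an illustrative (editorial) example: a prefilter built from the
+ /// leading bytes of the patterns `Sam` and `Samwise` may report every
+ /// position of the byte `S` in the haystack; any position it skips is
+ /// guaranteed not to begin a match.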
+ fn prefilter(&self) -> Option<&dyn Prefilter>;
+
+ /// Return the identifier of this automaton's start state.
+ fn start_state(&self) -> Self::ID;
+
+ /// Returns true if and only if the given state identifier refers to a
+ /// valid state.
+ fn is_valid(&self, id: Self::ID) -> bool;
+
+ /// Returns true if and only if the given identifier corresponds to a match
+ /// state.
+ ///
+ /// The state ID given must be valid, or else implementors may panic.
+ fn is_match_state(&self, id: Self::ID) -> bool;
+
+ /// Returns true if and only if the given identifier corresponds to a state
+ /// that is either the dead state or a match state.
+ ///
+ /// Depending on the implementation of the automaton, this routine can
+ /// be used to save a branch in the core matching loop. Nevertheless,
+ /// `is_match_state(id) || id == dead_id()` is always a valid
+ /// implementation. Indeed, this is the default implementation.
+ ///
+ /// The state ID given must be valid, or else implementors may panic.
+ fn is_match_or_dead_state(&self, id: Self::ID) -> bool {
+ id == dead_id() || self.is_match_state(id)
+ }
+
+ /// If the given state is a match state, return the match corresponding
+ /// to the given match index. `end` must be the ending position of the
+ /// detected match. If no match exists or if `match_index` exceeds the
+ /// number of matches in this state, then `None` is returned.
+ ///
+ /// The state ID given must be valid, or else implementors may panic.
+ ///
+ /// If the given state ID is correct and if the `match_index` is less than
+ /// the number of matches for that state, then this is guaranteed to return
+ /// a match.
+ fn get_match(
+ &self,
+ id: Self::ID,
+ match_index: usize,
+ end: usize,
+ ) -> Option<Match>;
+
+ /// Returns the number of matches for the given state. If the given state
+ /// is not a match state, then this returns 0.
+ ///
+ /// The state ID given must be valid, or else implementors may panic.
+ fn match_count(&self, id: Self::ID) -> usize;
+
+ /// Given the current state that this automaton is in and the next input
+ /// byte, this method returns the identifier of the next state. The
+ /// identifier returned must always be valid and may never correspond to
+ /// the fail state. The returned identifier may, however, point to the
+ /// dead state.
+ ///
+ /// Implementations are permitted to look up the next state without memory
+ /// safety checks such as bounds checks. As such, callers must ensure that
+ /// the given identifier corresponds to a valid automaton state. Implementors
+ /// must, in turn, ensure that this routine is safe for all valid state
+ /// identifiers and for all possible `u8` values.
+ fn next_state(&self, current: Self::ID, input: u8) -> Self::ID;
+
+ /// Like next_state, but debug_asserts that the underlying
+ /// implementation never returns a `fail_id()` for the next state.
+ fn next_state_no_fail(&self, current: Self::ID, input: u8) -> Self::ID {
+ let next = self.next_state(current, input);
+ // We should never see a transition to the failure state.
+ debug_assert!(
+ next != fail_id(),
+ "automaton should never return fail_id for next state"
+ );
+ next
+ }
+
+ /// Execute a search using standard match semantics.
+ ///
+ /// This can be used even when the automaton was constructed with leftmost
+ /// match semantics when you want to find the earliest possible match. This
+ /// can also be used as part of an overlapping search implementation.
+ ///
+ /// N.B. This does not report a match if `state_id` is given as a matching
+ /// state. As such, this should not be used directly.
+ #[inline(always)]
+ fn standard_find_at(
+ &self,
+ prestate: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ state_id: &mut Self::ID,
+ ) -> Option<Match> {
+ if let Some(pre) = self.prefilter() {
+ self.standard_find_at_imp(
+ prestate,
+ Some(pre),
+ haystack,
+ at,
+ state_id,
+ )
+ } else {
+ self.standard_find_at_imp(prestate, None, haystack, at, state_id)
+ }
+ }
+
+ // It's important for this to always be inlined. Namely, its only caller
+ // is standard_find_at, and the inlining should remove the case analysis
+ // for prefilter scanning when there is no prefilter available.
+ #[inline(always)]
+ fn standard_find_at_imp(
+ &self,
+ prestate: &mut PrefilterState,
+ prefilter: Option<&dyn Prefilter>,
+ haystack: &[u8],
+ mut at: usize,
+ state_id: &mut Self::ID,
+ ) -> Option<Match> {
+ while at < haystack.len() {
+ if let Some(pre) = prefilter {
+ if prestate.is_effective(at) && *state_id == self.start_state()
+ {
+ let c = prefilter::next(prestate, pre, haystack, at)
+ .into_option();
+ match c {
+ None => return None,
+ Some(i) => {
+ at = i;
+ }
+ }
+ }
+ }
+ // CORRECTNESS: next_state is correct for all possible u8 values,
+ // so the only thing we're concerned about is the validity of
+ // `state_id`. `state_id` either comes from the caller (in which
+ // case, we assume it is correct), or it comes from the return
+ // value of next_state, which is guaranteed to be correct.
+ *state_id = self.next_state_no_fail(*state_id, haystack[at]);
+ at += 1;
+ // This routine always quits immediately after seeing a
+ // match, and since dead states can only come after seeing
+ // a match, seeing a dead state here is impossible. (Unless
+ // we have an anchored automaton, in which case, dead states
+ // are used to stop a search.)
+ debug_assert!(
+ *state_id != dead_id() || self.anchored(),
+ "standard find should never see a dead state"
+ );
+
+ if self.is_match_or_dead_state(*state_id) {
+ return if *state_id == dead_id() {
+ None
+ } else {
+ self.get_match(*state_id, 0, at)
+ };
+ }
+ }
+ None
+ }
+
+ /// Execute a search using leftmost (either first or longest) match
+ /// semantics.
+ ///
+ /// The principal difference between searching with standard semantics and
+ /// searching with leftmost semantics is that leftmost searching will
+ /// continue searching even after a match has been found. Once a match
+ /// is found, the search does not stop until either the haystack has been
+ /// exhausted or a dead state is observed in the automaton. (Dead states
+ /// only exist in automatons constructed with leftmost semantics.) That is,
+ /// we rely on the construction of the automaton to tell us when to quit.
+ #[inline(never)]
+ fn leftmost_find_at(
+ &self,
+ prestate: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ state_id: &mut Self::ID,
+ ) -> Option<Match> {
+ if let Some(pre) = self.prefilter() {
+ self.leftmost_find_at_imp(
+ prestate,
+ Some(pre),
+ haystack,
+ at,
+ state_id,
+ )
+ } else {
+ self.leftmost_find_at_imp(prestate, None, haystack, at, state_id)
+ }
+ }
+
+ // It's important for this to always be inlined. Namely, its only caller
+ // is leftmost_find_at, and the inlining should remove the case analysis
+ // for prefilter scanning when there is no prefilter available.
+ #[inline(always)]
+ fn leftmost_find_at_imp(
+ &self,
+ prestate: &mut PrefilterState,
+ prefilter: Option<&dyn Prefilter>,
+ haystack: &[u8],
+ mut at: usize,
+ state_id: &mut Self::ID,
+ ) -> Option<Match> {
+ debug_assert!(self.match_kind().is_leftmost());
+ if self.anchored() && at > 0 && *state_id == self.start_state() {
+ return None;
+ }
+ let mut last_match = self.get_match(*state_id, 0, at);
+ while at < haystack.len() {
+ if let Some(pre) = prefilter {
+ if prestate.is_effective(at) && *state_id == self.start_state()
+ {
+ let c = prefilter::next(prestate, pre, haystack, at)
+ .into_option();
+ match c {
+ None => return None,
+ Some(i) => {
+ at = i;
+ }
+ }
+ }
+ }
+ // CORRECTNESS: next_state is correct for all possible u8 values,
+ // so the only thing we're concerned about is the validity of
+ // `state_id`. `state_id` either comes from the caller (in which
+ // case, we assume it is correct), or it comes from the return
+ // value of next_state, which is guaranteed to be correct.
+ *state_id = self.next_state_no_fail(*state_id, haystack[at]);
+ at += 1;
+ if self.is_match_or_dead_state(*state_id) {
+ if *state_id == dead_id() {
+ // The only way to enter into a dead state is if a match
+ // has been found, so we assert as much. This is different
+ // from normal automata, where you might enter a dead state
+ // if you know a subsequent match will never be found
+ // (regardless of whether a match has already been found).
+ // For Aho-Corasick, it is built so that we can match at
+ // any position, so the possibility of a match always
+ // exists.
+ //
+ // (Unless we have an anchored automaton, in which case,
+ // dead states are used to stop a search.)
+ debug_assert!(
+ last_match.is_some() || self.anchored(),
+ "failure state should only be seen after match"
+ );
+ return last_match;
+ }
+ last_match = self.get_match(*state_id, 0, at);
+ }
+ }
+ last_match
+ }
+
+ /// This is like leftmost_find_at, but does not need to track a caller
+ /// provided state id. In other words, the only output of this routine is a
+ /// match, if one exists.
+ ///
+ /// It is regrettable that we need to effectively copy a chunk of
+ /// implementation twice, but when we don't need to track the state ID, we
+ /// can allow the prefilter to report matches immediately without having
+ /// to re-confirm them with the automaton. The re-confirmation step is
+ /// necessary in leftmost_find_at because tracing through the automaton is
+ /// the only way to correctly set the state ID. (Perhaps an alternative
+ /// would be to keep a map from pattern ID to matching state ID, but that
+ /// complicates the code and still doesn't permit us to defer to the
+ /// prefilter entirely when possible.)
+ ///
+ /// I did try a few things to avoid the code duplication here, but nothing
+ /// optimized as well as this approach. (In microbenchmarks, there was
+ /// about a 25% difference.)
+ #[inline(never)]
+ fn leftmost_find_at_no_state(
+ &self,
+ prestate: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ ) -> Option<Match> {
+ if let Some(pre) = self.prefilter() {
+ self.leftmost_find_at_no_state_imp(
+ prestate,
+ Some(pre),
+ haystack,
+ at,
+ )
+ } else {
+ self.leftmost_find_at_no_state_imp(prestate, None, haystack, at)
+ }
+ }
+
+ // It's important for this to always be inlined. Namely, its only caller
+ // is leftmost_find_at_no_state, and the inlining should remove the case
+ // analysis for prefilter scanning when there is no prefilter available.
+ #[inline(always)]
+ fn leftmost_find_at_no_state_imp(
+ &self,
+ prestate: &mut PrefilterState,
+ prefilter: Option<&dyn Prefilter>,
+ haystack: &[u8],
+ mut at: usize,
+ ) -> Option<Match> {
+ debug_assert!(self.match_kind().is_leftmost());
+ if self.anchored() && at > 0 {
+ return None;
+ }
+ // If our prefilter handles confirmation of matches 100% of the
+ // time, then since we don't need to track state IDs, we can avoid
+ // running the Aho-Corasick automaton entirely.
+ if let Some(pre) = prefilter {
+ // We should never have a prefilter during an anchored search.
+ debug_assert!(!self.anchored());
+ if !pre.reports_false_positives() {
+ return match pre.next_candidate(prestate, haystack, at) {
+ Candidate::None => None,
+ Candidate::Match(m) => Some(m),
+ Candidate::PossibleStartOfMatch(_) => unreachable!(),
+ };
+ }
+ }
+
+ let mut state_id = self.start_state();
+ let mut last_match = self.get_match(state_id, 0, at);
+ while at < haystack.len() {
+ if let Some(pre) = prefilter {
+ if prestate.is_effective(at) && state_id == self.start_state()
+ {
+ match prefilter::next(prestate, pre, haystack, at) {
+ Candidate::None => return None,
+ // Since we aren't tracking a state ID, we can
+ // quit early once we know we have a match.
+ Candidate::Match(m) => return Some(m),
+ Candidate::PossibleStartOfMatch(i) => {
+ at = i;
+ }
+ }
+ }
+ }
+ // CORRECTNESS: next_state is correct for all possible u8 values,
+ // so the only thing we're concerned about is the validity of
+ // `state_id`. `state_id` either comes from the caller (in which
+ // case, we assume it is correct), or it comes from the return
+ // value of next_state, which is guaranteed to be correct.
+ state_id = self.next_state_no_fail(state_id, haystack[at]);
+ at += 1;
+ if self.is_match_or_dead_state(state_id) {
+ if state_id == dead_id() {
+ // The only way to enter into a dead state is if a
+ // match has been found, so we assert as much. This
+ // is different from normal automata, where you might
+ // enter a dead state if you know a subsequent match
+ // will never be found (regardless of whether a match
+ // has already been found). For Aho-Corasick, it is
+ // built so that we can match at any position, so the
+ // possibility of a match always exists.
+ //
+ // (Unless we have an anchored automaton, in which
+ // case, dead states are used to stop a search.)
+ debug_assert!(
+ last_match.is_some() || self.anchored(),
+ "failure state should only be seen after match"
+ );
+ return last_match;
+ }
+ last_match = self.get_match(state_id, 0, at);
+ }
+ }
+ last_match
+ }
+
+ /// Execute an overlapping search.
+ ///
+ /// When executing an overlapping match, the previous state ID in addition
+ /// to the previous match index should be given. If there are more matches
+ /// at the given state, then the match is reported and the given index is
+ /// incremented.
+ #[inline(always)]
+ fn overlapping_find_at(
+ &self,
+ prestate: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ state_id: &mut Self::ID,
+ match_index: &mut usize,
+ ) -> Option<Match> {
+ if self.anchored() && at > 0 && *state_id == self.start_state() {
+ return None;
+ }
+
+ let match_count = self.match_count(*state_id);
+ if *match_index < match_count {
+ // This is guaranteed to return a match since
+ // match_index < match_count.
+ let result = self.get_match(*state_id, *match_index, at);
+ debug_assert!(result.is_some(), "must be a match");
+ *match_index += 1;
+ return result;
+ }
+
+ *match_index = 0;
+ match self.standard_find_at(prestate, haystack, at, state_id) {
+ None => None,
+ Some(m) => {
+ *match_index = 1;
+ Some(m)
+ }
+ }
+ }
+
+ /// Return the earliest match found. This returns as soon as we know that
+ /// we have a match. As such, this does not necessarily correspond to the
+ /// leftmost starting match, but rather, the leftmost position at which a
+ /// match ends.
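+ ///
+ /// For example (an editorial illustration): with patterns `abcd` and `b`
+ /// and haystack `abcd`, the earliest match is `b` ending at offset 2, even
+ /// though `abcd` starts earlier, because `b` is the first match to *end*.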
+ #[inline(always)]
+ fn earliest_find_at(
+ &self,
+ prestate: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ state_id: &mut Self::ID,
+ ) -> Option<Match> {
+ if *state_id == self.start_state() {
+ if self.anchored() && at > 0 {
+ return None;
+ }
+ if let Some(m) = self.get_match(*state_id, 0, at) {
+ return Some(m);
+ }
+ }
+ self.standard_find_at(prestate, haystack, at, state_id)
+ }
+
+ /// A convenience function for finding the next match according to the
+ /// match semantics of this automaton. For standard match semantics, this
+ /// finds the earliest match. Otherwise, the leftmost match is found.
+ #[inline(always)]
+ fn find_at(
+ &self,
+ prestate: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ state_id: &mut Self::ID,
+ ) -> Option<Match> {
+ match *self.match_kind() {
+ MatchKind::Standard => {
+ self.earliest_find_at(prestate, haystack, at, state_id)
+ }
+ MatchKind::LeftmostFirst | MatchKind::LeftmostLongest => {
+ self.leftmost_find_at(prestate, haystack, at, state_id)
+ }
+ MatchKind::__Nonexhaustive => unreachable!(),
+ }
+ }
+
+ /// Like find_at, but does not track state identifiers. This permits some
+ /// optimizations when a prefilter that confirms its own matches is
+ /// present.
+ #[inline(always)]
+ fn find_at_no_state(
+ &self,
+ prestate: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ ) -> Option<Match> {
+ match *self.match_kind() {
+ MatchKind::Standard => {
+ let mut state = self.start_state();
+ self.earliest_find_at(prestate, haystack, at, &mut state)
+ }
+ MatchKind::LeftmostFirst | MatchKind::LeftmostLongest => {
+ self.leftmost_find_at_no_state(prestate, haystack, at)
+ }
+ MatchKind::__Nonexhaustive => unreachable!(),
+ }
+ }
+}
diff --git a/src/buffer.rs b/src/buffer.rs
new file mode 100644
index 0000000..1008196
--- /dev/null
+++ b/src/buffer.rs
@@ -0,0 +1,130 @@
+use std::cmp;
+use std::io;
+use std::ptr;
+
+/// The default buffer capacity that we use for the stream buffer.
+const DEFAULT_BUFFER_CAPACITY: usize = 8 * (1 << 10); // 8 KB
+
+/// A fairly simple roll buffer for supporting stream searches.
+///
+/// This buffer acts as a temporary place to store a fixed amount of data when
+/// reading from a stream. Its central purpose is to allow "rolling" some
+/// suffix of the data to the beginning of the buffer before refilling it with
+/// more data from the stream. For example, let's say we are trying to match
+/// "foobar" on a stream. When we report the match, we'd like to not only
+/// report the correct offsets at which the match occurs, but also the matching
+/// bytes themselves. So let's say our stream is a file with the following
+/// contents: `test test foobar test test`. Now assume that we happen to read
+/// the aforementioned file in two chunks: `test test foo` and `bar test test`.
+/// Naively, it would not be possible to report a single contiguous `foobar`
+/// match, but this roll buffer allows us to do that. Namely, after the second
+/// read, the contents of the buffer should be `st foobar test test`, where the
+/// search should ultimately resume immediately after `foo`. (The prefix `st `
+/// is included because the roll buffer saves N bytes at the end of the buffer,
+/// where N is the maximum possible length of a match.)
+///
+/// A lot of the logic for dealing with this is unfortunately split out between
+/// this roll buffer and the `StreamChunkIter`.
+#[derive(Debug)]
+pub struct Buffer {
+ /// The raw buffer contents. This has a fixed size and never increases.
+ buf: Vec<u8>,
+ /// The minimum size of the buffer, which is equivalent to the maximum
+ /// possible length of a match. This corresponds to the number of bytes
+ /// that `roll` preserves at the front of the buffer.
+ min: usize,
+ /// The end of the contents of this buffer.
+ end: usize,
+}
+
+impl Buffer {
+ /// Create a new buffer for stream searching. The minimum buffer length
+ /// given should be the size of the maximum possible match length.
+ pub fn new(min_buffer_len: usize) -> Buffer {
+ let min = cmp::max(1, min_buffer_len);
+ // The minimum buffer amount is also the amount that we roll our
+ // buffer in order to support incremental searching. To this end,
+ // our actual capacity needs to be at least 1 byte bigger than our
+ // minimum amount, otherwise we won't have any overlap. In actuality,
+ // we want our buffer to be a bit bigger than that for performance
+ // reasons, so we set a lower bound of `8 * min`.
+ //
+ // TODO: It would be good to find a way to test the streaming
+ // implementation with the minimal buffer size.
+ let capacity = cmp::max(min * 8, DEFAULT_BUFFER_CAPACITY);
+ Buffer { buf: vec![0; capacity], min, end: 0 }
+ }
+
+ /// Return the contents of this buffer.
+ #[inline]
+ pub fn buffer(&self) -> &[u8] {
+ &self.buf[..self.end]
+ }
+
+ /// Return the minimum size of the buffer. The only way a buffer may be
+ /// smaller than this is if the stream itself contains less than the
+ /// minimum buffer amount.
+ #[inline]
+ pub fn min_buffer_len(&self) -> usize {
+ self.min
+ }
+
+ /// Return the total length of the contents in the buffer.
+ #[inline]
+ pub fn len(&self) -> usize {
+ self.end
+ }
+
+ /// Return all free capacity in this buffer.
+ fn free_buffer(&mut self) -> &mut [u8] {
+ &mut self.buf[self.end..]
+ }
+
+ /// Refill the contents of this buffer by reading as much as possible into
+ /// this buffer's free capacity. If no more bytes could be read, then this
+ /// returns false. Otherwise, this reads until it has filled the buffer
+ /// past the minimum amount.
+ pub fn fill<R: io::Read>(&mut self, mut rdr: R) -> io::Result<bool> {
+ let mut readany = false;
+ loop {
+ let readlen = rdr.read(self.free_buffer())?;
+ if readlen == 0 {
+ return Ok(readany);
+ }
+ readany = true;
+ self.end += readlen;
+ if self.len() >= self.min {
+ return Ok(true);
+ }
+ }
+ }
+
+ /// Roll the contents of the buffer so that the suffix of this buffer is
+ /// moved to the front and all other contents are dropped. The size of the
+ /// suffix corresponds precisely to the minimum buffer length.
+ ///
+ /// This should only be called when the entire contents of this buffer have
+ /// been searched.
+ pub fn roll(&mut self) {
+ let roll_start = self
+ .end
+ .checked_sub(self.min)
+ .expect("buffer capacity should be bigger than minimum amount");
+ let roll_len = self.min;
+
+ assert!(roll_start + roll_len <= self.end);
+ unsafe {
+ // SAFETY: A buffer contains Copy data, so there's no problem
+ // moving it around. Safety also depends on our indices being in
+ // bounds, which they always should be, given the assert above.
+ //
+ // TODO: Switch to [T]::copy_within once our MSRV is high enough.
+ ptr::copy(
+ self.buf[roll_start..].as_ptr(),
+ self.buf.as_mut_ptr(),
+ roll_len,
+ );
+ }
+ self.end = roll_len;
+ }
+}
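+
+// An illustrative test added editorially (not part of the upstream sources):
+// it exercises `fill` and `roll` on an in-memory reader to show that the
+// final `min_buffer_len` bytes survive a roll and end up at the front of the
+// buffer.
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn roll_preserves_suffix() {
+ // "foobar" stands in for the longest pattern, so the minimum buffer
+ // length is 6.
+ let mut buf = Buffer::new(6);
+ assert!(buf.fill(&b"test test foobar test test"[..]).unwrap());
+ let before = buf.buffer().to_vec();
+ buf.roll();
+ // Only the last `min_buffer_len` bytes remain, now at the start.
+ assert_eq!(buf.buffer(), &before[before.len() - buf.min_buffer_len()..]);
+ }
+}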
diff --git a/src/byte_frequencies.rs b/src/byte_frequencies.rs
new file mode 100644
index 0000000..c313b62
--- /dev/null
+++ b/src/byte_frequencies.rs
@@ -0,0 +1,258 @@
+pub const BYTE_FREQUENCIES: [u8; 256] = [
+ 55, // '\x00'
+ 52, // '\x01'
+ 51, // '\x02'
+ 50, // '\x03'
+ 49, // '\x04'
+ 48, // '\x05'
+ 47, // '\x06'
+ 46, // '\x07'
+ 45, // '\x08'
+ 103, // '\t'
+ 242, // '\n'
+ 66, // '\x0b'
+ 67, // '\x0c'
+ 229, // '\r'
+ 44, // '\x0e'
+ 43, // '\x0f'
+ 42, // '\x10'
+ 41, // '\x11'
+ 40, // '\x12'
+ 39, // '\x13'
+ 38, // '\x14'
+ 37, // '\x15'
+ 36, // '\x16'
+ 35, // '\x17'
+ 34, // '\x18'
+ 33, // '\x19'
+ 56, // '\x1a'
+ 32, // '\x1b'
+ 31, // '\x1c'
+ 30, // '\x1d'
+ 29, // '\x1e'
+ 28, // '\x1f'
+ 255, // ' '
+ 148, // '!'
+ 164, // '"'
+ 149, // '#'
+ 136, // '$'
+ 160, // '%'
+ 155, // '&'
+ 173, // "'"
+ 221, // '('
+ 222, // ')'
+ 134, // '*'
+ 122, // '+'
+ 232, // ','
+ 202, // '-'
+ 215, // '.'
+ 224, // '/'
+ 208, // '0'
+ 220, // '1'
+ 204, // '2'
+ 187, // '3'
+ 183, // '4'
+ 179, // '5'
+ 177, // '6'
+ 168, // '7'
+ 178, // '8'
+ 200, // '9'
+ 226, // ':'
+ 195, // ';'
+ 154, // '<'
+ 184, // '='
+ 174, // '>'
+ 126, // '?'
+ 120, // '@'
+ 191, // 'A'
+ 157, // 'B'
+ 194, // 'C'
+ 170, // 'D'
+ 189, // 'E'
+ 162, // 'F'
+ 161, // 'G'
+ 150, // 'H'
+ 193, // 'I'
+ 142, // 'J'
+ 137, // 'K'
+ 171, // 'L'
+ 176, // 'M'
+ 185, // 'N'
+ 167, // 'O'
+ 186, // 'P'
+ 112, // 'Q'
+ 175, // 'R'
+ 192, // 'S'
+ 188, // 'T'
+ 156, // 'U'
+ 140, // 'V'
+ 143, // 'W'
+ 123, // 'X'
+ 133, // 'Y'
+ 128, // 'Z'
+ 147, // '['
+ 138, // '\\'
+ 146, // ']'
+ 114, // '^'
+ 223, // '_'
+ 151, // '`'
+ 249, // 'a'
+ 216, // 'b'
+ 238, // 'c'
+ 236, // 'd'
+ 253, // 'e'
+ 227, // 'f'
+ 218, // 'g'
+ 230, // 'h'
+ 247, // 'i'
+ 135, // 'j'
+ 180, // 'k'
+ 241, // 'l'
+ 233, // 'm'
+ 246, // 'n'
+ 244, // 'o'
+ 231, // 'p'
+ 139, // 'q'
+ 245, // 'r'
+ 243, // 's'
+ 251, // 't'
+ 235, // 'u'
+ 201, // 'v'
+ 196, // 'w'
+ 240, // 'x'
+ 214, // 'y'
+ 152, // 'z'
+ 182, // '{'
+ 205, // '|'
+ 181, // '}'
+ 127, // '~'
+ 27, // '\x7f'
+ 212, // '\x80'
+ 211, // '\x81'
+ 210, // '\x82'
+ 213, // '\x83'
+ 228, // '\x84'
+ 197, // '\x85'
+ 169, // '\x86'
+ 159, // '\x87'
+ 131, // '\x88'
+ 172, // '\x89'
+ 105, // '\x8a'
+ 80, // '\x8b'
+ 98, // '\x8c'
+ 96, // '\x8d'
+ 97, // '\x8e'
+ 81, // '\x8f'
+ 207, // '\x90'
+ 145, // '\x91'
+ 116, // '\x92'
+ 115, // '\x93'
+ 144, // '\x94'
+ 130, // '\x95'
+ 153, // '\x96'
+ 121, // '\x97'
+ 107, // '\x98'
+ 132, // '\x99'
+ 109, // '\x9a'
+ 110, // '\x9b'
+ 124, // '\x9c'
+ 111, // '\x9d'
+ 82, // '\x9e'
+ 108, // '\x9f'
+ 118, // '\xa0'
+ 141, // '¡'
+ 113, // '¢'
+ 129, // '£'
+ 119, // '¤'
+ 125, // '¥'
+ 165, // '¦'
+ 117, // '§'
+ 92, // '¨'
+ 106, // '©'
+ 83, // 'ª'
+ 72, // '«'
+ 99, // '¬'
+ 93, // '\xad'
+ 65, // '®'
+ 79, // '¯'
+ 166, // '°'
+ 237, // '±'
+ 163, // '²'
+ 199, // '³'
+ 190, // '´'
+ 225, // 'µ'
+ 209, // '¶'
+ 203, // '·'
+ 198, // '¸'
+ 217, // '¹'
+ 219, // 'º'
+ 206, // '»'
+ 234, // '¼'
+ 248, // '½'
+ 158, // '¾'
+ 239, // '¿'
+ 255, // 'À'
+ 255, // 'Á'
+ 255, // 'Â'
+ 255, // 'Ã'
+ 255, // 'Ä'
+ 255, // 'Å'
+ 255, // 'Æ'
+ 255, // 'Ç'
+ 255, // 'È'
+ 255, // 'É'
+ 255, // 'Ê'
+ 255, // 'Ë'
+ 255, // 'Ì'
+ 255, // 'Í'
+ 255, // 'Î'
+ 255, // 'Ï'
+ 255, // 'Ð'
+ 255, // 'Ñ'
+ 255, // 'Ò'
+ 255, // 'Ó'
+ 255, // 'Ô'
+ 255, // 'Õ'
+ 255, // 'Ö'
+ 255, // '×'
+ 255, // 'Ø'
+ 255, // 'Ù'
+ 255, // 'Ú'
+ 255, // 'Û'
+ 255, // 'Ü'
+ 255, // 'Ý'
+ 255, // 'Þ'
+ 255, // 'ß'
+ 255, // 'à'
+ 255, // 'á'
+ 255, // 'â'
+ 255, // 'ã'
+ 255, // 'ä'
+ 255, // 'å'
+ 255, // 'æ'
+ 255, // 'ç'
+ 255, // 'è'
+ 255, // 'é'
+ 255, // 'ê'
+ 255, // 'ë'
+ 255, // 'ì'
+ 255, // 'í'
+ 255, // 'î'
+ 255, // 'ï'
+ 255, // 'ð'
+ 255, // 'ñ'
+ 255, // 'ò'
+ 255, // 'ó'
+ 255, // 'ô'
+ 255, // 'õ'
+ 255, // 'ö'
+ 255, // '÷'
+ 255, // 'ø'
+ 255, // 'ù'
+ 255, // 'ú'
+ 255, // 'û'
+ 255, // 'ü'
+ 255, // 'ý'
+ 255, // 'þ'
+ 255, // 'ÿ'
+];
diff --git a/src/classes.rs b/src/classes.rs
new file mode 100644
index 0000000..1fba7ea
--- /dev/null
+++ b/src/classes.rs
@@ -0,0 +1,238 @@
+use std::fmt;
+
+/// A representation of byte oriented equivalence classes.
+///
+/// This is used in an FSM to reduce the size of the transition table. This can
+/// have a particularly large impact not only on the total size of an FSM, but
+/// also on compile times.
+#[derive(Clone, Copy)]
+pub struct ByteClasses([u8; 256]);
+
+impl ByteClasses {
+ /// Creates a new set of equivalence classes where all bytes are mapped to
+ /// the same class.
+ pub fn empty() -> ByteClasses {
+ ByteClasses([0; 256])
+ }
+
+ /// Creates a new set of equivalence classes where each byte belongs to
+ /// its own equivalence class.
+ pub fn singletons() -> ByteClasses {
+ let mut classes = ByteClasses::empty();
+ for i in 0..256 {
+ classes.set(i as u8, i as u8);
+ }
+ classes
+ }
+
+ /// Set the equivalence class for the given byte.
+ #[inline]
+ pub fn set(&mut self, byte: u8, class: u8) {
+ self.0[byte as usize] = class;
+ }
+
+ /// Get the equivalence class for the given byte.
+ #[inline]
+ pub fn get(&self, byte: u8) -> u8 {
+ // SAFETY: This is safe because all dense transitions have
+ // exactly 256 elements, so all u8 values are valid indices.
+ self.0[byte as usize]
+ }
+
+ /// Return the total number of elements in the alphabet represented by
+ /// these equivalence classes. Equivalently, this returns the total number
+ /// of equivalence classes.
+ #[inline]
+ pub fn alphabet_len(&self) -> usize {
+ self.0[255] as usize + 1
+ }
+
+ /// Returns true if and only if every byte in this class maps to its own
+ /// equivalence class. Equivalently, there are 256 equivalence classes
+ /// and each class contains exactly one byte.
+ #[inline]
+ pub fn is_singleton(&self) -> bool {
+ self.alphabet_len() == 256
+ }
+
+ /// Returns an iterator over a sequence of representative bytes from each
+ /// equivalence class. Namely, this yields exactly N items, where N is
+ /// equivalent to the number of equivalence classes. Each item is an
+ /// arbitrary byte drawn from each equivalence class.
+ ///
+ /// This is useful when one is determinizing an NFA and the NFA's alphabet
+ /// hasn't been converted to equivalence classes yet. Picking an arbitrary
+ /// byte from each equivalence class then permits a full exploration of
+ /// the NFA instead of using every possible byte value.
+ pub fn representatives(&self) -> ByteClassRepresentatives {
+ ByteClassRepresentatives { classes: self, byte: 0, last_class: None }
+ }
+
+ /// Returns all of the bytes in the given equivalence class.
+ ///
+ /// The second element in the tuple indicates the number of elements in
+ /// the array.
+ fn elements(&self, equiv: u8) -> ([u8; 256], usize) {
+ let (mut array, mut len) = ([0; 256], 0);
+ for b in 0..256 {
+ if self.get(b as u8) == equiv {
+ array[len] = b as u8;
+ len += 1;
+ }
+ }
+ (array, len)
+ }
+}
+
+impl fmt::Debug for ByteClasses {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ if self.is_singleton() {
+ write!(f, "ByteClasses({{singletons}})")
+ } else {
+ write!(f, "ByteClasses(")?;
+ for equiv in 0..self.alphabet_len() {
+ let (members, len) = self.elements(equiv as u8);
+ write!(f, " {} => {:?}", equiv, &members[..len])?;
+ }
+ write!(f, ")")
+ }
+ }
+}
+
+/// An iterator over representative bytes from each equivalence class.
+#[derive(Debug)]
+pub struct ByteClassRepresentatives<'a> {
+ classes: &'a ByteClasses,
+ byte: usize,
+ last_class: Option<u8>,
+}
+
+impl<'a> Iterator for ByteClassRepresentatives<'a> {
+ type Item = u8;
+
+ fn next(&mut self) -> Option<u8> {
+ while self.byte < 256 {
+ let byte = self.byte as u8;
+ let class = self.classes.get(byte);
+ self.byte += 1;
+
+ if self.last_class != Some(class) {
+ self.last_class = Some(class);
+ return Some(byte);
+ }
+ }
+ None
+ }
+}
+
+/// A byte class builder keeps track of an *approximation* of equivalence
+/// classes of bytes during NFA construction. That is, no byte in an
+/// equivalence class can discriminate between a match and a non-match.
+///
+/// For example, in the literals `abc` and `xyz`, the bytes [\x00-`], [d-w]
+/// and [{-\xFF] never discriminate between a match and a non-match, precisely
+/// because they never occur in the literals anywhere.
+///
+/// Note though that this does not necessarily compute the minimal set of
+/// equivalence classes. For example, in the literals above, the byte ranges
+/// [\x00-`], [d-w] and [{-\xFF] are all treated as distinct equivalence
+/// classes even though they could be treated a single class. The reason for
+/// this is implementation complexity. In the future, we should endeavor to
+/// compute the minimal equivalence classes since they can have a rather large
+/// impact on the size of the DFA.
+///
+/// The representation here is 256 booleans, all initially set to false. Each
+/// boolean maps to its corresponding byte based on position. A `true` value
+/// indicates the end of an equivalence class, where its corresponding byte
+/// and all of the bytes corresponding to all previous contiguous `false`
+/// values are in the same equivalence class.
+///
+/// This particular representation only permits contiguous ranges of bytes to
+/// be in the same equivalence class, which means that we can never discover
+/// the true minimal set of equivalence classes.
+#[derive(Debug)]
+pub struct ByteClassBuilder(Vec<bool>);
+
+impl ByteClassBuilder {
+ /// Create a new builder of byte classes where all bytes are part of the
+ /// same equivalence class.
+ pub fn new() -> ByteClassBuilder {
+ ByteClassBuilder(vec![false; 256])
+ }
+
+ /// Indicate that the given range of bytes (inclusive) can discriminate a
+ /// match between it and all other bytes outside of the range.
+ pub fn set_range(&mut self, start: u8, end: u8) {
+ debug_assert!(start <= end);
+ if start > 0 {
+ self.0[start as usize - 1] = true;
+ }
+ self.0[end as usize] = true;
+ }
+
+ /// Build byte classes that map all byte values to their corresponding
+ /// equivalence class. The last mapping indicates the largest equivalence
+ /// class identifier (which is never bigger than 255).
+ pub fn build(&self) -> ByteClasses {
+ let mut classes = ByteClasses::empty();
+ let mut class = 0u8;
+ let mut i = 0;
+ loop {
+ classes.set(i as u8, class as u8);
+ if i >= 255 {
+ break;
+ }
+ if self.0[i] {
+ class = class.checked_add(1).unwrap();
+ }
+ i += 1;
+ }
+ classes
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn byte_classes() {
+ let mut set = ByteClassBuilder::new();
+ set.set_range(b'a', b'z');
+
+ let classes = set.build();
+ assert_eq!(classes.get(0), 0);
+ assert_eq!(classes.get(1), 0);
+ assert_eq!(classes.get(2), 0);
+ assert_eq!(classes.get(b'a' - 1), 0);
+ assert_eq!(classes.get(b'a'), 1);
+ assert_eq!(classes.get(b'm'), 1);
+ assert_eq!(classes.get(b'z'), 1);
+ assert_eq!(classes.get(b'z' + 1), 2);
+ assert_eq!(classes.get(254), 2);
+ assert_eq!(classes.get(255), 2);
+
+ let mut set = ByteClassBuilder::new();
+ set.set_range(0, 2);
+ set.set_range(4, 6);
+ let classes = set.build();
+ assert_eq!(classes.get(0), 0);
+ assert_eq!(classes.get(1), 0);
+ assert_eq!(classes.get(2), 0);
+ assert_eq!(classes.get(3), 1);
+ assert_eq!(classes.get(4), 2);
+ assert_eq!(classes.get(5), 2);
+ assert_eq!(classes.get(6), 2);
+ assert_eq!(classes.get(7), 3);
+ assert_eq!(classes.get(255), 3);
+ }
+
+ #[test]
+ fn full_byte_classes() {
+ let mut set = ByteClassBuilder::new();
+ for i in 0..256u16 {
+ set.set_range(i as u8, i as u8);
+ }
+ assert_eq!(set.build().alphabet_len(), 256);
+ }
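+
+ // An illustrative test added editorially (not from the upstream sources):
+ // `representatives()` yields exactly one arbitrary byte per equivalence
+ // class, which is what determinization uses to avoid trying all 256 bytes.
+ #[test]
+ fn representatives_one_per_class() {
+ let mut set = ByteClassBuilder::new();
+ set.set_range(b'a', b'z');
+ let classes = set.build();
+ let reps: Vec<u8> = classes.representatives().collect();
+ assert_eq!(classes.alphabet_len(), reps.len());
+ assert_eq!(vec![0, b'a', b'z' + 1], reps);
+ }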
+}
diff --git a/src/dfa.rs b/src/dfa.rs
new file mode 100644
index 0000000..1bf37d5
--- /dev/null
+++ b/src/dfa.rs
@@ -0,0 +1,709 @@
+use std::mem::size_of;
+
+use ahocorasick::MatchKind;
+use automaton::Automaton;
+use classes::ByteClasses;
+use error::Result;
+use nfa::{PatternID, PatternLength, NFA};
+use prefilter::{Prefilter, PrefilterObj, PrefilterState};
+use state_id::{dead_id, fail_id, premultiply_overflow_error, StateID};
+use Match;
+
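+/// An editorial summary, derived from the `next_state` implementations
+/// below, of how the four DFA representations differ in their transition
+/// table lookups:
+///
+/// * `Standard`: `state * 256 + byte`
+/// * `ByteClass`: `state * alphabet_len + class(byte)`
+/// * `Premultiplied`: `state + byte`, with state IDs pre-scaled by 256
+/// * `PremultipliedByteClass`: `state + class(byte)`, with state IDs
+///   pre-scaled by `alphabet_len`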
+#[derive(Clone, Debug)]
+pub enum DFA<S> {
+ Standard(Standard<S>),
+ ByteClass(ByteClass<S>),
+ Premultiplied(Premultiplied<S>),
+ PremultipliedByteClass(PremultipliedByteClass<S>),
+}
+
+impl<S: StateID> DFA<S> {
+ fn repr(&self) -> &Repr<S> {
+ match *self {
+ DFA::Standard(ref dfa) => dfa.repr(),
+ DFA::ByteClass(ref dfa) => dfa.repr(),
+ DFA::Premultiplied(ref dfa) => dfa.repr(),
+ DFA::PremultipliedByteClass(ref dfa) => dfa.repr(),
+ }
+ }
+
+ pub fn match_kind(&self) -> &MatchKind {
+ &self.repr().match_kind
+ }
+
+ pub fn heap_bytes(&self) -> usize {
+ self.repr().heap_bytes
+ }
+
+ pub fn max_pattern_len(&self) -> usize {
+ self.repr().max_pattern_len
+ }
+
+ pub fn pattern_count(&self) -> usize {
+ self.repr().pattern_count
+ }
+
+ pub fn start_state(&self) -> S {
+ self.repr().start_id
+ }
+
+ #[inline(always)]
+ pub fn overlapping_find_at(
+ &self,
+ prestate: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ state_id: &mut S,
+ match_index: &mut usize,
+ ) -> Option<Match> {
+ match *self {
+ DFA::Standard(ref dfa) => dfa.overlapping_find_at(
+ prestate,
+ haystack,
+ at,
+ state_id,
+ match_index,
+ ),
+ DFA::ByteClass(ref dfa) => dfa.overlapping_find_at(
+ prestate,
+ haystack,
+ at,
+ state_id,
+ match_index,
+ ),
+ DFA::Premultiplied(ref dfa) => dfa.overlapping_find_at(
+ prestate,
+ haystack,
+ at,
+ state_id,
+ match_index,
+ ),
+ DFA::PremultipliedByteClass(ref dfa) => dfa.overlapping_find_at(
+ prestate,
+ haystack,
+ at,
+ state_id,
+ match_index,
+ ),
+ }
+ }
+
+ #[inline(always)]
+ pub fn earliest_find_at(
+ &self,
+ prestate: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ state_id: &mut S,
+ ) -> Option<Match> {
+ match *self {
+ DFA::Standard(ref dfa) => {
+ dfa.earliest_find_at(prestate, haystack, at, state_id)
+ }
+ DFA::ByteClass(ref dfa) => {
+ dfa.earliest_find_at(prestate, haystack, at, state_id)
+ }
+ DFA::Premultiplied(ref dfa) => {
+ dfa.earliest_find_at(prestate, haystack, at, state_id)
+ }
+ DFA::PremultipliedByteClass(ref dfa) => {
+ dfa.earliest_find_at(prestate, haystack, at, state_id)
+ }
+ }
+ }
+
+ #[inline(always)]
+ pub fn find_at_no_state(
+ &self,
+ prestate: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ ) -> Option<Match> {
+ match *self {
+ DFA::Standard(ref dfa) => {
+ dfa.find_at_no_state(prestate, haystack, at)
+ }
+ DFA::ByteClass(ref dfa) => {
+ dfa.find_at_no_state(prestate, haystack, at)
+ }
+ DFA::Premultiplied(ref dfa) => {
+ dfa.find_at_no_state(prestate, haystack, at)
+ }
+ DFA::PremultipliedByteClass(ref dfa) => {
+ dfa.find_at_no_state(prestate, haystack, at)
+ }
+ }
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct Standard<S>(Repr<S>);
+
+impl<S: StateID> Standard<S> {
+ fn repr(&self) -> &Repr<S> {
+ &self.0
+ }
+}
+
+impl<S: StateID> Automaton for Standard<S> {
+ type ID = S;
+
+ fn match_kind(&self) -> &MatchKind {
+ &self.repr().match_kind
+ }
+
+ fn anchored(&self) -> bool {
+ self.repr().anchored
+ }
+
+ fn prefilter(&self) -> Option<&dyn Prefilter> {
+ self.repr().prefilter.as_ref().map(|p| p.as_ref())
+ }
+
+ fn start_state(&self) -> S {
+ self.repr().start_id
+ }
+
+ fn is_valid(&self, id: S) -> bool {
+ id.to_usize() < self.repr().state_count
+ }
+
+ fn is_match_state(&self, id: S) -> bool {
+ self.repr().is_match_state(id)
+ }
+
+ fn is_match_or_dead_state(&self, id: S) -> bool {
+ self.repr().is_match_or_dead_state(id)
+ }
+
+ fn get_match(
+ &self,
+ id: S,
+ match_index: usize,
+ end: usize,
+ ) -> Option<Match> {
+ self.repr().get_match(id, match_index, end)
+ }
+
+ fn match_count(&self, id: S) -> usize {
+ self.repr().match_count(id)
+ }
+
+ fn next_state(&self, current: S, input: u8) -> S {
+ let o = current.to_usize() * 256 + input as usize;
+ self.repr().trans[o]
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct ByteClass<S>(Repr<S>);
+
+impl<S: StateID> ByteClass<S> {
+ fn repr(&self) -> &Repr<S> {
+ &self.0
+ }
+}
+
+impl<S: StateID> Automaton for ByteClass<S> {
+ type ID = S;
+
+ fn match_kind(&self) -> &MatchKind {
+ &self.repr().match_kind
+ }
+
+ fn anchored(&self) -> bool {
+ self.repr().anchored
+ }
+
+ fn prefilter(&self) -> Option<&dyn Prefilter> {
+ self.repr().prefilter.as_ref().map(|p| p.as_ref())
+ }
+
+ fn start_state(&self) -> S {
+ self.repr().start_id
+ }
+
+ fn is_valid(&self, id: S) -> bool {
+ id.to_usize() < self.repr().state_count
+ }
+
+ fn is_match_state(&self, id: S) -> bool {
+ self.repr().is_match_state(id)
+ }
+
+ fn is_match_or_dead_state(&self, id: S) -> bool {
+ self.repr().is_match_or_dead_state(id)
+ }
+
+ fn get_match(
+ &self,
+ id: S,
+ match_index: usize,
+ end: usize,
+ ) -> Option<Match> {
+ self.repr().get_match(id, match_index, end)
+ }
+
+ fn match_count(&self, id: S) -> usize {
+ self.repr().match_count(id)
+ }
+
+ fn next_state(&self, current: S, input: u8) -> S {
+ let alphabet_len = self.repr().byte_classes.alphabet_len();
+ let input = self.repr().byte_classes.get(input);
+ let o = current.to_usize() * alphabet_len + input as usize;
+ self.repr().trans[o]
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct Premultiplied<S>(Repr<S>);
+
+impl<S: StateID> Premultiplied<S> {
+ fn repr(&self) -> &Repr<S> {
+ &self.0
+ }
+}
+
+impl<S: StateID> Automaton for Premultiplied<S> {
+ type ID = S;
+
+ fn match_kind(&self) -> &MatchKind {
+ &self.repr().match_kind
+ }
+
+ fn anchored(&self) -> bool {
+ self.repr().anchored
+ }
+
+ fn prefilter(&self) -> Option<&dyn Prefilter> {
+ self.repr().prefilter.as_ref().map(|p| p.as_ref())
+ }
+
+ fn start_state(&self) -> S {
+ self.repr().start_id
+ }
+
+ fn is_valid(&self, id: S) -> bool {
+ (id.to_usize() / 256) < self.repr().state_count
+ }
+
+ fn is_match_state(&self, id: S) -> bool {
+ self.repr().is_match_state(id)
+ }
+
+ fn is_match_or_dead_state(&self, id: S) -> bool {
+ self.repr().is_match_or_dead_state(id)
+ }
+
+ fn get_match(
+ &self,
+ id: S,
+ match_index: usize,
+ end: usize,
+ ) -> Option<Match> {
+ if id > self.repr().max_match {
+ return None;
+ }
+ self.repr()
+ .matches
+ .get(id.to_usize() / 256)
+ .and_then(|m| m.get(match_index))
+ .map(|&(id, len)| Match { pattern: id, len, end })
+ }
+
+ fn match_count(&self, id: S) -> usize {
+ let o = id.to_usize() / 256;
+ self.repr().matches[o].len()
+ }
+
+ fn next_state(&self, current: S, input: u8) -> S {
+ let o = current.to_usize() + input as usize;
+ self.repr().trans[o]
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct PremultipliedByteClass<S>(Repr<S>);
+
+impl<S: StateID> PremultipliedByteClass<S> {
+ fn repr(&self) -> &Repr<S> {
+ &self.0
+ }
+}
+
+impl<S: StateID> Automaton for PremultipliedByteClass<S> {
+ type ID = S;
+
+ fn match_kind(&self) -> &MatchKind {
+ &self.repr().match_kind
+ }
+
+ fn anchored(&self) -> bool {
+ self.repr().anchored
+ }
+
+ fn prefilter(&self) -> Option<&dyn Prefilter> {
+ self.repr().prefilter.as_ref().map(|p| p.as_ref())
+ }
+
+ fn start_state(&self) -> S {
+ self.repr().start_id
+ }
+
+ fn is_valid(&self, id: S) -> bool {
+ (id.to_usize() / self.repr().alphabet_len()) < self.repr().state_count
+ }
+
+ fn is_match_state(&self, id: S) -> bool {
+ self.repr().is_match_state(id)
+ }
+
+ fn is_match_or_dead_state(&self, id: S) -> bool {
+ self.repr().is_match_or_dead_state(id)
+ }
+
+ fn get_match(
+ &self,
+ id: S,
+ match_index: usize,
+ end: usize,
+ ) -> Option<Match> {
+ if id > self.repr().max_match {
+ return None;
+ }
+ self.repr()
+ .matches
+ .get(id.to_usize() / self.repr().alphabet_len())
+ .and_then(|m| m.get(match_index))
+ .map(|&(id, len)| Match { pattern: id, len, end })
+ }
+
+ fn match_count(&self, id: S) -> usize {
+ let o = id.to_usize() / self.repr().alphabet_len();
+ self.repr().matches[o].len()
+ }
+
+ fn next_state(&self, current: S, input: u8) -> S {
+ let input = self.repr().byte_classes.get(input);
+ let o = current.to_usize() + input as usize;
+ self.repr().trans[o]
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct Repr<S> {
+ match_kind: MatchKind,
+ anchored: bool,
+ premultiplied: bool,
+ start_id: S,
+ /// The length, in bytes, of the longest pattern in this automaton. This
+ /// information is useful for keeping correct buffer sizes when searching
+ /// on streams.
+ max_pattern_len: usize,
+ /// The total number of patterns added to this automaton. This includes
+ /// patterns that may never match.
+ pattern_count: usize,
+ state_count: usize,
+ max_match: S,
+ /// The number of bytes of heap used by this DFA's transition table.
+ heap_bytes: usize,
+ /// A prefilter for quickly detecting candidate matches, if pertinent.
+ prefilter: Option<PrefilterObj>,
+ byte_classes: ByteClasses,
+ trans: Vec<S>,
+ matches: Vec<Vec<(PatternID, PatternLength)>>,
+}
+
+impl<S: StateID> Repr<S> {
+ /// Returns the total alphabet size for this DFA.
+ ///
+ /// If byte classes are enabled, then this corresponds to the number of
+ /// equivalence classes. If they are disabled, then this is always 256.
+ fn alphabet_len(&self) -> usize {
+ self.byte_classes.alphabet_len()
+ }
+
+ /// Returns true only if the given state is a match state.
+ fn is_match_state(&self, id: S) -> bool {
+ id <= self.max_match && id > dead_id()
+ }
+
+ /// Returns true only if the given state is either a dead state or a match
+ /// state.
+ fn is_match_or_dead_state(&self, id: S) -> bool {
+ id <= self.max_match
+ }
+
+ /// Get the ith match for the given state, where the end position of a
+ /// match was found at `end`.
+ ///
+ /// # Panics
+ ///
+ /// The caller must ensure that the given state identifier is valid,
+ /// otherwise this may panic. The `match_index` need not be valid. That is,
+ /// if the given state has no matches then this returns `None`.
+ fn get_match(
+ &self,
+ id: S,
+ match_index: usize,
+ end: usize,
+ ) -> Option<Match> {
+ if id > self.max_match {
+ return None;
+ }
+ self.matches
+ .get(id.to_usize())
+ .and_then(|m| m.get(match_index))
+ .map(|&(id, len)| Match { pattern: id, len, end })
+ }
+
+ /// Return the total number of matches for the given state.
+ ///
+ /// # Panics
+ ///
+ /// The caller must ensure that the given identifier is valid, or else
+ /// this panics.
+ fn match_count(&self, id: S) -> usize {
+ self.matches[id.to_usize()].len()
+ }
+
+ /// Get the next state given `from` as the current state and `byte` as the
+ /// current input byte.
+ fn next_state(&self, from: S, byte: u8) -> S {
+ let alphabet_len = self.alphabet_len();
+ let byte = self.byte_classes.get(byte);
+ self.trans[from.to_usize() * alphabet_len + byte as usize]
+ }
+
+ /// Set the `byte` transition for the `from` state to point to `to`.
+ fn set_next_state(&mut self, from: S, byte: u8, to: S) {
+ let alphabet_len = self.alphabet_len();
+ let byte = self.byte_classes.get(byte);
+ self.trans[from.to_usize() * alphabet_len + byte as usize] = to;
+ }
+
+ /// Swap the given states in place.
+ fn swap_states(&mut self, id1: S, id2: S) {
+ assert!(!self.premultiplied, "can't swap states in premultiplied DFA");
+
+ let o1 = id1.to_usize() * self.alphabet_len();
+ let o2 = id2.to_usize() * self.alphabet_len();
+ for b in 0..self.alphabet_len() {
+ self.trans.swap(o1 + b, o2 + b);
+ }
+ self.matches.swap(id1.to_usize(), id2.to_usize());
+ }
+
+ /// This routine shuffles all match states in this DFA to the beginning
+ /// of the DFA such that every non-match state appears after every match
+ /// state. (With one exception: the special fail and dead states remain as
+ /// the first two states.)
+ ///
+ /// The purpose of doing this shuffling is to avoid an extra conditional
+ /// in the search loop, and in particular, detecting whether a state is a
+ /// match or not does not need to access any memory.
+ ///
+ /// This updates `self.max_match` to point to the last matching state as
+ /// well as `self.start` if the starting state was moved.
+ fn shuffle_match_states(&mut self) {
+ assert!(
+ !self.premultiplied,
+ "cannot shuffle match states of premultiplied DFA"
+ );
+
+ if self.state_count <= 1 {
+ return;
+ }
+
+ let mut first_non_match = self.start_id.to_usize();
+ while first_non_match < self.state_count
+ && self.matches[first_non_match].len() > 0
+ {
+ first_non_match += 1;
+ }
+
+ let mut swaps: Vec<S> = vec![fail_id(); self.state_count];
+ let mut cur = self.state_count - 1;
+ while cur > first_non_match {
+ if self.matches[cur].len() > 0 {
+ self.swap_states(
+ S::from_usize(cur),
+ S::from_usize(first_non_match),
+ );
+ swaps[cur] = S::from_usize(first_non_match);
+ swaps[first_non_match] = S::from_usize(cur);
+
+ first_non_match += 1;
+ while first_non_match < cur
+ && self.matches[first_non_match].len() > 0
+ {
+ first_non_match += 1;
+ }
+ }
+ cur -= 1;
+ }
+ for id in (0..self.state_count).map(S::from_usize) {
+ let alphabet_len = self.alphabet_len();
+ let offset = id.to_usize() * alphabet_len;
+ for next in &mut self.trans[offset..offset + alphabet_len] {
+ if swaps[next.to_usize()] != fail_id() {
+ *next = swaps[next.to_usize()];
+ }
+ }
+ }
+ if swaps[self.start_id.to_usize()] != fail_id() {
+ self.start_id = swaps[self.start_id.to_usize()];
+ }
+ self.max_match = S::from_usize(first_non_match - 1);
+ }
+
+ fn premultiply(&mut self) -> Result<()> {
+ if self.premultiplied || self.state_count <= 1 {
+ return Ok(());
+ }
+
+ let alpha_len = self.alphabet_len();
+ premultiply_overflow_error(
+ S::from_usize(self.state_count - 1),
+ alpha_len,
+ )?;
+
+ for id in (2..self.state_count).map(S::from_usize) {
+ let offset = id.to_usize() * alpha_len;
+ for next in &mut self.trans[offset..offset + alpha_len] {
+ if *next == dead_id() {
+ continue;
+ }
+ *next = S::from_usize(next.to_usize() * alpha_len);
+ }
+ }
+ self.premultiplied = true;
+ self.start_id = S::from_usize(self.start_id.to_usize() * alpha_len);
+ self.max_match = S::from_usize(self.max_match.to_usize() * alpha_len);
+ Ok(())
+ }
+
+ /// Computes the total amount of heap used by this DFA in bytes.
+ fn calculate_size(&mut self) {
+ let mut size = (self.trans.len() * size_of::<S>())
+ + (self.matches.len()
+ * size_of::<Vec<(PatternID, PatternLength)>>());
+ for state_matches in &self.matches {
+ size +=
+ state_matches.len() * size_of::<(PatternID, PatternLength)>();
+ }
+ size += self.prefilter.as_ref().map_or(0, |p| p.as_ref().heap_bytes());
+ self.heap_bytes = size;
+ }
+}
+
+/// A builder for configuring the determinization of an NFA into a DFA.
+#[derive(Clone, Debug)]
+pub struct Builder {
+ premultiply: bool,
+ byte_classes: bool,
+}
+
+impl Builder {
+ /// Create a new builder for a DFA.
+ pub fn new() -> Builder {
+ Builder { premultiply: true, byte_classes: true }
+ }
+
+ /// Build a DFA from the given NFA.
+ ///
+ /// This returns an error if the state identifiers exceed their
+ /// representation size. This can only happen when state ids are
+ /// premultiplied (which is enabled by default).
+ pub fn build<S: StateID>(&self, nfa: &NFA<S>) -> Result<DFA<S>> {
+ let byte_classes = if self.byte_classes {
+ nfa.byte_classes().clone()
+ } else {
+ ByteClasses::singletons()
+ };
+ let alphabet_len = byte_classes.alphabet_len();
+ let trans = vec![fail_id(); alphabet_len * nfa.state_len()];
+ let matches = vec![vec![]; nfa.state_len()];
+ let mut repr = Repr {
+ match_kind: nfa.match_kind().clone(),
+ anchored: nfa.anchored(),
+ premultiplied: false,
+ start_id: nfa.start_state(),
+ max_pattern_len: nfa.max_pattern_len(),
+ pattern_count: nfa.pattern_count(),
+ state_count: nfa.state_len(),
+ max_match: fail_id(),
+ heap_bytes: 0,
+ prefilter: nfa.prefilter_obj().map(|p| p.clone()),
+ byte_classes: byte_classes.clone(),
+ trans,
+ matches,
+ };
+ for id in (0..nfa.state_len()).map(S::from_usize) {
+ repr.matches[id.to_usize()].extend_from_slice(nfa.matches(id));
+
+ let fail = nfa.failure_transition(id);
+ nfa.iter_all_transitions(&byte_classes, id, |b, mut next| {
+ if next == fail_id() {
+ next = nfa_next_state_memoized(nfa, &repr, id, fail, b);
+ }
+ repr.set_next_state(id, b, next);
+ });
+ }
+ repr.shuffle_match_states();
+ repr.calculate_size();
+ if self.premultiply {
+ repr.premultiply()?;
+ if byte_classes.is_singleton() {
+ Ok(DFA::Premultiplied(Premultiplied(repr)))
+ } else {
+ Ok(DFA::PremultipliedByteClass(PremultipliedByteClass(repr)))
+ }
+ } else {
+ if byte_classes.is_singleton() {
+ Ok(DFA::Standard(Standard(repr)))
+ } else {
+ Ok(DFA::ByteClass(ByteClass(repr)))
+ }
+ }
+ }
+
+ /// Whether or not to use byte classes in the DFA.
+ pub fn byte_classes(&mut self, yes: bool) -> &mut Builder {
+ self.byte_classes = yes;
+ self
+ }
+
+ /// Whether or not to premultiply state identifiers in the DFA.
+ pub fn premultiply(&mut self, yes: bool) -> &mut Builder {
+ self.premultiply = yes;
+ self
+ }
+}
+
+/// This returns the next NFA transition (including resolving failure
+/// transitions), except that once it sees a state ID less than the ID of the
+/// DFA state currently being populated, it no longer needs to follow failure
+/// transitions and can instead query the pre-computed state ID from the DFA
+/// itself.
+///
+/// In general, this should only be called when a failure transition is seen.
+fn nfa_next_state_memoized<S: StateID>(
+ nfa: &NFA<S>,
+ dfa: &Repr<S>,
+ populating: S,
+ mut current: S,
+ input: u8,
+) -> S {
+ loop {
+ if current < populating {
+ return dfa.next_state(current, input);
+ }
+ let next = nfa.next_state(current, input);
+ if next != fail_id() {
+ return next;
+ }
+ current = nfa.failure_transition(current);
+ }
+}
diff --git a/src/error.rs b/src/error.rs
new file mode 100644
index 0000000..7dace63
--- /dev/null
+++ b/src/error.rs
@@ -0,0 +1,101 @@
+use std::error;
+use std::fmt;
+use std::result;
+
+pub type Result<T> = result::Result<T, Error>;
+
+/// An error that occurred during the construction of an Aho-Corasick
+/// automaton.
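+///
+/// As an illustrative (editorial) sketch of when this can happen, the
+/// pattern set below is hypothetical but large enough that a `u8` cannot
+/// identify every automaton state:
+///
+/// ```
+/// use aho_corasick::AhoCorasickBuilder;
+///
+/// // Roughly a thousand patterns produce far more than 256 states.
+/// let patterns: Vec<String> = (0..1000).map(|i| i.to_string()).collect();
+/// let result = AhoCorasickBuilder::new().build_with_size::<u8, _, _>(&patterns);
+/// assert!(result.is_err());
+/// ```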
+#[derive(Clone, Debug)]
+pub struct Error {
+ kind: ErrorKind,
+}
+
+/// The kind of error that occurred.
+#[derive(Clone, Debug)]
+pub enum ErrorKind {
+ /// An error that occurs when constructing an automaton would require the
+ /// use of a state ID that overflows the chosen state ID representation.
+ /// For example, if one is using `u8` for state IDs and builds a DFA with
+ /// 257 states, then the last state's ID will be `256` which cannot be
+ /// represented with `u8`.
+ StateIDOverflow {
+ /// The maximum possible state ID.
+ max: usize,
+ },
+ /// An error that occurs when premultiplication of state IDs is requested
+ /// when constructing an Aho-Corasick DFA, but doing so would overflow the
+ /// chosen state ID representation.
+ ///
+ /// When `max == requested_max`, then the state ID would overflow `usize`.
+ PremultiplyOverflow {
+ /// The maximum possible state id.
+ max: usize,
+ /// The maximum ID required by premultiplication.
+ requested_max: usize,
+ },
+}
+
+impl Error {
+ /// Return the kind of this error.
+ pub fn kind(&self) -> &ErrorKind {
+ &self.kind
+ }
+
+ pub(crate) fn state_id_overflow(max: usize) -> Error {
+ Error { kind: ErrorKind::StateIDOverflow { max } }
+ }
+
+ pub(crate) fn premultiply_overflow(
+ max: usize,
+ requested_max: usize,
+ ) -> Error {
+ Error { kind: ErrorKind::PremultiplyOverflow { max, requested_max } }
+ }
+}
+
+impl error::Error for Error {
+ fn description(&self) -> &str {
+ match self.kind {
+ ErrorKind::StateIDOverflow { .. } => {
+ "state id representation too small"
+ }
+ ErrorKind::PremultiplyOverflow { .. } => {
+ "state id representation too small for premultiplication"
+ }
+ }
+ }
+}
+
+impl fmt::Display for Error {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ match self.kind {
+ ErrorKind::StateIDOverflow { max } => write!(
+ f,
+ "building the automaton failed because it required \
+ building more states than can be identified, where the \
+ maximum ID for the chosen representation is {}",
+ max,
+ ),
+ ErrorKind::PremultiplyOverflow { max, requested_max } => {
+ if max == requested_max {
+ write!(
+ f,
+ "premultiplication of states requires the ability to \
+ represent a state ID greater than what can fit on \
+ this platform's usize, which is {}",
+ ::std::usize::MAX,
+ )
+ } else {
+ write!(
+ f,
+ "premultiplication of states requires the ability to \
+ represent at least a state ID of {}, but the chosen \
+ representation only permits a maximum state ID of {}",
+ requested_max, max,
+ )
+ }
+ }
+ }
+ }
+}
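For context, here is a sketch of how a `StateIDOverflow` can surface through the public API, assuming the `build_with_size` method on `AhoCorasickBuilder` in this crate version (which lets the caller pick the state ID representation):

```
use aho_corasick::AhoCorasickBuilder;

// With `u8` state identifiers, only 256 states are representable, so a large
// enough pattern set fails to build with a StateIDOverflow error.
let patterns: Vec<String> = (0..1000).map(|i| format!("pattern{}", i)).collect();
let result = AhoCorasickBuilder::new().build_with_size::<u8, _, _>(&patterns);
assert!(result.is_err());
```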
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..28e984b
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,297 @@
+/*!
+A library for finding occurrences of many patterns at once. This library
+provides multiple pattern search principally through an implementation of the
+[Aho-Corasick algorithm](https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm),
+which builds a fast finite state machine for executing searches in linear time.
+
+Additionally, this library provides a number of configuration options for
+building the automaton that permit controlling the space versus time trade
+off. Other features include simple ASCII case insensitive matching, finding
+overlapping matches, replacements, searching streams and even searching and
+replacing text in streams.
+
+Finally, unlike all other (known) Aho-Corasick implementations, this one
+supports enabling
+[leftmost-first](enum.MatchKind.html#variant.LeftmostFirst)
+or
+[leftmost-longest](enum.MatchKind.html#variant.LeftmostLongest)
+match semantics, using a (seemingly) novel alternative construction algorithm.
+For more details on what match semantics means, see the
+[`MatchKind`](enum.MatchKind.html)
+type.
+
+# Overview
+
+This section gives a brief overview of the primary types in this crate:
+
+* [`AhoCorasick`](struct.AhoCorasick.html) is the primary type and represents
+ an Aho-Corasick automaton. This is the type you use to execute searches.
+* [`AhoCorasickBuilder`](struct.AhoCorasickBuilder.html) can be used to build
+ an Aho-Corasick automaton, and supports configuring a number of options.
+* [`Match`](struct.Match.html) represents a single match reported by an
+ Aho-Corasick automaton. Each match has two pieces of information: the pattern
+ that matched and the start and end byte offsets corresponding to the position
+ in the haystack at which it matched.
+
+Additionally, the [`packed`](packed/index.html) sub-module contains a lower
+level API for using fast vectorized routines for finding a small number of
+patterns in a haystack.
+
+# Example: basic searching
+
+This example shows how to search for occurrences of multiple patterns
+simultaneously. Each match includes the pattern that matched along with the
+byte offsets of the match.
+
+```
+use aho_corasick::AhoCorasick;
+
+let patterns = &["apple", "maple", "Snapple"];
+let haystack = "Nobody likes maple in their apple flavored Snapple.";
+
+let ac = AhoCorasick::new(patterns);
+let mut matches = vec![];
+for mat in ac.find_iter(haystack) {
+ matches.push((mat.pattern(), mat.start(), mat.end()));
+}
+assert_eq!(matches, vec![
+ (1, 13, 18),
+ (0, 28, 33),
+ (2, 43, 50),
+]);
+```
+
+# Example: case insensitivity
+
+This is like the previous example, but matches `Snapple` case insensitively
+using `AhoCorasickBuilder`:
+
+```
+use aho_corasick::AhoCorasickBuilder;
+
+let patterns = &["apple", "maple", "snapple"];
+let haystack = "Nobody likes maple in their apple flavored Snapple.";
+
+let ac = AhoCorasickBuilder::new()
+ .ascii_case_insensitive(true)
+ .build(patterns);
+let mut matches = vec![];
+for mat in ac.find_iter(haystack) {
+ matches.push((mat.pattern(), mat.start(), mat.end()));
+}
+assert_eq!(matches, vec![
+ (1, 13, 18),
+ (0, 28, 33),
+ (2, 43, 50),
+]);
+```
+
+# Example: replacing matches in a stream
+
+This example shows how to execute a search and replace on a stream without
+loading the entire stream into memory first.
+
+```
+use aho_corasick::AhoCorasick;
+
+# fn example() -> Result<(), ::std::io::Error> {
+let patterns = &["fox", "brown", "quick"];
+let replace_with = &["sloth", "grey", "slow"];
+
+// In a real example, these might be `std::fs::File`s instead. All you need to
+// do is supply a pair of `std::io::Read` and `std::io::Write` implementations.
+let rdr = "The quick brown fox.";
+let mut wtr = vec![];
+
+let ac = AhoCorasick::new(patterns);
+ac.stream_replace_all(rdr.as_bytes(), &mut wtr, replace_with)?;
+assert_eq!(b"The slow grey sloth.".to_vec(), wtr);
+# Ok(()) }; example().unwrap()
+```
+
+# Example: finding the leftmost first match
+
+In the textbook description of Aho-Corasick, its formulation is typically
+structured such that it reports all possible matches, even when they overlap
+with another. In many cases, overlapping matches may not be desired, such as
+the case of finding all successive non-overlapping matches like you might with
+a standard regular expression.
+
+Unfortunately the "obvious" way to modify the Aho-Corasick algorithm to do
+this doesn't always work in the expected way, since it will report matches as
+soon as they are seen. For example, consider matching the regex `Samwise|Sam`
+against the text `Samwise`. Most regex engines (that are Perl-like, or
+non-POSIX) will report `Samwise` as a match, but the standard Aho-Corasick
+algorithm modified for reporting non-overlapping matches will report `Sam`.
+
+A novel contribution of this library is the ability to change the match
+semantics of Aho-Corasick (without additional search time overhead) such that
+`Samwise` is reported instead. For example, here's the standard approach:
+
+```
+use aho_corasick::AhoCorasick;
+
+let patterns = &["Samwise", "Sam"];
+let haystack = "Samwise";
+
+let ac = AhoCorasick::new(patterns);
+let mat = ac.find(haystack).expect("should have a match");
+assert_eq!("Sam", &haystack[mat.start()..mat.end()]);
+```
+
+And now here's the leftmost-first version, which matches how a Perl-like
+regex will work:
+
+```
+use aho_corasick::{AhoCorasickBuilder, MatchKind};
+
+let patterns = &["Samwise", "Sam"];
+let haystack = "Samwise";
+
+let ac = AhoCorasickBuilder::new()
+ .match_kind(MatchKind::LeftmostFirst)
+ .build(patterns);
+let mat = ac.find(haystack).expect("should have a match");
+assert_eq!("Samwise", &haystack[mat.start()..mat.end()]);
+```
+
+In addition to leftmost-first semantics, this library also supports
+leftmost-longest semantics, which match the POSIX behavior of a regular
+expression alternation. See
+[`MatchKind`](enum.MatchKind.html)
+for more details.
+
+# Prefilters
+
+While an Aho-Corasick automaton can perform admirably when compared to more
+naive solutions, it is generally slower than more specialized algorithms that
+are accelerated using vector instructions such as SIMD.
+
+For that reason, this library will internally use a "prefilter" to attempt
+to accelerate searches when possible. Currently, this library has a fairly
+limited implementation that only applies when there are 3 or fewer unique
+starting bytes among all patterns in an automaton.
+
+While a prefilter is generally good to have on by default since it works well
+in the common case, it can lead to less predictable or even sub-optimal
+performance in some cases. For that reason, prefilters can be disabled via
+[`AhoCorasickBuilder::prefilter`](struct.AhoCorasickBuilder.html#method.prefilter).
+*/
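As a small illustration of the prefilter knob mentioned at the end of the module documentation, here is a sketch of disabling it; match results are unchanged, only performance characteristics may differ:

```
use aho_corasick::AhoCorasickBuilder;

// Disable the prefilter; the search results are identical either way.
let ac = AhoCorasickBuilder::new()
    .prefilter(false)
    .build(&["apple", "maple"]);
let mat = ac.find("one maple syrup").expect("should match");
assert_eq!(1, mat.pattern());
```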
+
+#![deny(missing_docs)]
+
+// We can never be truly no_std, but we could be alloc-only some day, so
+// require the std feature for now.
+#[cfg(not(feature = "std"))]
+compile_error!("`std` feature is currently required to build this crate");
+
+extern crate memchr;
+#[cfg(test)]
+#[macro_use]
+extern crate doc_comment;
+
+#[cfg(test)]
+doctest!("../README.md");
+
+pub use ahocorasick::{
+ AhoCorasick, AhoCorasickBuilder, FindIter, FindOverlappingIter, MatchKind,
+ StreamFindIter,
+};
+pub use error::{Error, ErrorKind};
+pub use state_id::StateID;
+
+mod ahocorasick;
+mod automaton;
+mod buffer;
+mod byte_frequencies;
+mod classes;
+mod dfa;
+mod error;
+mod nfa;
+pub mod packed;
+mod prefilter;
+mod state_id;
+#[cfg(test)]
+mod tests;
+
+/// A representation of a match reported by an Aho-Corasick automaton.
+///
+/// A match has two essential pieces of information: the identifier of the
+/// pattern that matched, along with the start and end offsets of the match
+/// in the haystack.
+///
+/// # Examples
+///
+/// Basic usage:
+///
+/// ```
+/// use aho_corasick::AhoCorasick;
+///
+/// let ac = AhoCorasick::new(&[
+/// "foo", "bar", "baz",
+/// ]);
+/// let mat = ac.find("xxx bar xxx").expect("should have a match");
+/// assert_eq!(1, mat.pattern());
+/// assert_eq!(4, mat.start());
+/// assert_eq!(7, mat.end());
+/// ```
+#[derive(Clone, Debug, Eq, Hash, PartialEq)]
+pub struct Match {
+ /// The pattern id.
+ pattern: usize,
+ /// The length of this match, such that the starting position of the match
+ /// is `end - len`.
+ ///
+ /// We use length here because, other than the pattern id, the only
+ /// information about each pattern that the automaton stores is its length.
+ /// So using the length here is just a bit more natural. But it isn't
+ /// technically required.
+ len: usize,
+ /// The end offset of the match, exclusive.
+ end: usize,
+}
+
+impl Match {
+ /// Returns the identifier of the pattern that matched.
+ ///
+ /// The identifier of a pattern is derived from the position in which it
+ /// was originally inserted into the corresponding automaton. The first
+ /// pattern has identifier `0`, and each subsequent pattern is `1`, `2`
+ /// and so on.
+ #[inline]
+ pub fn pattern(&self) -> usize {
+ self.pattern
+ }
+
+ /// The starting position of the match.
+ #[inline]
+ pub fn start(&self) -> usize {
+ self.end - self.len
+ }
+
+ /// The ending position of the match.
+ #[inline]
+ pub fn end(&self) -> usize {
+ self.end
+ }
+
+ /// Returns true if and only if this match is empty. That is, when
+ /// `start() == end()`.
+ ///
+ /// An empty match can only be returned when the empty string was among
+ /// the patterns used to build the Aho-Corasick automaton.
+ #[inline]
+ pub fn is_empty(&self) -> bool {
+ self.len == 0
+ }
+
+ #[inline]
+ fn increment(&self, by: usize) -> Match {
+ Match { pattern: self.pattern, len: self.len, end: self.end + by }
+ }
+
+ #[inline]
+ fn from_span(id: usize, start: usize, end: usize) -> Match {
+ Match { pattern: id, len: end - start, end }
+ }
+}
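To round out the `is_empty` documentation above, a short sketch of the empty-match case; it assumes, per the docs, that an empty match can only arise when the empty pattern is among the patterns:

```
use aho_corasick::AhoCorasick;

// The empty pattern produces empty matches, where start() == end().
let ac = AhoCorasick::new(&[""]);
let mat = ac.find("haystack").expect("empty pattern should match");
assert!(mat.is_empty());
assert_eq!(mat.start(), mat.end());
```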
diff --git a/src/nfa.rs b/src/nfa.rs
new file mode 100644
index 0000000..809d5ef
--- /dev/null
+++ b/src/nfa.rs
@@ -0,0 +1,1363 @@
+use std::cmp;
+use std::collections::{BTreeSet, VecDeque};
+use std::fmt;
+use std::mem::size_of;
+use std::ops::{Index, IndexMut};
+
+use ahocorasick::MatchKind;
+use automaton::Automaton;
+use classes::{ByteClassBuilder, ByteClasses};
+use error::Result;
+use prefilter::{self, opposite_ascii_case, Prefilter, PrefilterObj};
+use state_id::{dead_id, fail_id, usize_to_state_id, StateID};
+use Match;
+
+/// The identifier for a pattern, which is simply the position of the pattern
+/// in the sequence of patterns given by the caller.
+pub type PatternID = usize;
+
+/// The length of a pattern, in bytes.
+pub type PatternLength = usize;
+
+/// An Aho-Corasick automaton, represented as an NFA.
+///
+/// This is the classical formulation of Aho-Corasick, which involves building
+/// up a prefix trie of a given set of patterns, and then wiring up failure
+/// transitions between states in order to guarantee linear time matching. The
+/// standard formulation is, technically, an NFA because of these failure
+/// transitions. That is, one can see them as enabling the automaton to be in
+/// multiple states at once. Indeed, during search, it is possible to check
+/// the transitions on multiple states for a single input byte.
+///
+/// This particular implementation not only supports the standard style of
+/// matching, but also provides a mode for choosing leftmost-first or
+/// leftmost-longest match semantics. When a leftmost mode is chosen, some
+/// failure transitions that would otherwise be added are elided. See
+/// the documentation of `MatchKind` for more details and examples on how the
+/// match semantics may differ.
+///
+/// If one wants a DFA, then it is necessary to first build an NFA and convert
+/// it into a DFA. Note, however, that because we've constrained ourselves to
+/// matching literal patterns, this does not need to use subset construction
+/// for determinization. Instead, the DFA has at most as many states as the
+/// NFA. The only real difference between
+/// them is that all failure transitions are followed and pre-computed. This
+/// uses much more memory, but also executes searches more quickly.
+#[derive(Clone)]
+pub struct NFA<S> {
+ /// The match semantics built into this NFA.
+ match_kind: MatchKind,
+ /// The start state id as an index into `states`.
+ start_id: S,
+ /// The length, in bytes, of the longest pattern in this automaton. This
+ /// information is useful for keeping correct buffer sizes when searching
+ /// on streams.
+ max_pattern_len: usize,
+ /// The total number of patterns added to this automaton, including
+ /// patterns that may never be matched.
+ pattern_count: usize,
+ /// The number of bytes of heap used by this NFA's transition table.
+ heap_bytes: usize,
+ /// A prefilter for quickly skipping to candidate matches, if pertinent.
+ prefilter: Option<PrefilterObj>,
+ /// Whether this automaton anchors all matches to the start of input.
+ anchored: bool,
+ /// A set of equivalence classes in terms of bytes. We compute this while
+ /// building the NFA, but don't use it in the NFA's states. Instead, we
+ /// use this for building the DFA. We store it on the NFA since it's easy
+ /// to compute while visiting the patterns.
+ byte_classes: ByteClasses,
+ /// A set of states. Each state defines its own transitions, a fail
+ /// transition and a set of indices corresponding to matches.
+ ///
+ /// The first state is always the fail state, which is used only as a
+ /// sentinel. Namely, in the final NFA, no transition into the fail state
+ /// exists. (Well, they do, but they aren't followed. Instead, the state's
+ /// failure transition is followed.)
+ ///
+ /// The second state (index 1) is always the dead state. Dead states are
+ /// in every automaton, but only used when leftmost-{first,longest} match
+ /// semantics are enabled. Specifically, they instruct search to stop
+ /// at specific points in order to report the correct match location. In
+ /// the standard Aho-Corasick construction, there are no transitions to
+ /// the dead state.
+ ///
+ /// The third state (index 2) is generally intended to be the starting or
+ /// "root" state.
+ states: Vec<State<S>>,
+}
+
+impl<S: StateID> NFA<S> {
+ /// Returns the equivalence classes of bytes found while constructing
+ /// this NFA.
+ ///
+ /// Note that the NFA doesn't actually make use of these equivalence
+ /// classes. Instead, these are useful for building the DFA when desired.
+ pub fn byte_classes(&self) -> &ByteClasses {
+ &self.byte_classes
+ }
+
+ /// Returns a prefilter, if one exists.
+ pub fn prefilter_obj(&self) -> Option<&PrefilterObj> {
+ self.prefilter.as_ref()
+ }
+
+ /// Returns the total number of heap bytes used by this NFA's transition
+ /// table.
+ pub fn heap_bytes(&self) -> usize {
+ self.heap_bytes
+ + self.prefilter.as_ref().map_or(0, |p| p.as_ref().heap_bytes())
+ }
+
+ /// Return the length of the longest pattern in this automaton.
+ pub fn max_pattern_len(&self) -> usize {
+ self.max_pattern_len
+ }
+
+ /// Return the total number of patterns added to this automaton.
+ pub fn pattern_count(&self) -> usize {
+ self.pattern_count
+ }
+
+ /// Returns the total number of states in this NFA.
+ pub fn state_len(&self) -> usize {
+ self.states.len()
+ }
+
+ /// Returns the matches for the given state.
+ pub fn matches(&self, id: S) -> &[(PatternID, PatternLength)] {
+ &self.states[id.to_usize()].matches
+ }
+
+ /// Returns an iterator over all transitions in the given state according
+ /// to the given equivalence classes, including transitions to `fail_id()`.
+ /// The number of transitions returned is always equivalent to the number
+ /// of equivalence classes.
+ pub fn iter_all_transitions<F: FnMut(u8, S)>(
+ &self,
+ byte_classes: &ByteClasses,
+ id: S,
+ f: F,
+ ) {
+ self.states[id.to_usize()].trans.iter_all(byte_classes, f);
+ }
+
+ /// Returns the failure transition for the given state.
+ pub fn failure_transition(&self, id: S) -> S {
+ self.states[id.to_usize()].fail
+ }
+
+ /// Returns the next state for the given state and input byte.
+ ///
+ /// Note that this does not follow failure transitions. As such, the id
+ /// returned may be `fail_id`.
+ pub fn next_state(&self, current: S, input: u8) -> S {
+ self.states[current.to_usize()].next_state(input)
+ }
+
+ fn state(&self, id: S) -> &State<S> {
+ &self.states[id.to_usize()]
+ }
+
+ fn state_mut(&mut self, id: S) -> &mut State<S> {
+ &mut self.states[id.to_usize()]
+ }
+
+ fn start(&self) -> &State<S> {
+ self.state(self.start_id)
+ }
+
+ fn start_mut(&mut self) -> &mut State<S> {
+ let id = self.start_id;
+ self.state_mut(id)
+ }
+
+ fn iter_transitions_mut(&mut self, id: S) -> IterTransitionsMut<S> {
+ IterTransitionsMut::new(self, id)
+ }
+
+ fn copy_matches(&mut self, src: S, dst: S) {
+ let (src, dst) =
+ get_two_mut(&mut self.states, src.to_usize(), dst.to_usize());
+ dst.matches.extend_from_slice(&src.matches);
+ }
+
+ fn copy_empty_matches(&mut self, dst: S) {
+ let start_id = self.start_id;
+ self.copy_matches(start_id, dst);
+ }
+
+ fn add_dense_state(&mut self, depth: usize) -> Result<S> {
+ let trans = Transitions::Dense(Dense::new());
+ let id = usize_to_state_id(self.states.len())?;
+ self.states.push(State {
+ trans,
+ // Anchored automatons do not have any failure transitions.
+ fail: if self.anchored { dead_id() } else { self.start_id },
+ depth,
+ matches: vec![],
+ });
+ Ok(id)
+ }
+
+ fn add_sparse_state(&mut self, depth: usize) -> Result<S> {
+ let trans = Transitions::Sparse(vec![]);
+ let id = usize_to_state_id(self.states.len())?;
+ self.states.push(State {
+ trans,
+ // Anchored automatons do not have any failure transitions.
+ fail: if self.anchored { dead_id() } else { self.start_id },
+ depth,
+ matches: vec![],
+ });
+ Ok(id)
+ }
+}
+
+impl<S: StateID> Automaton for NFA<S> {
+ type ID = S;
+
+ fn match_kind(&self) -> &MatchKind {
+ &self.match_kind
+ }
+
+ fn anchored(&self) -> bool {
+ self.anchored
+ }
+
+ fn prefilter(&self) -> Option<&dyn Prefilter> {
+ self.prefilter.as_ref().map(|p| p.as_ref())
+ }
+
+ fn start_state(&self) -> S {
+ self.start_id
+ }
+
+ fn is_valid(&self, id: S) -> bool {
+ id.to_usize() < self.states.len()
+ }
+
+ fn is_match_state(&self, id: S) -> bool {
+ self.states[id.to_usize()].is_match()
+ }
+
+ fn get_match(
+ &self,
+ id: S,
+ match_index: usize,
+ end: usize,
+ ) -> Option<Match> {
+ let state = match self.states.get(id.to_usize()) {
+ None => return None,
+ Some(state) => state,
+ };
+ state.matches.get(match_index).map(|&(id, len)| Match {
+ pattern: id,
+ len,
+ end,
+ })
+ }
+
+ fn match_count(&self, id: S) -> usize {
+ self.states[id.to_usize()].matches.len()
+ }
+
+ fn next_state(&self, mut current: S, input: u8) -> S {
+ // This terminates since:
+ //
+ // 1. `State.fail` never points to fail_id().
+ // 2. All `State.fail` values point to a state closer to `start`.
+ // 3. The start state has no transitions to fail_id().
+ loop {
+ let state = &self.states[current.to_usize()];
+ let next = state.next_state(input);
+ if next != fail_id() {
+ return next;
+ }
+ current = state.fail;
+ }
+ }
+}
+
+/// A representation of an NFA state for an Aho-Corasick automaton.
+///
+/// It contains the transitions to the next state, a failure transition for
+/// cases where there exists no other transition for the current input byte,
+/// the matches implied by visiting this state (if any) and the depth of this
+/// state. The depth of a state is simply the distance from it to the start
+/// state in the automaton, where the depth of the start state is 0.
+#[derive(Clone, Debug)]
+pub struct State<S> {
+ trans: Transitions<S>,
+ fail: S,
+ matches: Vec<(PatternID, PatternLength)>,
+ // TODO: Strictly speaking, this isn't needed for searching. It's only
+ // used when building an NFA that supports leftmost match semantics. We
+ // could drop this from the state and dynamically build a map only when
+ // computing failure transitions, but it's not clear which is better.
+ // Benchmark this.
+ depth: usize,
+}
+
+impl<S: StateID> State<S> {
+ fn heap_bytes(&self) -> usize {
+ self.trans.heap_bytes()
+ + (self.matches.len() * size_of::<(PatternID, PatternLength)>())
+ }
+
+ fn add_match(&mut self, i: PatternID, len: PatternLength) {
+ self.matches.push((i, len));
+ }
+
+ fn is_match(&self) -> bool {
+ !self.matches.is_empty()
+ }
+
+ fn get_longest_match_len(&self) -> Option<usize> {
+ // Why is this true? Because the first match in any matching state
+ // will always correspond to the match added to it during trie
+ // construction (since when we copy matches due to failure transitions,
+ // we always append them). Therefore, it follows that the first match
+ // must always be longest since any subsequent match must be from a
+ // failure transition, and a failure transition by construction points
+ // to a proper suffix. A proper suffix is, by definition, smaller.
+ self.matches.get(0).map(|&(_, len)| len)
+ }
+
+ fn next_state(&self, input: u8) -> S {
+ self.trans.next_state(input)
+ }
+
+ fn set_next_state(&mut self, input: u8, next: S) {
+ self.trans.set_next_state(input, next);
+ }
+}
+
+/// Represents the transitions for a single dense state.
+///
+/// The primary purpose here is to encapsulate index access. Namely, since a
+/// dense representation always contains 256 elements, all values of `u8` are
+/// valid indices.
+#[derive(Clone, Debug)]
+struct Dense<S>(Vec<S>);
+
+impl<S> Dense<S>
+where
+ S: StateID,
+{
+ fn new() -> Self {
+ Dense(vec![fail_id(); 256])
+ }
+
+ #[inline]
+ fn len(&self) -> usize {
+ self.0.len()
+ }
+}
+
+impl<S> Index<u8> for Dense<S> {
+ type Output = S;
+
+ #[inline]
+ fn index(&self, i: u8) -> &S {
+ // SAFETY: This is safe because all dense transitions have
+ // exactly 256 elements, so all u8 values are valid indices.
+ &self.0[i as usize]
+ }
+}
+
+impl<S> IndexMut<u8> for Dense<S> {
+ #[inline]
+ fn index_mut(&mut self, i: u8) -> &mut S {
+ // SAFETY: This is safe because all dense transitions have
+ // exactly 256 elements, so all u8 values are valid indices.
+ &mut self.0[i as usize]
+ }
+}
+
+/// A representation of transitions in an NFA.
+///
+/// Transitions have either a sparse representation, which is slower for
+/// lookups but uses less memory, or a dense representation, which is faster
+/// for lookups but uses more memory. In the sparse representation, the absence
+/// of a state implies a transition to `fail_id()`. Transitions to `dead_id()`
+/// are still explicitly represented.
+///
+/// For the NFA, by default, we use a dense representation for transitions for
+/// states close to the start state because it's likely these are the states
+/// that will be most frequently visited.
+#[derive(Clone, Debug)]
+enum Transitions<S> {
+ Sparse(Vec<(u8, S)>),
+ Dense(Dense<S>),
+}
+
+impl<S: StateID> Transitions<S> {
+ fn heap_bytes(&self) -> usize {
+ match *self {
+ Transitions::Sparse(ref sparse) => {
+ sparse.len() * size_of::<(u8, S)>()
+ }
+ Transitions::Dense(ref dense) => dense.len() * size_of::<S>(),
+ }
+ }
+
+ fn next_state(&self, input: u8) -> S {
+ match *self {
+ Transitions::Sparse(ref sparse) => {
+ for &(b, id) in sparse {
+ if b == input {
+ return id;
+ }
+ }
+ fail_id()
+ }
+ Transitions::Dense(ref dense) => dense[input],
+ }
+ }
+
+ fn set_next_state(&mut self, input: u8, next: S) {
+ match *self {
+ Transitions::Sparse(ref mut sparse) => {
+ match sparse.binary_search_by_key(&input, |&(b, _)| b) {
+ Ok(i) => sparse[i] = (input, next),
+ Err(i) => sparse.insert(i, (input, next)),
+ }
+ }
+ Transitions::Dense(ref mut dense) => {
+ dense[input] = next;
+ }
+ }
+ }
+
+ /// Iterate over transitions in this state while skipping over transitions
+ /// to `fail_id()`.
+ fn iter<F: FnMut(u8, S)>(&self, mut f: F) {
+ match *self {
+ Transitions::Sparse(ref sparse) => {
+ for &(b, id) in sparse {
+ f(b, id);
+ }
+ }
+ Transitions::Dense(ref dense) => {
+ for b in AllBytesIter::new() {
+ let id = dense[b];
+ if id != fail_id() {
+ f(b, id);
+ }
+ }
+ }
+ }
+ }
+
+ /// Iterate over all transitions in this state according to the given
+ /// equivalence classes, including transitions to `fail_id()`.
+ fn iter_all<F: FnMut(u8, S)>(&self, classes: &ByteClasses, mut f: F) {
+ if classes.is_singleton() {
+ match *self {
+ Transitions::Sparse(ref sparse) => {
+ sparse_iter(sparse, f);
+ }
+ Transitions::Dense(ref dense) => {
+ for b in AllBytesIter::new() {
+ f(b, dense[b]);
+ }
+ }
+ }
+ } else {
+ // In this case, we only want to yield a single byte for each
+ // equivalence class.
+ match *self {
+ Transitions::Sparse(ref sparse) => {
+ let mut last_class = None;
+ sparse_iter(sparse, |b, next| {
+ let class = classes.get(b);
+ if last_class != Some(class) {
+ last_class = Some(class);
+ f(b, next);
+ }
+ })
+ }
+ Transitions::Dense(ref dense) => {
+ for b in classes.representatives() {
+ f(b, dense[b]);
+ }
+ }
+ }
+ }
+ }
+}
+
+/// Iterator over transitions in a state, skipping transitions to `fail_id()`.
+///
+/// This abstracts over the representation of NFA transitions, which may be
+/// either in a sparse or dense representation.
+///
+/// This somewhat idiosyncratically borrows the NFA mutably, so that when one
+/// is iterating over transitions, the caller can still mutate the NFA. This
+/// is useful when creating failure transitions.
+#[derive(Debug)]
+struct IterTransitionsMut<'a, S: StateID + 'a> {
+ nfa: &'a mut NFA<S>,
+ state_id: S,
+ cur: usize,
+}
+
+impl<'a, S: StateID> IterTransitionsMut<'a, S> {
+ fn new(nfa: &'a mut NFA<S>, state_id: S) -> IterTransitionsMut<'a, S> {
+ IterTransitionsMut { nfa, state_id, cur: 0 }
+ }
+
+ fn nfa(&mut self) -> &mut NFA<S> {
+ self.nfa
+ }
+}
+
+impl<'a, S: StateID> Iterator for IterTransitionsMut<'a, S> {
+ type Item = (u8, S);
+
+ fn next(&mut self) -> Option<(u8, S)> {
+ match self.nfa.states[self.state_id.to_usize()].trans {
+ Transitions::Sparse(ref sparse) => {
+ if self.cur >= sparse.len() {
+ return None;
+ }
+ let i = self.cur;
+ self.cur += 1;
+ Some(sparse[i])
+ }
+ Transitions::Dense(ref dense) => {
+ while self.cur < dense.len() {
+ // There are always exactly 256 transitions in dense repr.
+ debug_assert!(self.cur < 256);
+
+ let b = self.cur as u8;
+ let id = dense[b];
+ self.cur += 1;
+ if id != fail_id() {
+ return Some((b, id));
+ }
+ }
+ None
+ }
+ }
+ }
+}
+
+/// A simple builder for configuring the NFA construction of Aho-Corasick.
+#[derive(Clone, Debug)]
+pub struct Builder {
+ dense_depth: usize,
+ match_kind: MatchKind,
+ prefilter: bool,
+ anchored: bool,
+ ascii_case_insensitive: bool,
+}
+
+impl Default for Builder {
+ fn default() -> Builder {
+ Builder {
+ dense_depth: 2,
+ match_kind: MatchKind::default(),
+ prefilter: true,
+ anchored: false,
+ ascii_case_insensitive: false,
+ }
+ }
+}
+
+impl Builder {
+ pub fn new() -> Builder {
+ Builder::default()
+ }
+
+ pub fn build<I, P, S: StateID>(&self, patterns: I) -> Result<NFA<S>>
+ where
+ I: IntoIterator<Item = P>,
+ P: AsRef<[u8]>,
+ {
+ Compiler::new(self)?.compile(patterns)
+ }
+
+ pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder {
+ self.match_kind = kind;
+ self
+ }
+
+ pub fn dense_depth(&mut self, depth: usize) -> &mut Builder {
+ self.dense_depth = depth;
+ self
+ }
+
+ pub fn prefilter(&mut self, yes: bool) -> &mut Builder {
+ self.prefilter = yes;
+ self
+ }
+
+ pub fn anchored(&mut self, yes: bool) -> &mut Builder {
+ self.anchored = yes;
+ self
+ }
+
+ pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder {
+ self.ascii_case_insensitive = yes;
+ self
+ }
+}
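These internal NFA builder knobs mirror options on the public `AhoCorasickBuilder`. A rough sketch of how they surface to users, assuming the `anchored` and `dense_depth` methods of this crate version:

```
use aho_corasick::AhoCorasickBuilder;

// Anchored matching only reports matches starting at the search position;
// dense_depth controls how many trie levels use the dense representation.
let ac = AhoCorasickBuilder::new()
    .anchored(true)
    .dense_depth(3)
    .build(&["foo", "bar"]);
assert!(ac.is_match("foobar"));
assert!(!ac.is_match("xfoobar"));
```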
+
+/// A compiler uses a builder configuration and builds up the NFA formulation
+/// of an Aho-Corasick automaton. This roughly corresponds to the standard
+/// formulation described in textbooks.
+#[derive(Debug)]
+struct Compiler<'a, S: StateID> {
+ builder: &'a Builder,
+ prefilter: prefilter::Builder,
+ nfa: NFA<S>,
+ byte_classes: ByteClassBuilder,
+}
+
+impl<'a, S: StateID> Compiler<'a, S> {
+ fn new(builder: &'a Builder) -> Result<Compiler<'a, S>> {
+ Ok(Compiler {
+ builder,
+ prefilter: prefilter::Builder::new(builder.match_kind)
+ .ascii_case_insensitive(builder.ascii_case_insensitive),
+ nfa: NFA {
+ match_kind: builder.match_kind,
+ start_id: usize_to_state_id(2)?,
+ max_pattern_len: 0,
+ pattern_count: 0,
+ heap_bytes: 0,
+ prefilter: None,
+ anchored: builder.anchored,
+ byte_classes: ByteClasses::singletons(),
+ states: vec![],
+ },
+ byte_classes: ByteClassBuilder::new(),
+ })
+ }
+
+ fn compile<I, P>(mut self, patterns: I) -> Result<NFA<S>>
+ where
+ I: IntoIterator<Item = P>,
+ P: AsRef<[u8]>,
+ {
+ self.add_state(0)?; // the fail state, which is never entered
+ self.add_state(0)?; // the dead state, only used for leftmost
+ self.add_state(0)?; // the start state
+ self.build_trie(patterns)?;
+ self.add_start_state_loop();
+ self.add_dead_state_loop();
+ if !self.builder.anchored {
+ if self.match_kind().is_leftmost() {
+ self.fill_failure_transitions_leftmost();
+ } else {
+ self.fill_failure_transitions_standard();
+ }
+ }
+ self.close_start_state_loop();
+ self.nfa.byte_classes = self.byte_classes.build();
+ if !self.builder.anchored {
+ self.nfa.prefilter = self.prefilter.build();
+ }
+ self.calculate_size();
+ Ok(self.nfa)
+ }
+
+ /// This sets up the initial prefix trie that makes up the Aho-Corasick
+ /// automaton. Effectively, it creates the basic structure of the
+ /// automaton, where every pattern given has a path from the start state to
+ /// the end of the pattern.
+ fn build_trie<I, P>(&mut self, patterns: I) -> Result<()>
+ where
+ I: IntoIterator<Item = P>,
+ P: AsRef<[u8]>,
+ {
+ 'PATTERNS: for (pati, pat) in patterns.into_iter().enumerate() {
+ let pat = pat.as_ref();
+ self.nfa.max_pattern_len =
+ cmp::max(self.nfa.max_pattern_len, pat.len());
+ self.nfa.pattern_count += 1;
+
+ let mut prev = self.nfa.start_id;
+ let mut saw_match = false;
+ for (depth, &b) in pat.iter().enumerate() {
+ // When leftmost-first match semantics are requested, we
+ // specifically stop adding patterns when a previously added
+ // pattern is a prefix of it. We avoid adding it because
+ // leftmost-first semantics imply that the pattern can never
+ // match. This is not just an optimization to save space! It
+ // is necessary for correctness. In fact, this is the only
+ // difference in the automaton between the implementations for
+ // leftmost-first and leftmost-longest.
+ saw_match = saw_match || self.nfa.state(prev).is_match();
+ if self.builder.match_kind.is_leftmost_first() && saw_match {
+ // Skip to the next pattern immediately. This avoids
+ // incorrectly adding a match after this loop terminates.
+ continue 'PATTERNS;
+ }
+
+ // Add this byte to our equivalence classes. We don't use these
+ // for NFA construction. These are instead used only if we're
+ // building a DFA. They would technically be useful for the
+ // NFA, but it would require a second pass over the patterns.
+ self.byte_classes.set_range(b, b);
+ if self.builder.ascii_case_insensitive {
+ let b = opposite_ascii_case(b);
+ self.byte_classes.set_range(b, b);
+ }
+
+ // If the transition from prev using the current byte already
+ // exists, then just move through it. Otherwise, add a new
+ // state. We track the depth here so that we can determine
+ // how to represent transitions. States near the start state
+ // use a dense representation that uses more memory but is
+ // faster. Other states use a sparse representation that uses
+ // less memory but is slower.
+ let next = self.nfa.state(prev).next_state(b);
+ if next != fail_id() {
+ prev = next;
+ } else {
+ let next = self.add_state(depth + 1)?;
+ self.nfa.state_mut(prev).set_next_state(b, next);
+ if self.builder.ascii_case_insensitive {
+ let b = opposite_ascii_case(b);
+ self.nfa.state_mut(prev).set_next_state(b, next);
+ }
+ prev = next;
+ }
+ }
+ // Once the pattern has been added, log the match in the final
+ // state that it reached.
+ self.nfa.state_mut(prev).add_match(pati, pat.len());
+ // ... and hand it to the prefilter builder, if applicable.
+ if self.builder.prefilter {
+ self.prefilter.add(pat);
+ }
+ }
+ Ok(())
+ }
+
+ /// This routine creates failure transitions according to the standard
+ /// textbook formulation of the Aho-Corasick algorithm.
+ ///
+ /// Building failure transitions is the most interesting part of building
+ /// the Aho-Corasick automaton, because they are what allow searches to
+ /// be performed in linear time. Specifically, a failure transition is
+ /// a single transition associated with each state that points back to
+ /// the longest proper suffix of the pattern being searched. The failure
+ /// transition is followed whenever there exists no transition on the
+ /// current state for the current input byte. If there is no other proper
+ /// suffix, then the failure transition points back to the starting state.
+ ///
+ /// For example, let's say we built an Aho-Corasick automaton with the
+ /// following patterns: 'abcd' and 'cef'. The trie looks like this:
+ ///
+ /// ```ignore
+ /// a - S1 - b - S2 - c - S3 - d - S4*
+ /// /
+ /// S0 - c - S5 - e - S6 - f - S7*
+ /// ```
+ ///
+ /// At this point, it should be fairly straight-forward to see how this
+ /// trie can be used in a simplistic way. At any given position in the
+ /// text we're searching (called the "subject" string), all we need to do
+ /// is follow the transitions in the trie by consuming one transition for
+ /// each byte in the subject string. If we reach a match state, then we can
+ /// report that location as a match.
+ ///
+ /// The trick comes when searching a subject string like 'abcef'. We'll
+ /// initially follow the transition from S0 to S1 and wind up in S3 after
+ /// observing the 'c' byte. At this point, the next byte is 'e' but state
+ /// S3 has no transition for 'e', so the search fails. We then would need
+ /// to restart the search at the next position in 'abcef', which
+ /// corresponds to 'b'. The match would fail, but the next search starting
+ /// at 'c' would finally succeed. The problem with this approach is that
+ /// we wind up searching the subject string potentially many times. In
+ /// effect, this makes the algorithm have worst case `O(n * m)` complexity,
+ /// where `n ~ len(subject)` and `m ~ len(all patterns)`. We would instead
+ /// like to achieve a `O(n + m)` worst case complexity.
+ ///
+ /// This is where failure transitions come in. Instead of dying at S3 in
+ /// the first search, the automaton can instruct the search to move to
+ /// another part of the automaton that corresponds to a suffix of what
+ /// we've seen so far. Recall that we've seen 'abc' in the subject string,
+ /// and the automaton does indeed have a non-empty suffix, 'c', that could
+ /// potentially lead to another match. Thus, the actual Aho-Corasick
+ /// automaton for our patterns in this case looks like this:
+ ///
+ /// ```ignore
+ /// a - S1 - b - S2 - c - S3 - d - S4*
+ /// / /
+ /// / ----------------
+ /// / /
+ /// S0 - c - S5 - e - S6 - f - S7*
+ /// ```
+ ///
+ /// That is, we have a failure transition from S3 to S5, which is followed
+ /// exactly in cases when we are in state S3 but see any byte other than
+ /// 'd' (that is, we've "failed" to find a match in this portion of our
+ /// trie). We know we can transition back to S5 because we've already seen
+ /// a 'c' byte, so we don't need to re-scan it. We can then pick back up
+ /// with the search starting at S5 and complete our match.
+ ///
+ /// Adding failure transitions to a trie is fairly simple, but subtle. The
+ /// key issue is that you might have multiple failure transitions that you
+ /// need to follow. For example, look at the trie for the patterns
+ /// 'abcd', 'b', 'bcd' and 'cd':
+ ///
+ /// ```ignore
+ /// - a - S1 - b - S2 - c - S3 - d - S4*
+ /// /
+ /// S0 - b - S5* - c - S6 - d - S7*
+ /// \
+ /// - c - S8 - d - S9*
+ /// ```
+ ///
+ /// The failure transitions for this trie are defined from S2 to S5,
+ /// S3 to S6 and S6 to S8. Moreover, state S2 needs to track that it
+ /// corresponds to a match, since its failure transition to S5 is itself
+ /// a match state.
+ ///
+ /// Perhaps the simplest way to think about adding these failure transitions
+ /// is recursively. That is, if you know the failure transitions for every
+ /// possible previous state that could be visited (e.g., when computing the
+ /// failure transition for S3, you already know the failure transitions
+ /// for S0, S1 and S2), then you can simply follow the failure transition
+ /// of the previous state and check whether the incoming transition is
+ /// defined after following the failure transition.
+ ///
+ /// For example, when determining the failure state for S3, by our
+ /// assumptions, we already know that there is a failure transition from
+ /// S2 (the previous state) to S5. So we follow that transition and check
+ /// whether the transition connecting S2 to S3 is defined. Indeed, it is,
+ /// as there is a transition from S5 to S6 for the byte 'c'. If no such
+ /// transition existed, we could keep following the failure transitions
+ /// until we reach the start state, which is the failure transition for
+ /// every state that has no corresponding proper suffix.
+ ///
+ /// We don't actually use recursion to implement this, but instead, use a
+ /// breadth first search of the automaton. Our base case is the start
+ /// state, whose failure transition is just a transition to itself.
+ fn fill_failure_transitions_standard(&mut self) {
+ // Initialize the queue for breadth first search with all transitions
+ // out of the start state. We handle the start state specially because
+ // we only want to follow non-self transitions. If we followed self
+ // transitions, then this would never terminate.
+ let mut queue = VecDeque::new();
+ let mut seen = self.queued_set();
+ for b in AllBytesIter::new() {
+ let next = self.nfa.start().next_state(b);
+ if next != self.nfa.start_id {
+ if !seen.contains(next) {
+ queue.push_back(next);
+ seen.insert(next);
+ }
+ }
+ }
+ while let Some(id) = queue.pop_front() {
+ let mut it = self.nfa.iter_transitions_mut(id);
+ while let Some((b, next)) = it.next() {
+ if !seen.contains(next) {
+ queue.push_back(next);
+ seen.insert(next);
+ }
+
+ let mut fail = it.nfa().state(id).fail;
+ while it.nfa().state(fail).next_state(b) == fail_id() {
+ fail = it.nfa().state(fail).fail;
+ }
+ fail = it.nfa().state(fail).next_state(b);
+ it.nfa().state_mut(next).fail = fail;
+ it.nfa().copy_matches(fail, next);
+ }
+ // If the start state is a match state, then this automaton can
+ // match the empty string. This implies all states are match states
+ // since every position matches the empty string, so copy the
+ // matches from the start state to every state. Strictly speaking,
+ // this is only necessary for overlapping matches since each
+ // non-empty non-start match state needs to report empty matches
+ // in addition to its own. For the non-overlapping case, such
+ // states only report the first match, which is never empty since
+ // it isn't a start state.
+ it.nfa().copy_empty_matches(id);
+ }
+ }
+
+ /// This routine is just like fill_failure_transitions_standard, except
+ /// it adds failure transitions in a way that preserves leftmost match
+ /// semantics (for both leftmost-first and leftmost-longest).
+ ///
+ /// The algorithms are so similar that it would be possible to write it
+ /// generically. But doing so without overhead would require a bit of
+ /// ceremony, so we just copy it and add in the extra leftmost logic.
+ /// Moreover, the standard algorithm above is so simple that it feels like
+ /// a crime to disturb it.
+ ///
+ /// In effect, this proceeds just like the standard approach, but we
+ /// specifically add only a subset of all failure transitions. Namely, we
+ /// only add failure transitions that either do not occur after a match
+ /// or failure transitions that do occur after a match but preserve the
+ /// match. The comments in the implementation below should help.
+ ///
+ /// N.B. The only differences in the automaton between leftmost-first and
+ /// leftmost-longest are in trie construction. Otherwise, both have exactly
+ /// the same set of failure transitions. leftmost-longest adds everything
+ /// to the trie, whereas leftmost-first skips any patterns for which there
+ /// exists a prefix of it that was added earlier.
+ ///
+ /// N.B. I came up with this algorithm on my own, and after scouring all of
+ /// the other AC implementations I know of (Perl, Snort, many on GitHub),
+ /// I couldn't find any that implement leftmost semantics like this.
+ /// Perl of course needs leftmost-first semantics, but they implement it
+ /// with a seeming hack at *search* time instead of encoding it into the
+ /// automaton. There are also a couple Java libraries that support leftmost
+ /// longest semantics, but they do it by building a queue of matches at
+ /// search time, which is even worse than what Perl is doing. ---AG
+ fn fill_failure_transitions_leftmost(&mut self) {
+ /// Represents an item in our queue of states to process.
+ ///
+ /// Fundamentally, this queue serves the same purpose as the queue
+ /// for filling failure transitions using the standard formulation.
+ /// In the leftmost case, though, we need to track a bit more
+ /// information. See comments below.
+ #[derive(Clone, Copy, Debug)]
+ struct QueuedState<S> {
+ /// The id of the state to visit.
+ id: S,
+ /// The depth at which the first match was observed in the path
+ /// to this state. Note that this corresponds to the depth at
+ /// which the beginning of the match was detected. If no match
+ /// has been seen, then this is None.
+ match_at_depth: Option<usize>,
+ }
+
+ impl<S: StateID> QueuedState<S> {
+ /// Create a queued state corresponding to the given NFA's start
+ /// state.
+ fn start(nfa: &NFA<S>) -> QueuedState<S> {
+ let match_at_depth =
+ if nfa.start().is_match() { Some(0) } else { None };
+ QueuedState { id: nfa.start_id, match_at_depth }
+ }
+
+ /// Return the next state to queue up. The given id must be a state
+ /// corresponding to a single transition from this queued state.
+ fn next_queued_state(
+ &self,
+ nfa: &NFA<S>,
+ id: S,
+ ) -> QueuedState<S> {
+ let match_at_depth = self.next_match_at_depth(nfa, id);
+ QueuedState { id, match_at_depth }
+ }
+
+ /// Return the earliest depth at which a match has occurred for
+ /// the given state. The given state must correspond to a single
+ /// transition from this queued state.
+ fn next_match_at_depth(
+ &self,
+ nfa: &NFA<S>,
+ next: S,
+ ) -> Option<usize> {
+ // This is a little tricky. If the previous state has already
+ // seen a match or if `next` isn't a match state, then nothing
+ // needs to change since a later state cannot find an earlier
+ // match.
+ match self.match_at_depth {
+ Some(x) => return Some(x),
+ None if nfa.state(next).is_match() => {}
+ None => return None,
+ }
+ let depth = nfa.state(next).depth
+ - nfa.state(next).get_longest_match_len().unwrap()
+ + 1;
+ Some(depth)
+ }
+ }
+
+ // Initialize the queue for breadth first search with all transitions
+ // out of the start state. We handle the start state specially because
+ // we only want to follow non-self transitions. If we followed self
+ // transitions, then this would never terminate.
+ let mut queue: VecDeque<QueuedState<S>> = VecDeque::new();
+ let mut seen = self.queued_set();
+ let start = QueuedState::start(&self.nfa);
+ for b in AllBytesIter::new() {
+ let next_id = self.nfa.start().next_state(b);
+ if next_id != start.id {
+ let next = start.next_queued_state(&self.nfa, next_id);
+ if !seen.contains(next.id) {
+ queue.push_back(next);
+ seen.insert(next.id);
+ }
+ // If a state immediately following the start state is a match
+ // state, then we never want to follow its failure transition
+ // since the failure transition necessarily leads back to the
+ // start state, which we never want to do for leftmost matching
+ // after a match has been found.
+ //
+ // N.B. This is a special case of the more general handling
+ // found below.
+ if self.nfa.state(next_id).is_match() {
+ self.nfa.state_mut(next_id).fail = dead_id();
+ }
+ }
+ }
+ while let Some(item) = queue.pop_front() {
+ let mut any_trans = false;
+ let mut it = self.nfa.iter_transitions_mut(item.id);
+ while let Some((b, next_id)) = it.next() {
+ any_trans = true;
+
+ // Queue up the next state.
+ let next = item.next_queued_state(it.nfa(), next_id);
+ if !seen.contains(next.id) {
+ queue.push_back(next);
+ seen.insert(next.id);
+ }
+
+ // Find the failure state for next. Same as standard.
+ let mut fail = it.nfa().state(item.id).fail;
+ while it.nfa().state(fail).next_state(b) == fail_id() {
+ fail = it.nfa().state(fail).fail;
+ }
+ fail = it.nfa().state(fail).next_state(b);
+
+ // This is the key difference from the standard formulation.
+ // Namely, if we've seen a match, then we only want a failure
+ // transition if the failure transition preserves the match
+ // we've seen. In general, this is not true of all failure
+ // transitions since they can point back to any suffix of what
+ // we've seen so far. Instead, we only want to point back to
+ // suffixes that contain any match we've seen.
+ //
+ // We achieve this by comparing the depth of the failure
+ // transition with the number of states between this state
+ // and the beginning of the earliest match detected. If the
+ // depth of the failure state is smaller than this difference,
+ // then it cannot contain the match. If it's bigger or equal
+ // to the difference, then it necessarily includes the match
+ // we've seen since all failure transitions correspond to a
+ // suffix.
+ //
+ // If we've determined that we don't want the failure
+ // transition, then we set this state's failure transition to
+ // the dead state. In other words, when a search hits this
+ // state, it will not continue and correctly stop. (N.B. A
+ // dead state is different than a fail state. A dead state
+ // MUST be preceded by a match and acts as a sentinel to search
+ // routines to terminate.)
+ //
+ // Understanding this is tricky, and it took me several days
+ // to think through this and get it right. If you want to grok
+ // it, then I'd recommend: 1) switch the implementation to
+ // always use the standard algorithm for filling in failure
+ // transitions, 2) run the test suite and 3) examine the test
+ // failures. Write out the automatons for them and try to work
+ // backwards by figuring out which failure transitions should
+ // be removed. You should arrive at the same rule used below.
+ if let Some(match_depth) = next.match_at_depth {
+ let fail_depth = it.nfa().state(fail).depth;
+ let next_depth = it.nfa().state(next.id).depth;
+ if next_depth - match_depth + 1 > fail_depth {
+ it.nfa().state_mut(next.id).fail = dead_id();
+ continue;
+ }
+ assert_ne!(
+ start.id,
+ it.nfa().state(next.id).fail,
+ "states that are match states or follow match \
+ states should never have a failure transition \
+ back to the start state in leftmost searching",
+ );
+ }
+ it.nfa().state_mut(next.id).fail = fail;
+ it.nfa().copy_matches(fail, next.id);
+ }
+ // If there are no transitions for this state and if it's a match
+ // state, then we must set its failure transition to the dead
+ // state since we never want it to restart the search.
+ if !any_trans && it.nfa().state(item.id).is_match() {
+ it.nfa().state_mut(item.id).fail = dead_id();
+ }
+ // We don't need to copy empty matches from the start state here
+ // because that's only necessary for overlapping matches and
+ // leftmost match kinds don't support overlapping matches.
+ }
+ }
+
+ /// Returns a set that tracked queued states.
+ ///
+ /// This is only necessary when ASCII case insensitivity is enabled, since
+ /// it is the only way to visit the same state twice. Otherwise, this
+ /// returns an inert set that never adds anything and always reports
+ /// `false` for every member test.
+ fn queued_set(&self) -> QueuedSet<S> {
+ if self.builder.ascii_case_insensitive {
+ QueuedSet::active()
+ } else {
+ QueuedSet::inert()
+ }
+ }
+
+ /// Set the failure transitions on the start state to loop back to the
+ /// start state. This effectively permits the Aho-Corasick automaton to
+ /// match at any position. This is also required for next-state lookups to
+ /// terminate; namely, finding the next state should never return a
+ /// fail_id.
+ ///
+ /// This must be done after building the initial trie, since trie
+ /// construction depends on transitions to `fail_id` to determine whether a
+ /// state already exists or not.
+ fn add_start_state_loop(&mut self) {
+ let start_id = self.nfa.start_id;
+ let start = self.nfa.start_mut();
+ for b in AllBytesIter::new() {
+ if start.next_state(b) == fail_id() {
+ start.set_next_state(b, start_id);
+ }
+ }
+ }
+
+ /// Remove the start state loop by rewriting any transitions on the start
+ /// state back to the start state with transitions to the dead state.
+ ///
+ /// The loop is only closed when two conditions are met: the start state
+ /// is a match state and the match kind is leftmost-first or
+ /// leftmost-longest. (Alternatively, if this is an anchored automaton,
+ /// then the start state is always closed, regardless of the aforementioned
+ /// conditions.)
+ ///
+ /// The reason for this is that under leftmost semantics, a start state
+ /// that is also a match implies that we should never restart the search
+ /// process. We allow normal transitions out of the start state, but if
+ /// none exist, we transition to the dead state, which signals that
+ /// searching should stop.
+ fn close_start_state_loop(&mut self) {
+ if self.builder.anchored
+ || (self.match_kind().is_leftmost() && self.nfa.start().is_match())
+ {
+ let start_id = self.nfa.start_id;
+ let start = self.nfa.start_mut();
+ for b in AllBytesIter::new() {
+ if start.next_state(b) == start_id {
+ start.set_next_state(b, dead_id());
+ }
+ }
+ }
+ }
+
+ /// Sets all transitions on the dead state to point back to the dead state.
+ /// Normally, missing transitions map back to the failure state, but the
+ /// point of the dead state is to act as a sink that can never be escaped.
+ fn add_dead_state_loop(&mut self) {
+ let dead = self.nfa.state_mut(dead_id());
+ for b in AllBytesIter::new() {
+ dead.set_next_state(b, dead_id());
+ }
+ }
+
+ /// Computes the total amount of heap used by this NFA in bytes.
+ fn calculate_size(&mut self) {
+ let mut size = 0;
+ for state in &self.nfa.states {
+ size += state.heap_bytes();
+ }
+ self.nfa.heap_bytes = size;
+ }
+
+ /// Add a new state to the underlying NFA with the given depth. The depth
+ /// is used to determine how to represent the transitions.
+ ///
+ /// If adding the new state would overflow the chosen state ID
+ /// representation, then this returns an error.
+ fn add_state(&mut self, depth: usize) -> Result<S> {
+ if depth < self.builder.dense_depth {
+ self.nfa.add_dense_state(depth)
+ } else {
+ self.nfa.add_sparse_state(depth)
+ }
+ }
+
+ /// Returns the match kind configured on the underlying builder.
+ fn match_kind(&self) -> MatchKind {
+ self.builder.match_kind
+ }
+}
+
+/// A set of state identifiers used to avoid revisiting the same state multiple
+/// times when filling in failure transitions.
+///
+/// This set has an "inert" and an "active" mode. When inert, the set never
+/// stores anything and always returns `false` for every member test. This is
+/// useful to avoid the performance and memory overhead of maintaining this
+/// set when it is not needed.
+#[derive(Debug)]
+struct QueuedSet<S> {
+ set: Option<BTreeSet<S>>,
+}
+
+impl<S: StateID> QueuedSet<S> {
+ /// Return an inert set that returns `false` for every state ID membership
+ /// test.
+ fn inert() -> QueuedSet<S> {
+ QueuedSet { set: None }
+ }
+
+ /// Return an active set that tracks state ID membership.
+ fn active() -> QueuedSet<S> {
+ QueuedSet { set: Some(BTreeSet::new()) }
+ }
+
+ /// Inserts the given state ID into this set. (If the set is inert, then
+ /// this is a no-op.)
+ fn insert(&mut self, state_id: S) {
+ if let Some(ref mut set) = self.set {
+ set.insert(state_id);
+ }
+ }
+
+ /// Returns true if and only if the given state ID is in this set. If the
+ /// set is inert, this always returns false.
+ fn contains(&self, state_id: S) -> bool {
+ match self.set {
+ None => false,
+ Some(ref set) => set.contains(&state_id),
+ }
+ }
+}
+
+/// An iterator over every byte value.
+///
+/// We use this instead of (0..256).map(|b| b as u8) because this optimizes
+/// better in debug builds.
+///
+/// We also use this instead of 0..=255 because we're targeting Rust 1.24 and
+/// inclusive range syntax was stabilized in Rust 1.26. We can get rid of this
+/// once our MSRV is Rust 1.26 or newer.
+#[derive(Debug)]
+struct AllBytesIter(u16);
+
+impl AllBytesIter {
+ fn new() -> AllBytesIter {
+ AllBytesIter(0)
+ }
+}
+
+impl Iterator for AllBytesIter {
+ type Item = u8;
+
+ fn next(&mut self) -> Option<Self::Item> {
+ if self.0 >= 256 {
+ None
+ } else {
+ let b = self.0 as u8;
+ self.0 += 1;
+ Some(b)
+ }
+ }
+}
+
+impl<S: StateID> fmt::Debug for NFA<S> {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ writeln!(f, "NFA(")?;
+ writeln!(f, "match_kind: {:?}", self.match_kind)?;
+ writeln!(f, "prefilter: {:?}", self.prefilter)?;
+ writeln!(f, "{}", "-".repeat(79))?;
+ for (id, s) in self.states.iter().enumerate() {
+ let mut trans = vec![];
+ s.trans.iter(|byte, next| {
+ // The start state has a bunch of uninteresting transitions
+ // back into itself. It's questionable to hide them since they
+ // are critical to understanding the automaton, but they are
+ // very noisy without better formatting for contiguous ranges
+ // to the same state.
+ if id == self.start_id.to_usize() && next == self.start_id {
+ return;
+ }
+ // Similarly, the dead state has a bunch of uninteresting
+ // transitions too.
+ if id == dead_id() {
+ return;
+ }
+ trans.push(format!("{} => {}", escape(byte), next.to_usize()));
+ });
+ writeln!(f, "{:04}: {}", id, trans.join(", "))?;
+
+ let matches: Vec<String> = s
+ .matches
+ .iter()
+ .map(|&(pattern_id, _)| pattern_id.to_string())
+ .collect();
+ writeln!(f, " matches: {}", matches.join(", "))?;
+ writeln!(f, " fail: {}", s.fail.to_usize())?;
+ writeln!(f, " depth: {}", s.depth)?;
+ }
+ writeln!(f, "{}", "-".repeat(79))?;
+ writeln!(f, ")")?;
+ Ok(())
+ }
+}
+
+/// Iterate over all possible byte transitions given a sparse set.
+fn sparse_iter<S: StateID, F: FnMut(u8, S)>(trans: &[(u8, S)], mut f: F) {
+ let mut byte = 0u16;
+ for &(b, id) in trans {
+ while byte < (b as u16) {
+ f(byte as u8, fail_id());
+ byte += 1;
+ }
+ f(b, id);
+ byte += 1;
+ }
+ for b in byte..256 {
+ f(b as u8, fail_id());
+ }
+}
+
+/// Safely return two mutable borrows to two different locations in the given
+/// slice.
+///
+/// This panics if i == j.
+fn get_two_mut<T>(xs: &mut [T], i: usize, j: usize) -> (&mut T, &mut T) {
+ assert!(i != j, "{} must not be equal to {}", i, j);
+ if i < j {
+ let (before, after) = xs.split_at_mut(j);
+ (&mut before[i], &mut after[0])
+ } else {
+ let (before, after) = xs.split_at_mut(i);
+ (&mut after[0], &mut before[j])
+ }
+}
+
+/// Return the given byte as its escaped string form.
+fn escape(b: u8) -> String {
+ use std::ascii;
+
+ String::from_utf8(ascii::escape_default(b).collect::<Vec<_>>()).unwrap()
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn scratch() {
+ let nfa: NFA<usize> = Builder::new()
+ .dense_depth(0)
+ // .match_kind(MatchKind::LeftmostShortest)
+ // .match_kind(MatchKind::LeftmostLongest)
+ .match_kind(MatchKind::LeftmostFirst)
+ // .build(&["abcd", "ce", "b"])
+ // .build(&["ab", "bc"])
+ // .build(&["b", "bcd", "ce"])
+ // .build(&["abc", "bx"])
+ // .build(&["abc", "bd", "ab"])
+ // .build(&["abcdefghi", "hz", "abcdefgh"])
+ // .build(&["abcd", "bce", "b"])
+ .build(&["abcdefg", "bcde", "bcdef"])
+ .unwrap();
+ println!("{:?}", nfa);
+ }
+}
diff --git a/src/packed/api.rs b/src/packed/api.rs
new file mode 100644
index 0000000..3a316b5
--- /dev/null
+++ b/src/packed/api.rs
@@ -0,0 +1,632 @@
+use std::u16;
+
+use packed::pattern::Patterns;
+use packed::rabinkarp::RabinKarp;
+use packed::teddy::{self, Teddy};
+use Match;
+
+/// This is a limit placed on the total number of patterns we're willing to try
+/// and match at once. As more sophisticated algorithms are added, this number
+/// may be increased.
+const PATTERN_LIMIT: usize = 128;
+
+/// A knob for controlling the match semantics of a packed multiple string
+/// searcher.
+///
+/// This differs from the
+/// [`MatchKind`](../enum.MatchKind.html)
+/// type in the top-level crate module in that it doesn't support
+/// "standard" match semantics, and instead only supports leftmost-first or
+/// leftmost-longest. Namely, "standard" semantics cannot be easily supported
+/// by packed searchers.
+///
+/// For more information on the distinction between leftmost-first and
+/// leftmost-longest, see the docs on the top-level `MatchKind` type.
+///
+/// Unlike the top-level `MatchKind` type, the default match semantics for this
+/// type are leftmost-first.
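+///
+/// # Example
+///
+/// A minimal illustration of the default (the `Default` impl below is what
+/// provides `MatchKind::default()`):
+///
+/// ```
+/// use aho_corasick::packed::MatchKind;
+///
+/// assert_eq!(MatchKind::LeftmostFirst, MatchKind::default());
+/// ```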
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum MatchKind {
+ /// Use leftmost-first match semantics, which reports leftmost matches.
+ /// When there are multiple possible leftmost matches, the match
+ /// corresponding to the pattern that appeared earlier when constructing
+ /// the automaton is reported.
+ ///
+ /// This is the default.
+ LeftmostFirst,
+ /// Use leftmost-longest match semantics, which reports leftmost matches.
+ /// When there are multiple possible leftmost matches, the longest match
+ /// is chosen.
+ LeftmostLongest,
+ /// Hints that destructuring should not be exhaustive.
+ ///
+ /// This enum may grow additional variants, so this makes sure clients
+ /// don't count on exhaustive matching. (Otherwise, adding a new variant
+ /// could break existing code.)
+ #[doc(hidden)]
+ __Nonexhaustive,
+}
+
+impl Default for MatchKind {
+ fn default() -> MatchKind {
+ MatchKind::LeftmostFirst
+ }
+}
+
+/// The configuration for a packed multiple pattern searcher.
+///
+/// The configuration is currently limited only to being able to select the
+/// match semantics (leftmost-first or leftmost-longest) of a searcher. In the
+/// future, more knobs may be made available.
+///
+/// A configuration produces a [`packed::Builder`](struct.Builder.html), which
+/// in turn can be used to construct a
+/// [`packed::Searcher`](struct.Searcher.html) for searching.
+///
+/// # Example
+///
+/// This example shows how to use leftmost-longest semantics instead of the
+/// default (leftmost-first).
+///
+/// ```
+/// use aho_corasick::packed::{Config, MatchKind};
+///
+/// # fn example() -> Option<()> {
+/// let searcher = Config::new()
+/// .match_kind(MatchKind::LeftmostLongest)
+/// .builder()
+/// .add("foo")
+/// .add("foobar")
+/// .build()?;
+/// let matches: Vec<usize> = searcher
+/// .find_iter("foobar")
+/// .map(|mat| mat.pattern())
+/// .collect();
+/// assert_eq!(vec![1], matches);
+/// # Some(()) }
+/// # if cfg!(target_arch = "x86_64") {
+/// # example().unwrap()
+/// # } else {
+/// # assert!(example().is_none());
+/// # }
+/// ```
+#[derive(Clone, Debug)]
+pub struct Config {
+ kind: MatchKind,
+ force: Option<ForceAlgorithm>,
+ force_teddy_fat: Option<bool>,
+ force_avx: Option<bool>,
+}
+
+/// An internal option for forcing the use of a particular packed algorithm.
+///
+/// When an algorithm is forced, if a searcher could not be constructed for it,
+/// then no searcher will be returned even if an alternative algorithm would
+/// work.
+#[derive(Clone, Debug)]
+enum ForceAlgorithm {
+ Teddy,
+ RabinKarp,
+}
+
+impl Default for Config {
+ fn default() -> Config {
+ Config::new()
+ }
+}
+
+impl Config {
+ /// Create a new default configuration. A default configuration uses
+ /// leftmost-first match semantics.
+ pub fn new() -> Config {
+ Config {
+ kind: MatchKind::LeftmostFirst,
+ force: None,
+ force_teddy_fat: None,
+ force_avx: None,
+ }
+ }
+
+ /// Create a packed builder from this configuration. The builder can be
+ /// used to accumulate patterns and create a
+ /// [`Searcher`](struct.Searcher.html)
+ /// from them.
+ pub fn builder(&self) -> Builder {
+ Builder::from_config(self.clone())
+ }
+
+ /// Set the match semantics for this configuration.
+ pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config {
+ self.kind = kind;
+ self
+ }
+
+ /// An undocumented method for forcing the use of the Teddy algorithm.
+ ///
+ /// This is only exposed for more precise testing and benchmarks. Callers
+ /// should not use it as it is not part of the API stability guarantees of
+ /// this crate.
+ #[doc(hidden)]
+ pub fn force_teddy(&mut self, yes: bool) -> &mut Config {
+ if yes {
+ self.force = Some(ForceAlgorithm::Teddy);
+ } else {
+ self.force = None;
+ }
+ self
+ }
+
+ /// An undocumented method for forcing the use of the Fat Teddy algorithm.
+ ///
+ /// This is only exposed for more precise testing and benchmarks. Callers
+ /// should not use it as it is not part of the API stability guarantees of
+ /// this crate.
+ #[doc(hidden)]
+ pub fn force_teddy_fat(&mut self, yes: Option<bool>) -> &mut Config {
+ self.force_teddy_fat = yes;
+ self
+ }
+
+ /// An undocumented method for forcing the use of SSE (`Some(false)`) or
+ /// AVX (`Some(true)`) algorithms.
+ ///
+ /// This is only exposed for more precise testing and benchmarks. Callers
+ /// should not use it as it is not part of the API stability guarantees of
+ /// this crate.
+ #[doc(hidden)]
+ pub fn force_avx(&mut self, yes: Option<bool>) -> &mut Config {
+ self.force_avx = yes;
+ self
+ }
+
+ /// An undocumented method for forcing the use of the Rabin-Karp algorithm.
+ ///
+ /// This is only exposed for more precise testing and benchmarks. Callers
+ /// should not use it as it is not part of the API stability guarantees of
+ /// this crate.
+ #[doc(hidden)]
+ pub fn force_rabin_karp(&mut self, yes: bool) -> &mut Config {
+ if yes {
+ self.force = Some(ForceAlgorithm::RabinKarp);
+ } else {
+ self.force = None;
+ }
+ self
+ }
+}
+
+/// A builder for constructing a packed searcher from a collection of patterns.
+///
+/// # Example
+///
+/// This example shows how to use a builder to construct a searcher. By
+/// default, leftmost-first match semantics are used.
+///
+/// ```
+/// use aho_corasick::packed::{Builder, MatchKind};
+///
+/// # fn example() -> Option<()> {
+/// let searcher = Builder::new()
+/// .add("foobar")
+/// .add("foo")
+/// .build()?;
+/// let matches: Vec<usize> = searcher
+/// .find_iter("foobar")
+/// .map(|mat| mat.pattern())
+/// .collect();
+/// assert_eq!(vec![0], matches);
+/// # Some(()) }
+/// # if cfg!(target_arch = "x86_64") {
+/// # example().unwrap()
+/// # } else {
+/// # assert!(example().is_none());
+/// # }
+/// ```
+#[derive(Clone, Debug)]
+pub struct Builder {
+ /// The configuration of this builder and subsequent matcher.
+ config: Config,
+ /// Set to true if the builder detects that a matcher cannot be built.
+ inert: bool,
+ /// The patterns provided by the caller.
+ patterns: Patterns,
+}
+
+impl Builder {
+ /// Create a new builder for constructing a multi-pattern searcher. This
+ /// constructor uses the default configuration.
+ pub fn new() -> Builder {
+ Builder::from_config(Config::new())
+ }
+
+ fn from_config(config: Config) -> Builder {
+ Builder { config, inert: false, patterns: Patterns::new() }
+ }
+
+ /// Build a searcher from the patterns added to this builder so far.
+ pub fn build(&self) -> Option<Searcher> {
+ if self.inert || self.patterns.is_empty() {
+ return None;
+ }
+ let mut patterns = self.patterns.clone();
+ patterns.set_match_kind(self.config.kind);
+ let rabinkarp = RabinKarp::new(&patterns);
+ // Effectively, we only want to return a searcher if we can use Teddy,
+ // since Teddy is our only fast packed searcher at the moment.
+ // Rabin-Karp is only used when searching haystacks smaller than what
+ // Teddy can support. Thus, the only way to get a Rabin-Karp searcher
+ // is to force it using undocumented APIs (for tests/benchmarks).
+ let (search_kind, minimum_len) = match self.config.force {
+ None | Some(ForceAlgorithm::Teddy) => {
+ let teddy = match self.build_teddy(&patterns) {
+ None => return None,
+ Some(teddy) => teddy,
+ };
+ let minimum_len = teddy.minimum_len();
+ (SearchKind::Teddy(teddy), minimum_len)
+ }
+ Some(ForceAlgorithm::RabinKarp) => (SearchKind::RabinKarp, 0),
+ };
+ Some(Searcher {
+ config: self.config.clone(),
+ patterns,
+ rabinkarp,
+ search_kind,
+ minimum_len,
+ })
+ }
+
+ fn build_teddy(&self, patterns: &Patterns) -> Option<Teddy> {
+ teddy::Builder::new()
+ .avx(self.config.force_avx)
+ .fat(self.config.force_teddy_fat)
+ .build(&patterns)
+ }
+
+ /// Add the given pattern to this set to match.
+ ///
+ /// The order in which patterns are added is significant. Namely, when
+ /// using leftmost-first match semantics, then when multiple patterns can
+ /// match at a particular location, the pattern that was added first is
+ /// used as the match.
+ ///
+ /// If the number of patterns added exceeds the amount supported by packed
+ /// searchers, then the builder will stop accumulating patterns and render
+ /// itself inert. At this point, constructing a searcher will always return
+ /// `None`.
+ pub fn add<P: AsRef<[u8]>>(&mut self, pattern: P) -> &mut Builder {
+ if self.inert {
+ return self;
+ } else if self.patterns.len() >= PATTERN_LIMIT {
+ self.inert = true;
+ self.patterns.reset();
+ return self;
+ }
+ // Just in case PATTERN_LIMIT increases beyond u16::MAX.
+ assert!(self.patterns.len() <= u16::MAX as usize);
+
+ let pattern = pattern.as_ref();
+ if pattern.is_empty() {
+ self.inert = true;
+ self.patterns.reset();
+ return self;
+ }
+ self.patterns.add(pattern);
+ self
+ }
+
+ /// Add the given iterator of patterns to this set to match.
+ ///
+ /// The iterator must yield elements that can be converted into a `&[u8]`.
+ ///
+ /// The order in which patterns are added is significant. Namely, when
+ /// using leftmost-first match semantics, then when multiple patterns can
+ /// match at a particular location, the pattern that was added first is
+ /// used as the match.
+ ///
+ /// If the number of patterns added exceeds the amount supported by packed
+ /// searchers, then the builder will stop accumulating patterns and render
+ /// itself inert. At this point, constructing a searcher will always return
+ /// `None`.
+ pub fn extend<I, P>(&mut self, patterns: I) -> &mut Builder
+ where
+ I: IntoIterator<Item = P>,
+ P: AsRef<[u8]>,
+ {
+ for p in patterns {
+ self.add(p);
+ }
+ self
+ }
+}
+
+impl Default for Builder {
+ fn default() -> Builder {
+ Builder::new()
+ }
+}
+
+/// A packed searcher for quickly finding occurrences of multiple patterns.
+///
+/// If callers need more flexible construction, or if one wants to change the
+/// match semantics (either leftmost-first or leftmost-longest), then one can
+/// use the [`Config`](struct.Config.html) and/or
+/// [`Builder`](struct.Builder.html) types for more fine grained control.
+///
+/// # Example
+///
+/// This example shows how to create a searcher from an iterator of patterns.
+/// By default, leftmost-first match semantics are used.
+///
+/// ```
+/// use aho_corasick::packed::{MatchKind, Searcher};
+///
+/// # fn example() -> Option<()> {
+/// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
+/// let matches: Vec<usize> = searcher
+/// .find_iter("foobar")
+/// .map(|mat| mat.pattern())
+/// .collect();
+/// assert_eq!(vec![0], matches);
+/// # Some(()) }
+/// # if cfg!(target_arch = "x86_64") {
+/// # example().unwrap()
+/// # } else {
+/// # assert!(example().is_none());
+/// # }
+/// ```
+#[derive(Clone, Debug)]
+pub struct Searcher {
+ config: Config,
+ patterns: Patterns,
+ rabinkarp: RabinKarp,
+ search_kind: SearchKind,
+ minimum_len: usize,
+}
+
+#[derive(Clone, Debug)]
+enum SearchKind {
+ Teddy(Teddy),
+ RabinKarp,
+}
+
+impl Searcher {
+ /// A convenience function for constructing a searcher from an iterator
+ /// of things that can be converted to a `&[u8]`.
+ ///
+ /// If a searcher could not be constructed (either because of an
+ /// unsupported CPU or because there are too many patterns), then `None`
+ /// is returned.
+ ///
+ /// # Example
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::packed::{MatchKind, Searcher};
+ ///
+ /// # fn example() -> Option<()> {
+ /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
+ /// let matches: Vec<usize> = searcher
+ /// .find_iter("foobar")
+ /// .map(|mat| mat.pattern())
+ /// .collect();
+ /// assert_eq!(vec![0], matches);
+ /// # Some(()) }
+ /// # if cfg!(target_arch = "x86_64") {
+ /// # example().unwrap()
+ /// # } else {
+ /// # assert!(example().is_none());
+ /// # }
+ /// ```
+ pub fn new<I, P>(patterns: I) -> Option<Searcher>
+ where
+ I: IntoIterator<Item = P>,
+ P: AsRef<[u8]>,
+ {
+ Builder::new().extend(patterns).build()
+ }
+
+ /// Return the first occurrence of any of the patterns in this searcher,
+ /// according to its match semantics, in the given haystack. The `Match`
+ /// returned will include the identifier of the pattern that matched, which
+ /// corresponds to the index of the pattern (starting from `0`) in which it
+ /// was added.
+ ///
+ /// # Example
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::packed::{MatchKind, Searcher};
+ ///
+ /// # fn example() -> Option<()> {
+ /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
+ /// let mat = searcher.find("foobar")?;
+ /// assert_eq!(0, mat.pattern());
+ /// assert_eq!(0, mat.start());
+ /// assert_eq!(6, mat.end());
+ /// # Some(()) }
+ /// # if cfg!(target_arch = "x86_64") {
+ /// # example().unwrap()
+ /// # } else {
+ /// # assert!(example().is_none());
+ /// # }
+ /// ```
+ pub fn find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match> {
+ self.find_at(haystack, 0)
+ }
+
+ /// Return the first occurrence of any of the patterns in this searcher,
+ /// according to its match semantics, in the given haystack starting from
+ /// the given position.
+ ///
+ /// The `Match` returned will include the identifier of the pattern that
+ /// matched, which corresponds to the index of the pattern (starting from
+ /// `0`) in which it was added. The offsets in the `Match` will be relative
+ /// to the start of `haystack` (and not `at`).
+ ///
+ /// # Example
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::packed::{MatchKind, Searcher};
+ ///
+ /// # fn example() -> Option<()> {
+ /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
+ /// let mat = searcher.find_at("foofoobar", 3)?;
+ /// assert_eq!(0, mat.pattern());
+ /// assert_eq!(3, mat.start());
+ /// assert_eq!(9, mat.end());
+ /// # Some(()) }
+ /// # if cfg!(target_arch = "x86_64") {
+ /// # example().unwrap()
+ /// # } else {
+ /// # assert!(example().is_none());
+ /// # }
+ /// ```
+ pub fn find_at<B: AsRef<[u8]>>(
+ &self,
+ haystack: B,
+ at: usize,
+ ) -> Option<Match> {
+ let haystack = haystack.as_ref();
+ match self.search_kind {
+ SearchKind::Teddy(ref teddy) => {
+ if haystack[at..].len() < teddy.minimum_len() {
+ return self.slow_at(haystack, at);
+ }
+ teddy.find_at(&self.patterns, haystack, at)
+ }
+ SearchKind::RabinKarp => {
+ self.rabinkarp.find_at(&self.patterns, haystack, at)
+ }
+ }
+ }
+
+ /// Return an iterator of non-overlapping occurrences of the patterns in
+ /// this searcher, according to its match semantics, in the given haystack.
+ ///
+ /// # Example
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::packed::{MatchKind, Searcher};
+ ///
+ /// # fn example() -> Option<()> {
+ /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
+ /// let matches: Vec<usize> = searcher
+ /// .find_iter("foobar fooba foofoo")
+ /// .map(|mat| mat.pattern())
+ /// .collect();
+ /// assert_eq!(vec![0, 1, 1, 1], matches);
+ /// # Some(()) }
+ /// # if cfg!(target_arch = "x86_64") {
+ /// # example().unwrap()
+ /// # } else {
+ /// # assert!(example().is_none());
+ /// # }
+ /// ```
+ pub fn find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>(
+ &'a self,
+ haystack: &'b B,
+ ) -> FindIter<'a, 'b> {
+ FindIter { searcher: self, haystack: haystack.as_ref(), at: 0 }
+ }
+
+ /// Returns the match kind used by this packed searcher.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::packed::{MatchKind, Searcher};
+ ///
+ /// # fn example() -> Option<()> {
+ /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
+ /// // leftmost-first is the default.
+ /// assert_eq!(&MatchKind::LeftmostFirst, searcher.match_kind());
+ /// # Some(()) }
+ /// # if cfg!(target_arch = "x86_64") {
+ /// # example().unwrap()
+ /// # } else {
+ /// # assert!(example().is_none());
+ /// # }
+ /// ```
+ pub fn match_kind(&self) -> &MatchKind {
+ self.patterns.match_kind()
+ }
+
+ /// Returns the minimum length of a haystack that is required in order for
+ /// packed searching to be effective.
+ ///
+ /// In some cases, the underlying packed searcher may not be able to search
+ /// very short haystacks. When that occurs, the implementation will defer
+ /// to a slower non-packed searcher (which is still generally faster than
+ /// Aho-Corasick for a small number of patterns). However, callers may
+ /// want to avoid ever using the slower variant, which one can do by
+ /// never passing a haystack shorter than the minimum length returned by
+ /// this method.
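+    ///
+    /// # Example
+    ///
+    /// A rough sketch of how a caller might ensure the packed searcher is
+    /// always used (the haystack here is just an illustrative stand-in):
+    ///
+    /// ```
+    /// use aho_corasick::packed::Searcher;
+    ///
+    /// # fn example() -> Option<()> {
+    /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
+    /// let haystack = "a haystack that is comfortably longer than the minimum";
+    /// if haystack.len() >= searcher.minimum_len() {
+    ///     // Long enough, so the packed (vectorized) searcher is used here.
+    ///     assert!(searcher.find(haystack).is_none());
+    /// }
+    /// # Some(()) }
+    /// # if cfg!(target_arch = "x86_64") {
+    /// # example().unwrap()
+    /// # } else {
+    /// # assert!(example().is_none());
+    /// # }
+    /// ```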
+ pub fn minimum_len(&self) -> usize {
+ self.minimum_len
+ }
+
+ /// Returns the approximate total amount of heap used by this searcher, in
+ /// units of bytes.
+ pub fn heap_bytes(&self) -> usize {
+ self.patterns.heap_bytes()
+ + self.rabinkarp.heap_bytes()
+ + self.search_kind.heap_bytes()
+ }
+
+ /// Use a slow (non-packed) searcher.
+ ///
+ /// This is useful when a packed searcher could be constructed, but could
+ /// not be used to search a specific haystack. For example, if Teddy was
+ /// built but the haystack is smaller than ~34 bytes, then Teddy might not
+ /// be able to run.
+ fn slow_at(&self, haystack: &[u8], at: usize) -> Option<Match> {
+ self.rabinkarp.find_at(&self.patterns, haystack, at)
+ }
+}
+
+impl SearchKind {
+ fn heap_bytes(&self) -> usize {
+ match *self {
+ SearchKind::Teddy(ref ted) => ted.heap_bytes(),
+ SearchKind::RabinKarp => 0,
+ }
+ }
+}
+
+/// An iterator over non-overlapping matches from a packed searcher.
+///
+/// The lifetime `'s` refers to the lifetime of the underlying
+/// [`Searcher`](struct.Searcher.html), while the lifetime `'h` refers to the
+/// lifetime of the haystack being searched.
+#[derive(Debug)]
+pub struct FindIter<'s, 'h> {
+ searcher: &'s Searcher,
+ haystack: &'h [u8],
+ at: usize,
+}
+
+impl<'s, 'h> Iterator for FindIter<'s, 'h> {
+ type Item = Match;
+
+ fn next(&mut self) -> Option<Match> {
+ if self.at > self.haystack.len() {
+ return None;
+ }
+ match self.searcher.find_at(&self.haystack, self.at) {
+ None => None,
+ Some(c) => {
+ self.at = c.end;
+ Some(c)
+ }
+ }
+ }
+}
diff --git a/src/packed/mod.rs b/src/packed/mod.rs
new file mode 100644
index 0000000..5a3aa2e
--- /dev/null
+++ b/src/packed/mod.rs
@@ -0,0 +1,117 @@
+/*!
+A lower level API for packed multiple substring search, principally for a small
+number of patterns.
+
+This sub-module provides vectorized routines for quickly finding matches of a
+small number of patterns. In general, users of this crate shouldn't need to
+interface with this module directly, as the primary
+[`AhoCorasick`](../struct.AhoCorasick.html)
+searcher will use these routines automatically as a prefilter when applicable.
+However, in some cases, callers may want to bypass the Aho-Corasick machinery
+entirely and use this vectorized searcher directly.
+
+# Overview
+
+The primary types in this sub-module are:
+
+* [`Searcher`](struct.Searcher.html) executes the actual search algorithm to
+ report matches in a haystack.
+* [`Builder`](struct.Builder.html) accumulates patterns incrementally and can
+ construct a `Searcher`.
+* [`Config`](struct.Config.html) permits tuning the searcher, and itself will
+ produce a `Builder` (which can then be used to build a `Searcher`).
+  Currently, the only tuneable knob is the match semantics, but this may be
+ expanded in the future.
+
+# Examples
+
+This example shows how to create a searcher from an iterator of patterns.
+By default, leftmost-first match semantics are used. (See the top-level
+[`MatchKind`](../enum.MatchKind.html) type for more details about match
+semantics, which apply similarly to packed substring search.)
+
+```
+use aho_corasick::packed::{MatchKind, Searcher};
+
+# fn example() -> Option<()> {
+let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
+let matches: Vec<usize> = searcher
+ .find_iter("foobar")
+ .map(|mat| mat.pattern())
+ .collect();
+assert_eq!(vec![0], matches);
+# Some(()) }
+# if cfg!(target_arch = "x86_64") {
+# example().unwrap()
+# } else {
+# assert!(example().is_none());
+# }
+```
+
+This example shows how to use [`Config`](struct.Config.html) to change the
+match semantics to leftmost-longest:
+
+```
+use aho_corasick::packed::{Config, MatchKind};
+
+# fn example() -> Option<()> {
+let searcher = Config::new()
+ .match_kind(MatchKind::LeftmostLongest)
+ .builder()
+ .add("foo")
+ .add("foobar")
+ .build()?;
+let matches: Vec<usize> = searcher
+ .find_iter("foobar")
+ .map(|mat| mat.pattern())
+ .collect();
+assert_eq!(vec![1], matches);
+# Some(()) }
+# if cfg!(target_arch = "x86_64") {
+# example().unwrap()
+# } else {
+# assert!(example().is_none());
+# }
+```
+
+# Packed substring searching
+
+Packed substring searching refers to the use of SIMD (Single Instruction,
+Multiple Data) to accelerate the detection of matches in a haystack. Unlike
+conventional algorithms, such as Aho-Corasick, SIMD algorithms for substring
+search tend to do better with a small number of patterns, whereas Aho-Corasick
+generally maintains reasonably consistent performance regardless of the number
+of patterns you give it. Because of this, the vectorized searcher in this
+sub-module cannot be used as a general purpose searcher, since building the
+searcher may fail. However, in exchange, when searching for a small number of
+patterns, searching can be quite a bit faster than Aho-Corasick (sometimes by
+an order of magnitude).
+
+The key takeaway here is that constructing a searcher from a list of patterns
+is a fallible operation. While the precise conditions under which building a
+searcher can fail are an implementation detail, here are some common reasons
+(a short illustration follows the list):
+
+* Too many patterns were given. Typically, the limit is on the order of 100 or
+ so, but this limit may fluctuate based on available CPU features.
+* The available packed algorithms require CPU features that aren't available.
+ For example, currently, this crate only provides packed algorithms for
+ `x86_64`. Therefore, constructing a packed searcher on any other target
+ (e.g., ARM) will always fail.
+* Zero patterns were given, or one of the patterns given was empty. Packed
+ searchers require at least one pattern and that all patterns are non-empty.
+* Something else about the nature of the patterns (typically based on
+ heuristics) suggests that a packed searcher would perform very poorly, so
+ no searcher is built.
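+
+As a short illustration of this fallibility, the two conditions above that
+don't depend on CPU support (zero patterns, or an empty pattern) can be
+checked directly:
+
+```
+use aho_corasick::packed::Builder;
+
+// No patterns at all: no searcher.
+assert!(Builder::new().build().is_none());
+// An empty pattern: no searcher.
+assert!(Builder::new().add("").build().is_none());
+```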
+*/
+
+pub use packed::api::{Builder, Config, FindIter, MatchKind, Searcher};
+
+mod api;
+mod pattern;
+mod rabinkarp;
+mod teddy;
+#[cfg(test)]
+mod tests;
+#[cfg(target_arch = "x86_64")]
+mod vector;
diff --git a/src/packed/pattern.rs b/src/packed/pattern.rs
new file mode 100644
index 0000000..dfb07e9
--- /dev/null
+++ b/src/packed/pattern.rs
@@ -0,0 +1,318 @@
+use std::cmp;
+use std::fmt;
+use std::mem;
+use std::u16;
+use std::usize;
+
+use packed::api::MatchKind;
+
+/// The type used for representing a pattern identifier.
+///
+/// We don't use `usize` here because our packed searchers don't scale to
+/// huge numbers of patterns, so we keep things a bit smaller.
+pub type PatternID = u16;
+
+/// A non-empty collection of non-empty patterns to search for.
+///
+/// This collection of patterns is what is passed around to both execute
+/// searches and to construct the searchers themselves. Namely, this permits
+/// searches to avoid copying all of the patterns, and allows us to keep only
+/// one copy throughout all packed searchers.
+///
+/// Note that this collection is not a set. The same pattern can appear more
+/// than once.
+#[derive(Clone, Debug)]
+pub struct Patterns {
+ /// The match semantics supported by this collection of patterns.
+ ///
+ /// The match semantics determines the order of the iterator over patterns.
+ /// For leftmost-first, patterns are provided in the same order as were
+ /// provided by the caller. For leftmost-longest, patterns are provided in
+ /// descending order of length, with ties broken by the order in which they
+ /// were provided by the caller.
+ kind: MatchKind,
+ /// The collection of patterns, indexed by their identifier.
+ by_id: Vec<Vec<u8>>,
+ /// The order of patterns defined for iteration, given by pattern
+ /// identifiers. The order of `by_id` and `order` is always the same for
+ /// leftmost-first semantics, but may be different for leftmost-longest
+ /// semantics.
+ order: Vec<PatternID>,
+ /// The length of the smallest pattern, in bytes.
+ minimum_len: usize,
+ /// The largest pattern identifier. This should always be equivalent to
+ /// the number of patterns minus one in this collection.
+ max_pattern_id: PatternID,
+ /// The total number of pattern bytes across the entire collection. This
+ /// is used for reporting total heap usage in constant time.
+ total_pattern_bytes: usize,
+}
+
+impl Patterns {
+    /// Create a new collection of patterns, to be filled via `add`. The ID
+    /// of each pattern is the index in the `by_id` slice at which it is
+    /// added.
+    ///
+    /// Note that `add` panics if given an empty pattern, so every pattern in
+    /// this collection is non-empty.
+ pub fn new() -> Patterns {
+ Patterns {
+ kind: MatchKind::default(),
+ by_id: vec![],
+ order: vec![],
+ minimum_len: usize::MAX,
+ max_pattern_id: 0,
+ total_pattern_bytes: 0,
+ }
+ }
+
+ /// Add a pattern to this collection.
+ ///
+ /// This panics if the pattern given is empty.
+ pub fn add(&mut self, bytes: &[u8]) {
+ assert!(!bytes.is_empty());
+ assert!(self.by_id.len() <= u16::MAX as usize);
+
+ let id = self.by_id.len() as u16;
+ self.max_pattern_id = id;
+ self.order.push(id);
+ self.by_id.push(bytes.to_vec());
+ self.minimum_len = cmp::min(self.minimum_len, bytes.len());
+ self.total_pattern_bytes += bytes.len();
+ }
+
+ /// Set the match kind semantics for this collection of patterns.
+ ///
+ /// If the kind is not set, then the default is leftmost-first.
+ pub fn set_match_kind(&mut self, kind: MatchKind) {
+ match kind {
+ MatchKind::LeftmostFirst => {
+ self.order.sort();
+ }
+ MatchKind::LeftmostLongest => {
+ let (order, by_id) = (&mut self.order, &mut self.by_id);
+ order.sort_by(|&id1, &id2| {
+ by_id[id1 as usize]
+ .len()
+ .cmp(&by_id[id2 as usize].len())
+ .reverse()
+ });
+ }
+ MatchKind::__Nonexhaustive => unreachable!(),
+ }
+ }
+
+ /// Return the number of patterns in this collection.
+ ///
+ /// This is guaranteed to be greater than zero.
+ pub fn len(&self) -> usize {
+ self.by_id.len()
+ }
+
+ /// Returns true if and only if this collection of patterns is empty.
+ pub fn is_empty(&self) -> bool {
+ self.len() == 0
+ }
+
+ /// Returns the approximate total amount of heap used by these patterns, in
+ /// units of bytes.
+ pub fn heap_bytes(&self) -> usize {
+ self.order.len() * mem::size_of::<PatternID>()
+ + self.by_id.len() * mem::size_of::<Vec<u8>>()
+ + self.total_pattern_bytes
+ }
+
+ /// Clears all heap memory associated with this collection of patterns and
+ /// resets all state such that it is a valid empty collection.
+ pub fn reset(&mut self) {
+ self.kind = MatchKind::default();
+ self.by_id.clear();
+ self.order.clear();
+ self.minimum_len = usize::MAX;
+ self.max_pattern_id = 0;
+ }
+
+ /// Return the maximum pattern identifier in this collection. This can be
+    /// useful in searchers for ensuring that the collections of patterns
+    /// provided at search time and at build time have the same size.
+ pub fn max_pattern_id(&self) -> PatternID {
+ assert_eq!((self.max_pattern_id + 1) as usize, self.len());
+ self.max_pattern_id
+ }
+
+ /// Returns the length, in bytes, of the smallest pattern.
+ ///
+ /// This is guaranteed to be at least one.
+ pub fn minimum_len(&self) -> usize {
+ self.minimum_len
+ }
+
+ /// Returns the match semantics used by these patterns.
+ pub fn match_kind(&self) -> &MatchKind {
+ &self.kind
+ }
+
+ /// Return the pattern with the given identifier. If such a pattern does
+ /// not exist, then this panics.
+ pub fn get(&self, id: PatternID) -> Pattern {
+ Pattern(&self.by_id[id as usize])
+ }
+
+ /// Return the pattern with the given identifier without performing bounds
+ /// checks.
+ ///
+ /// # Safety
+ ///
+ /// Callers must ensure that a pattern with the given identifier exists
+ /// before using this method.
+ #[cfg(target_arch = "x86_64")]
+ pub unsafe fn get_unchecked(&self, id: PatternID) -> Pattern {
+ Pattern(self.by_id.get_unchecked(id as usize))
+ }
+
+ /// Return an iterator over all the patterns in this collection, in the
+ /// order in which they should be matched.
+ ///
+ /// Specifically, in a naive multi-pattern matcher, the following is
+ /// guaranteed to satisfy the match semantics of this collection of
+ /// patterns:
+ ///
+ /// ```ignore
+ /// for i in 0..haystack.len():
+ /// for p in patterns.iter():
+ /// if haystack[i..].starts_with(p.bytes()):
+ /// return Match(p.id(), i, i + p.bytes().len())
+ /// ```
+ ///
+ /// Namely, among the patterns in a collection, if they are matched in
+ /// the order provided by this iterator, then the result is guaranteed
+ /// to satisfy the correct match semantics. (Either leftmost-first or
+ /// leftmost-longest.)
+ pub fn iter(&self) -> PatternIter {
+ PatternIter { patterns: self, i: 0 }
+ }
+}
+
+/// An iterator over the patterns in the `Patterns` collection.
+///
+/// The order of the patterns provided by this iterator is consistent with the
+/// match semantics of the originating collection of patterns.
+///
+/// The lifetime `'p` corresponds to the lifetime of the collection of patterns
+/// this is iterating over.
+#[derive(Debug)]
+pub struct PatternIter<'p> {
+ patterns: &'p Patterns,
+ i: usize,
+}
+
+impl<'p> Iterator for PatternIter<'p> {
+ type Item = (PatternID, Pattern<'p>);
+
+ fn next(&mut self) -> Option<(PatternID, Pattern<'p>)> {
+ if self.i >= self.patterns.len() {
+ return None;
+ }
+ let id = self.patterns.order[self.i];
+ let p = self.patterns.get(id);
+ self.i += 1;
+ Some((id, p))
+ }
+}
+
+/// A pattern that is used in packed searching.
+#[derive(Clone)]
+pub struct Pattern<'a>(&'a [u8]);
+
+impl<'a> fmt::Debug for Pattern<'a> {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ f.debug_struct("Pattern")
+ .field("lit", &String::from_utf8_lossy(&self.0))
+ .finish()
+ }
+}
+
+impl<'p> Pattern<'p> {
+ /// Returns the length of this pattern, in bytes.
+ pub fn len(&self) -> usize {
+ self.0.len()
+ }
+
+ /// Returns the bytes of this pattern.
+ pub fn bytes(&self) -> &[u8] {
+ &self.0
+ }
+
+ /// Returns the first `len` low nybbles from this pattern. If this pattern
+ /// is shorter than `len`, then this panics.
+ #[cfg(target_arch = "x86_64")]
+ pub fn low_nybbles(&self, len: usize) -> Vec<u8> {
+ let mut nybs = vec![];
+ for &b in self.bytes().iter().take(len) {
+ nybs.push(b & 0xF);
+ }
+ nybs
+ }
+
+ /// Returns true if this pattern is a prefix of the given bytes.
+ #[inline(always)]
+ pub fn is_prefix(&self, bytes: &[u8]) -> bool {
+ self.len() <= bytes.len() && self.equals(&bytes[..self.len()])
+ }
+
+ /// Returns true if and only if this pattern equals the given bytes.
+ #[inline(always)]
+ pub fn equals(&self, bytes: &[u8]) -> bool {
+ // Why not just use memcmp for this? Well, memcmp requires calling out
+ // to libc, and this routine is called in fairly hot code paths. Other
+ // than just calling out to libc, it also seems to result in worse
+        // codegen. By rolling our own memcmp in pure Rust, it seems to be
+        // more friendly to the optimizer.
+ //
+ // This results in an improvement in just about every benchmark. Some
+ // smaller than others, but in some cases, up to 30% faster.
+
+ if self.len() != bytes.len() {
+ return false;
+ }
+ if self.len() < 8 {
+ for (&b1, &b2) in self.bytes().iter().zip(bytes) {
+ if b1 != b2 {
+ return false;
+ }
+ }
+ return true;
+ }
+ // When we have 8 or more bytes to compare, then proceed in chunks of
+ // 8 at a time using unaligned loads.
+ let mut p1 = self.bytes().as_ptr();
+ let mut p2 = bytes.as_ptr();
+ let p1end = self.bytes()[self.len() - 8..].as_ptr();
+ let p2end = bytes[bytes.len() - 8..].as_ptr();
+ // SAFETY: Via the conditional above, we know that both `p1` and `p2`
+ // have the same length, so `p1 < p1end` implies that `p2 < p2end`.
+        // Thus, dereferencing both `p1` and `p2` in the loop below is safe.
+ //
+ // Moreover, we set `p1end` and `p2end` to be 8 bytes before the actual
+        // end of `p1` and `p2`. Thus, the final dereference outside of the
+ // loop is guaranteed to be valid.
+ //
+ // Finally, we needn't worry about 64-bit alignment here, since we
+ // do unaligned loads.
+ unsafe {
+ while p1 < p1end {
+ let v1 = (p1 as *const u64).read_unaligned();
+ let v2 = (p2 as *const u64).read_unaligned();
+ if v1 != v2 {
+ return false;
+ }
+ p1 = p1.add(8);
+ p2 = p2.add(8);
+ }
+ let v1 = (p1end as *const u64).read_unaligned();
+ let v2 = (p2end as *const u64).read_unaligned();
+ v1 == v2
+ }
+ }
+}
diff --git a/src/packed/rabinkarp.rs b/src/packed/rabinkarp.rs
new file mode 100644
index 0000000..3992296
--- /dev/null
+++ b/src/packed/rabinkarp.rs
@@ -0,0 +1,185 @@
+use std::mem;
+
+use packed::pattern::{PatternID, Patterns};
+use Match;
+
+/// The type of the rolling hash used in the Rabin-Karp algorithm.
+type Hash = usize;
+
+/// The number of buckets to store our patterns in. We don't want this to be
+/// too big in order to avoid wasting memory, but we don't want it to be too
+/// small either to avoid spending too much time confirming literals.
+///
+/// The number of buckets MUST be a power of two. Otherwise, determining the
+/// bucket from a hash will slow down the code considerably. Using a power
+/// of two means `hash % NUM_BUCKETS` can compile down to a simple `and`
+/// instruction.
+const NUM_BUCKETS: usize = 64;
+
+/// An implementation of the Rabin-Karp algorithm. The main idea of this
+/// algorithm is to maintain a rolling hash as it moves through the input, and
+/// then check whether that hash corresponds to the same hash for any of the
+/// patterns we're looking for.
+///
+/// A drawback of naively scaling Rabin-Karp to multiple patterns is that
+/// it requires all of the patterns to be the same length, which in turn
+/// corresponds to the number of bytes to hash. We adapt this to work for
+/// multiple patterns of varying size by fixing the number of bytes to hash
+/// to be the length of the smallest pattern. We also split the patterns into
+/// several buckets to hopefully make the confirmation step faster.
+///
+/// Wikipedia has a decent explanation, if a bit heavy on the theory:
+/// https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm
+///
+/// But ESMAJ provides something a bit more concrete:
+/// http://www-igm.univ-mlv.fr/~lecroq/string/node5.html
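+///
+/// As a rough sketch of the hashing scheme used below (with `n` being the
+/// length of the hashing window, i.e., the length of the smallest pattern),
+/// the hash of a window is
+///
+/// ```text
+/// H(s[i..i+n]) = s[i]*2^(n-1) + s[i+1]*2^(n-2) + ... + s[i+n-1]
+/// ```
+///
+/// and sliding the window one byte to the right updates it with
+///
+/// ```text
+/// H(s[i+1..i+n+1]) = (H(s[i..i+n]) - s[i]*2^(n-1)) * 2 + s[i+n]
+/// ```
+///
+/// where all arithmetic is wrapping. This is exactly what `hash` and
+/// `update_hash` compute, with `hash_2pow` caching the `2^(n-1)` factor.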
+#[derive(Clone, Debug)]
+pub struct RabinKarp {
+ /// The order of patterns in each bucket is significant. Namely, they are
+ /// arranged such that the first one to match is the correct match. This
+ /// may not necessarily correspond to the order provided by the caller.
+ /// For example, if leftmost-longest semantics are used, then the patterns
+ /// are sorted by their length in descending order. If leftmost-first
+ /// semantics are used, then the patterns are sorted by their pattern ID
+ /// in ascending order (which corresponds to the caller's order).
+ buckets: Vec<Vec<(Hash, PatternID)>>,
+ /// The length of the hashing window. Generally, this corresponds to the
+ /// length of the smallest pattern.
+ hash_len: usize,
+ /// The factor to subtract out of a hash before updating it with a new
+ /// byte.
+ hash_2pow: usize,
+ /// The maximum identifier of a pattern. This is used as a sanity check
+ /// to ensure that the patterns provided by the caller are the same as
+ /// the patterns that were used to compile the matcher. This sanity check
+ /// possibly permits safely eliminating bounds checks regardless of what
+ /// patterns are provided by the caller.
+ ///
+ /// (Currently, we don't use this to elide bounds checks since it doesn't
+ /// result in a measurable performance improvement, but we do use it for
+ /// better failure modes.)
+ max_pattern_id: PatternID,
+}
+
+impl RabinKarp {
+ /// Compile a new Rabin-Karp matcher from the patterns given.
+ ///
+ /// This panics if any of the patterns in the collection are empty, or if
+ /// the collection is itself empty.
+ pub fn new(patterns: &Patterns) -> RabinKarp {
+ assert!(patterns.len() >= 1);
+ let hash_len = patterns.minimum_len();
+ assert!(hash_len >= 1);
+
+ let mut hash_2pow = 1usize;
+ for _ in 1..hash_len {
+ hash_2pow = hash_2pow.wrapping_shl(1);
+ }
+
+ let mut rk = RabinKarp {
+ buckets: vec![vec![]; NUM_BUCKETS],
+ hash_len,
+ hash_2pow,
+ max_pattern_id: patterns.max_pattern_id(),
+ };
+ for (id, pat) in patterns.iter() {
+ let hash = rk.hash(&pat.bytes()[..rk.hash_len]);
+ let bucket = hash % NUM_BUCKETS;
+ rk.buckets[bucket].push((hash, id));
+ }
+ rk
+ }
+
+    /// Return the first matching pattern in the given haystack, beginning the
+ /// search at `at`.
+ pub fn find_at(
+ &self,
+ patterns: &Patterns,
+ haystack: &[u8],
+ mut at: usize,
+ ) -> Option<Match> {
+ assert_eq!(NUM_BUCKETS, self.buckets.len());
+ assert_eq!(
+ self.max_pattern_id,
+ patterns.max_pattern_id(),
+ "Rabin-Karp must be called with same patterns it was built with",
+ );
+
+ if at + self.hash_len > haystack.len() {
+ return None;
+ }
+ let mut hash = self.hash(&haystack[at..at + self.hash_len]);
+ loop {
+ let bucket = &self.buckets[hash % NUM_BUCKETS];
+ for &(phash, pid) in bucket {
+ if phash == hash {
+ if let Some(c) = self.verify(patterns, pid, haystack, at) {
+ return Some(c);
+ }
+ }
+ }
+ if at + self.hash_len >= haystack.len() {
+ return None;
+ }
+ hash = self.update_hash(
+ hash,
+ haystack[at],
+ haystack[at + self.hash_len],
+ );
+ at += 1;
+ }
+ }
+
+ /// Returns the approximate total amount of heap used by this searcher, in
+ /// units of bytes.
+ pub fn heap_bytes(&self) -> usize {
+ let num_patterns = self.max_pattern_id as usize + 1;
+ self.buckets.len() * mem::size_of::<Vec<(Hash, PatternID)>>()
+ + num_patterns * mem::size_of::<(Hash, PatternID)>()
+ }
+
+ /// Verify whether the pattern with the given id matches at
+ /// `haystack[at..]`.
+ ///
+ /// We tag this function as `cold` because it helps improve codegen.
+ /// Intuitively, it would seem like inlining it would be better. However,
+    /// the only time this is called and a match is not found is when there
+    /// is a hash collision, or when a prefix of a pattern matches but
+ /// the entire pattern doesn't match. This is hopefully fairly rare, and
+ /// if it does occur a lot, it's going to be slow no matter what we do.
+ #[cold]
+ fn verify(
+ &self,
+ patterns: &Patterns,
+ id: PatternID,
+ haystack: &[u8],
+ at: usize,
+ ) -> Option<Match> {
+ let pat = patterns.get(id);
+ if pat.is_prefix(&haystack[at..]) {
+ Some(Match::from_span(id as usize, at, at + pat.len()))
+ } else {
+ None
+ }
+ }
+
+ /// Hash the given bytes.
+ fn hash(&self, bytes: &[u8]) -> Hash {
+ assert_eq!(self.hash_len, bytes.len());
+
+ let mut hash = 0usize;
+ for &b in bytes {
+ hash = hash.wrapping_shl(1).wrapping_add(b as usize);
+ }
+ hash
+ }
+
+ /// Update the hash given based on removing `old_byte` at the beginning
+ /// of some byte string, and appending `new_byte` to the end of that same
+ /// byte string.
+ fn update_hash(&self, prev: Hash, old_byte: u8, new_byte: u8) -> Hash {
+ prev.wrapping_sub((old_byte as usize).wrapping_mul(self.hash_2pow))
+ .wrapping_shl(1)
+ .wrapping_add(new_byte as usize)
+ }
+}
diff --git a/src/packed/teddy/README.md b/src/packed/teddy/README.md
new file mode 100644
index 0000000..0c42383
--- /dev/null
+++ b/src/packed/teddy/README.md
@@ -0,0 +1,386 @@
+Teddy is a SIMD accelerated multiple substring matching algorithm. The name
+and the core ideas in the algorithm were learned from the [Hyperscan][1_u]
+project. The implementation in this repository was mostly motivated for use in
+accelerating regex searches by searching for small sets of required literals
+extracted from the regex.
+
+
+# Background
+
+The key idea of Teddy is to do *packed* substring matching. In the literature,
+packed substring matching is the idea of examining multiple bytes in a haystack
+at a time to detect matches. Implementations of, for example, memchr (which
+detects matches of a single byte) have been doing this for years. Only
+recently, with the introduction of various SIMD instructions, has this been
+extended to substring matching. The PCMPESTRI instruction (and its relatives),
+for example, implements substring matching in hardware. It is, however, limited
+to substrings of length 16 bytes or fewer, but this restriction is fine in a
+regex engine, since we rarely care about the performance difference between
+searching for a 16 byte literal and a 16 + N literal; 16 is already long
+enough. The key downside of the PCMPESTRI instruction, on current (2016) CPUs
+at least, is its latency and throughput. As a result, it is often faster to
+do substring search with a Boyer-Moore (or Two-Way) variant and a well placed
+memchr to quickly skip through the haystack.
+
+There are fewer results from the literature on packed substring matching,
+and even fewer for packed multiple substring matching. Ben-Kiki et al. [2]
+describes use of PCMPESTRI for substring matching, but is mostly theoretical
+and hand-waves performance. There is other theoretical work done by Bille [3]
+as well.
+
+The rest of the work in the field, as far as I'm aware, is by Faro and Kulekci
+and is generally focused on multiple pattern search. Their first paper [4a]
+introduces the concept of a fingerprint, which is computed for every block of
+N bytes in every pattern. The haystack is then scanned N bytes at a time and
+a fingerprint is computed in the same way it was computed for blocks in the
+patterns. If the fingerprint corresponds to one that was found in a pattern,
+then a verification step follows to confirm that one of the substrings with the
+corresponding fingerprint actually matches at the current location. Various
+implementation tricks are employed to make sure the fingerprint lookup is fast;
+typically by truncating the fingerprint. (This may, of course, provoke more
+steps in the verification process, so a balance must be struck.)
+
+The main downside of [4a] is that the minimum substring length is 32 bytes,
+presumably because of how the algorithm uses certain SIMD instructions. This
+essentially makes it useless for general purpose regex matching, where a small
+number of short patterns is far more likely.
+
+Faro and Kulekci published another paper [4b] that is conceptually very similar
+to [4a]. The key difference is that it uses the CRC32 instruction (introduced
+as part of SSE 4.2) to compute fingerprint values. This also enables the
+algorithm to work effectively on substrings as short as 7 bytes with 4 byte
+windows. 7 bytes is unfortunately still too long. The window could technically
+be shrunk to 2 bytes, thereby reducing the minimum length to 3, but the small
+window size ends up negating most of the performance benefits, and short
+patterns are likely the common case in a general purpose regex engine.
+
+Faro and Kulekci also published [4c] that appears to be intended as a
+replacement to using PCMPESTRI. In particular, it is specifically motivated by
+the high throughput/latency time of PCMPESTRI and therefore chooses other SIMD
+instructions that are faster. While this approach works for short substrings,
+I personally couldn't see a way to generalize it to multiple substring search.
+
+Faro and Kulekci have another paper [4d] that I haven't been able to read
+because it is behind a paywall.
+
+
+# Teddy
+
+Finally, we get to Teddy. If the above literature review is complete, then it
+appears that Teddy is a novel algorithm. More than that, in my experience, it
+completely blows away the competition for short substrings, which is exactly
+what we want in a general purpose regex engine. Again, the algorithm appears
+to be developed by the authors of [Hyperscan][1_u]. Hyperscan was open sourced
+late 2015, and no earlier history could be found. Therefore, tracking the exact
+provenance of the algorithm with respect to the published literature seems
+difficult.
+
+At a high level, Teddy works somewhat similarly to the fingerprint algorithms
+published by Faro and Kulekci, but Teddy does it in a way that scales a bit
+better. Namely:
+
+1. Teddy's core algorithm scans the haystack in 16 (for SSE, or 32 for AVX)
+ byte chunks. 16 (or 32) is significant because it corresponds to the number
+ of bytes in a SIMD vector.
+2. Bitwise operations are performed on each chunk to discover if any region of
+ it matches a set of precomputed fingerprints from the patterns. If there are
+ matches, then a verification step is performed. In this implementation, our
+ verification step is naive. This can be improved upon.
+
+The details to make this work are quite clever. First, we must choose how to
+pick our fingerprints. In Hyperscan's implementation, I *believe* they use the
+last N bytes of each substring, where N must be at least the minimum length of
+any substring in the set being searched. In this implementation, we use the
+first N bytes of each substring. (The tradeoffs between these choices aren't
+yet clear to me.) We then must figure out how to quickly test whether an
+occurrence of any fingerprint from the set of patterns appears in a 16 byte
+block from the haystack. To keep things simple, let's assume N = 1 and examine
+some examples to motivate the approach. Here are our patterns:
+
+```ignore
+foo
+bar
+baz
+```
+
+The corresponding fingerprints, for N = 1, are `f`, `b` and `b`. Now let's set
+our 16 byte block to:
+
+```ignore
+bat cat foo bump
+xxxxxxxxxxxxxxxx
+```
+
+To cut to the chase, Teddy works by using bitsets. In particular, Teddy creates
+a mask that allows us to quickly compute membership of a fingerprint in a 16
+byte block that also tells which pattern the fingerprint corresponds to. In
+this case, our fingerprint is a single byte, so an appropriate abstraction is
+a map from a single byte to a list of patterns that contain that fingerprint:
+
+```ignore
+f |--> foo
+b |--> bar, baz
+```
+
+Now, all we need to do is figure out how to represent this map in vector space
+and use normal SIMD operations to perform a lookup. The first simplification
+we can make is to represent our patterns as bit fields occupying a single
+byte. This is important, because a single SIMD vector can store 16 bytes.
+
+```ignore
+f |--> 00000001
+b |--> 00000010, 00000100
+```
+
+How do we perform lookup though? It turns out that SSSE3 introduced a very cool
+instruction called PSHUFB. The instruction takes two SIMD vectors, `A` and `B`,
+and returns a third vector `C`. All vectors are treated as 16 8-bit integers.
+`C` is formed by `C[i] = A[B[i]]`. (This is a bit of a simplification, but true
+for the purposes of this algorithm. For full details, see [Intel's Intrinsics
+Guide][5_u].) This essentially lets us use the values in `B` to lookup values
+in `A`.
+
+If we could somehow cause `B` to contain our 16 byte block from the haystack,
+and if `A` could contain our bitmasks, then we'd end up with something like
+this for `A`:
+
+```ignore
+ 0x00 0x01 ... 0x62 ... 0x66 ... 0xFF
+A = 0 0 00000110 00000001 0
+```
+
+And if `B` contains our window from our haystack, we could use shuffle to take
+the values from `B` and use them to look up our bitsets in `A`. But of course,
+we can't do this because `A` in the above example contains 256 bytes, which
+is much larger than the size of a SIMD vector.
+
+Nybbles to the rescue! A nybble is 4 bits. Instead of one mask to hold all of
+our bitsets, we can use two masks, where one mask corresponds to the lower four
+bits of our fingerprint and the other mask corresponds to the upper four bits.
+So our map now looks like:
+
+```ignore
+'f' & 0xF = 0x6 |--> 00000001
+'f' >> 4 = 0x6 |--> 00000111
+'b' & 0xF = 0x2 |--> 00000110
+'b' >> 4 = 0x6 |--> 00000111
+```
+
+Notice that the bitsets for each nybble correspond to the union of all
+fingerprints that contain that nybble. For example, both `f` and `b` have the
+same upper 4 bits but differ on the lower 4 bits. Putting this together, we
+have `A0`, `A1` and `B`, where `A0` is our mask for the lower nybble, `A1` is
+our mask for the upper nybble and `B` is our 16 byte block from the haystack:
+
+```ignore
+ 0x00 0x01 0x02 0x03 ... 0x06 ... 0xF
+A0 = 0 0 00000110 0 00000001 0
+A1 = 0 0 0 0 00000111 0
+B = b a t _ t p
+B = 0x62 0x61 0x74 0x20 0x74 0x70
+```
+
+But of course, we can't use `B` with `PSHUFB` yet, since its values are 8 bits,
+and we need indexes that are at most 4 bits (corresponding to one of 16
+values). We can apply the same transformation to split `B` into lower and upper
+nybbles as we did `A`. As before, `B0` corresponds to the lower nybbles and
+`B1` corresponds to the upper nybbles:
+
+```ignore
+ b a t _ c a t _ f o o _ b u m p
+B0 = 0x2 0x1 0x4 0x0 0x3 0x1 0x4 0x0 0x6 0xF 0xF 0x0 0x2 0x5 0xD 0x0
+B1 = 0x6 0x6 0x7 0x2 0x6 0x6 0x7 0x2 0x6 0x6 0x6 0x2 0x6 0x7 0x6 0x7
+```
+
+And now we have a nice correspondence. `B0` can index `A0` and `B1` can index
+`A1`. Here's what we get when we apply `C0 = PSHUFB(A0, B0)`:
+
+```ignore
+ b a ... f o ... p
+ A0[0x2] A0[0x1] A0[0x6] A0[0xF] A0[0x0]
+C0 = 00000110 0 00000001 0 0
+```
+
+And `C1 = PSHUFB(A1, B1)`:
+
+```ignore
+ b a ... f o ... p
+ A1[0x6] A1[0x6] A1[0x6] A1[0x6] A1[0x7]
+C1 = 00000111 00000111 00000111 00000111 0
+```
+
+Notice how neither one of `C0` or `C1` is guaranteed to report fully correct
+results all on its own. For example, `C1` claims that `b` is a fingerprint for
+the pattern `foo` (since `A1[0x6] = 00000111`), and that `o` is a fingerprint
+for all of our patterns. But if we combined `C0` and `C1` with an `AND`
+operation:
+
+```ignore
+ b a ... f o ... p
+C = 00000110 0 00000001 0 0
+```
+
+Then we now have that `C[i]` contains a bitset corresponding to the matching
+fingerprints in a haystack's 16 byte block, where `i` is the `ith` byte in that
+block.
+
+Once we have that, we can look for the position of the least significant bit
+in `C`. (Least significant because we only target `x86_64` here, which is
+always little endian. Thus, the least significant bytes correspond to bytes
+in our haystack at a lower address.) That position, modulo `8`, gives us
+the pattern that the fingerprint matches. That position, integer divided by
+`8`, also gives us the byte offset that the fingerprint occurs in inside the
+16 byte haystack block. Using those two pieces of information, we can run a
+verification procedure that tries to match all substrings containing that
+fingerprint at that position in the haystack.
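+
+To make the data flow above concrete, here is a hedged scalar sketch of the
+N = 1 candidate detection step, using the `foo`/`bar`/`baz` example from
+before. The function names are illustrative only; the real implementation
+performs the same lookups with PSHUFB on 16 (or 32) byte vectors rather than
+with per-byte loops, and groups patterns into buckets rather than giving each
+pattern its own bit.
+
+```rust
+/// Build the low/high nybble masks from the 1 byte fingerprint (here, the
+/// first byte) of each pattern. Bit `i` of an entry means "pattern `i` has a
+/// fingerprint with this nybble". This sketch supports at most 8 patterns.
+fn build_masks(patterns: &[&[u8]]) -> ([u8; 16], [u8; 16]) {
+    let (mut a0, mut a1) = ([0u8; 16], [0u8; 16]);
+    for (i, pat) in patterns.iter().enumerate() {
+        let fingerprint = pat[0];
+        a0[(fingerprint & 0xF) as usize] |= 1 << i;
+        a1[(fingerprint >> 4) as usize] |= 1 << i;
+    }
+    (a0, a1)
+}
+
+/// For each byte in a 16 byte block, compute the bitset of patterns whose
+/// fingerprint could occur at that position. This is the scalar equivalent
+/// of C = PSHUFB(A0, B0) & PSHUFB(A1, B1).
+fn candidates(a0: &[u8; 16], a1: &[u8; 16], block: &[u8; 16]) -> [u8; 16] {
+    let mut c = [0u8; 16];
+    for (i, &b) in block.iter().enumerate() {
+        c[i] = a0[(b & 0xF) as usize] & a1[(b >> 4) as usize];
+    }
+    c
+}
+
+fn main() {
+    let patterns: &[&[u8]] = &[b"foo", b"bar", b"baz"];
+    let (a0, a1) = build_masks(patterns);
+    let block: &[u8; 16] = b"bat cat foo bump";
+    for (offset, &bits) in candidates(&a0, &a1, block).iter().enumerate() {
+        if bits != 0 {
+            // Each set bit is a candidate pattern to verify at `offset`.
+            println!("offset {:2}: candidate bits {:08b}", offset, bits);
+        }
+    }
+}
+```
+
+Running this reports candidates at offsets 0 and 12 (the `b` fingerprint for
+`bar`/`baz`) and at offset 8 (the `f` fingerprint for `foo`); only the
+candidate at offset 8 survives verification.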
+
+
+# Implementation notes
+
+The problem with the algorithm as described above is that it uses a single byte
+for a fingerprint. This will work well if the fingerprints are rare in the
+haystack (e.g., capital letters or special characters in normal English text),
+but if the fingerprints are common, you'll wind up spending too much time in
+the verification step, which effectively negates the performance benefits of
+scanning 16 bytes at a time. Remember, the key to the performance of this
+algorithm is to do as little work as possible per 16 (or 32) bytes.
+
+This algorithm can be extrapolated in a relatively straight-forward way to use
+larger fingerprints. That is, instead of a single byte prefix, we might use a
+two or three byte prefix. The implementation here implements N = {1, 2, 3}
+and always picks the largest N possible. The rationale is that the bigger the
+fingerprint, the fewer verification steps we'll do. Of course, if N is too
+large, then we'll end up doing too much on each step.
+
+The way to extend it is:
+
+1. Add a mask for each byte in the fingerprint. (Remember that each mask is
+ composed of two SIMD vectors.) This results in a value of `C` for each byte
+ in the fingerprint while searching.
+2. When testing each 16 (or 32) byte block, each value of `C` must be shifted
+ so that they are aligned. Once aligned, they should all be `AND`'d together.
+ This will give you only the bitsets corresponding to the full match of the
+ fingerprint. To do this, one needs to save the last byte (for N=2) or last
+ two bytes (for N=3) from the previous iteration, and then line them up with
+ the first one or two bytes of the next iteration.
+
+## Verification
+
+Verification generally follows the procedure outlined above. The tricky parts
+are in the right formulation of operations to get our bits out of our vectors.
+We have a limited set of operations available to us on SIMD vectors as 128-bit
+or 256-bit numbers, so we wind up needing to rip out 2 (or 4) 64-bit integers
+from our vectors, and then run our verification step on each of those. The
+verification step looks at the least significant bit set, and from its
+position, we can derive the byte offset and bucket. (Again, as described
+above.) Once we know the bucket, we do a fairly naive exhaustive search for
+every literal in that bucket. (Hyperscan is a bit smarter here and uses a hash
+table, but I haven't had time to thoroughly explore that. A few initial
+half-hearted attempts resulted in worse performance.)
+
+## AVX
+
+The AVX version of Teddy extrapolates almost perfectly from the SSE version.
+The only hiccup is that PALIGNR is used to align chunks in the 16 byte (SSE)
+version, and there is no equivalent instruction in AVX. AVX does have
+VPALIGNR, but it
+only works within 128-bit lanes. So there's a bit of tomfoolery to get around
+this by shuffling the vectors before calling VPALIGNR.
+
+The only other aspect to AVX is that since our masks are still fundamentally
+16-bytes (0x0-0xF), they are duplicated to 32-bytes, so that they can apply to
+32-byte chunks.
+
+## Fat Teddy
+
+In the version of Teddy described above, 8 buckets are used to group patterns
+that we want to search for. However, when AVX is available, we can extend the
+number of buckets to 16 by permitting each byte in our masks to use 16-bits
+instead of 8-bits to represent the buckets it belongs to. (This variant is also
+in Hyperscan.) However, what we give up is the ability to scan 32 bytes at a
+time, even though we're using AVX. Instead, we have to scan 16 bytes at a time.
+What we gain, though, is (hopefully) less work in our verification routine.
+If patterns are more spread out across more buckets, then there should overall
+be fewer false positives. In general, Fat Teddy permits us to grow our capacity
+a bit and search for more literals before Teddy gets overwhelmed.
+
+The tricky part of Fat Teddy is in how we adjust our masks and our verification
+procedure. For the masks, we simply represent the first 8 buckets in each of
+the low 16 bytes, and then the second 8 buckets in each of the high 16 bytes.
+Then, in the search loop, instead of loading 32 bytes from the haystack, we
+load the same 16 bytes from the haystack into both the low and high 16 byte
+portions of our 256-bit vector. So for example, a mask might look like this:
+
+ bits: 00100001 00000000 ... 11000000 00000000 00000001 ... 00000000
+ byte: 31 30 16 15 14 0
+ offset: 15 14 0 15 14 0
+ buckets: 8-15 8-15 8-15 0-7 0-7 0-7
+
+Where `byte` is the position in the vector (higher numbers corresponding to
+more significant bits), `offset` is the corresponding position in the haystack
+chunk, and `buckets` corresponds to the bucket assignments for that particular
+byte.
+
+In particular, notice that the bucket assignments for offset `0` are spread
+out between bytes `0` and `16`. This works well for the chunk-by-chunk search
+procedure, but verification really wants to process all bucket assignments for
+each offset at once. Otherwise, we might wind up finding a match at offset
+`1` in one of the first 8 buckets, when we really should have reported a match
+at offset `0` in one of the second 8 buckets. (Because we want the leftmost
+match.)
+
+Thus, for verification, we rearrange the above vector such that it is a
+sequence of 16-bit integers, where the least significant 16-bit integer
+corresponds to all of the bucket assignments for offset `0`. So with the
+above vector, the least significant 16-bit integer would be
+
+    11000000 00000000
+
+which was taken from bytes `16` and `0`. Then the verification step pretty much
+runs as described, except with 16 buckets instead of 8.
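+
+A sketch of that rearrangement (it mirrors `Teddy::verify_fat256` in
+`runtime.rs` below, where `cand` is the candidate vector described above):
+
+    // Swap the two 128-bit halves of the candidate.
+    let swap = _mm256_permute4x64_epi64(cand, 0x4E);
+    // Interleave bytes so that, for each offset, the buckets 0-7 bitset and
+    // the buckets 8-15 bitset sit next to each other as one 16-bit bitset.
+    let r1 = _mm256_unpacklo_epi8(cand, swap);
+    let r2 = _mm256_unpackhi_epi8(cand, swap);
+    // The low 64 bits of each 128-bit lane of r1 and r2 now hold all of the
+    // 16-bit bitsets in haystack order; verification then proceeds as usual,
+    // but with 16 buckets per offset instead of 8.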
+
+
+# References
+
+- **[1]** [Hyperscan on GitHub](https://github.com/01org/hyperscan),
+ [webpage](https://01.org/hyperscan)
+- **[2a]** Ben-Kiki, O., Bille, P., Breslauer, D., Gasieniec, L., Grossi, R.,
+ & Weimann, O. (2011).
+ _Optimal packed string matching_.
+ In LIPIcs-Leibniz International Proceedings in Informatics (Vol. 13).
+ Schloss Dagstuhl-Leibniz-Zentrum fuer Informatik.
+ DOI: 10.4230/LIPIcs.FSTTCS.2011.423.
+ [PDF](http://drops.dagstuhl.de/opus/volltexte/2011/3355/pdf/37.pdf).
+- **[2b]** Ben-Kiki, O., Bille, P., Breslauer, D., Ga̧sieniec, L., Grossi, R.,
+ & Weimann, O. (2014).
+ _Towards optimal packed string matching_.
+ Theoretical Computer Science, 525, 111-129.
+ DOI: 10.1016/j.tcs.2013.06.013.
+ [PDF](http://www.cs.haifa.ac.il/~oren/Publications/bpsm.pdf).
+- **[3]** Bille, P. (2011).
+ _Fast searching in packed strings_.
+ Journal of Discrete Algorithms, 9(1), 49-56.
+ DOI: 10.1016/j.jda.2010.09.003.
+ [PDF](http://www.sciencedirect.com/science/article/pii/S1570866710000353).
+- **[4a]** Faro, S., & Külekci, M. O. (2012, October).
+ _Fast multiple string matching using streaming SIMD extensions technology_.
+ In String Processing and Information Retrieval (pp. 217-228).
+ Springer Berlin Heidelberg.
+ DOI: 10.1007/978-3-642-34109-0_23.
+ [PDF](http://www.dmi.unict.it/~faro/papers/conference/faro32.pdf).
+- **[4b]** Faro, S., & Külekci, M. O. (2013, September).
+ _Towards a Very Fast Multiple String Matching Algorithm for Short Patterns_.
+ In Stringology (pp. 78-91).
+ [PDF](http://www.dmi.unict.it/~faro/papers/conference/faro36.pdf).
+- **[4c]** Faro, S., & Külekci, M. O. (2013, January).
+ _Fast packed string matching for short patterns_.
+  In Proceedings of the Meeting on Algorithm Engineering & Experiments
+ (pp. 113-121).
+ Society for Industrial and Applied Mathematics.
+ [PDF](http://arxiv.org/pdf/1209.6449.pdf).
+- **[4d]** Faro, S., & Külekci, M. O. (2014).
+ _Fast and flexible packed string matching_.
+ Journal of Discrete Algorithms, 28, 61-72.
+ DOI: 10.1016/j.jda.2014.07.003.
+
+[1_u]: https://github.com/01org/hyperscan
+[5_u]: https://software.intel.com/sites/landingpage/IntrinsicsGuide
diff --git a/src/packed/teddy/compile.rs b/src/packed/teddy/compile.rs
new file mode 100644
index 0000000..a7a48b7
--- /dev/null
+++ b/src/packed/teddy/compile.rs
@@ -0,0 +1,414 @@
+// See the README in this directory for an explanation of the Teddy algorithm.
+
+use std::cmp;
+use std::collections::BTreeMap;
+use std::fmt;
+
+use packed::pattern::{PatternID, Patterns};
+use packed::teddy::Teddy;
+
+/// A builder for constructing a Teddy matcher.
+///
+/// The builder primarily permits fine grained configuration of the Teddy
+/// matcher. Most options are made only available for testing/benchmarking
+/// purposes. In reality, options are automatically determined by the nature
+/// and number of patterns given to the builder.
+#[derive(Clone, Debug)]
+pub struct Builder {
+ /// When none, this is automatically determined. Otherwise, `false` means
+ /// slim Teddy is used (8 buckets) and `true` means fat Teddy is used
+ /// (16 buckets). Fat Teddy requires AVX2, so if that CPU feature isn't
+ /// available and Fat Teddy was requested, no matcher will be built.
+ fat: Option<bool>,
+ /// When none, this is automatically determined. Otherwise, `false` means
+    /// that 128-bit vectors will be used (up to SSSE3 instructions), whereas
+ /// `true` means that 256-bit vectors will be used. As with `fat`, if
+ /// 256-bit vectors are requested and they aren't available, then a
+ /// searcher will not be built.
+ avx: Option<bool>,
+}
+
+impl Default for Builder {
+ fn default() -> Builder {
+ Builder::new()
+ }
+}
+
+impl Builder {
+ /// Create a new builder for configuring a Teddy matcher.
+ pub fn new() -> Builder {
+ Builder { fat: None, avx: None }
+ }
+
+ /// Build a matcher for the set of patterns given. If a matcher could not
+ /// be built, then `None` is returned.
+ ///
+    /// Generally, a matcher isn't built if the necessary CPU features aren't
+    /// available, the target isn't supported, or the searcher is believed to
+    /// be slower than standard techniques (i.e., if there are too many
+    /// literals).
+ pub fn build(&self, patterns: &Patterns) -> Option<Teddy> {
+ self.build_imp(patterns)
+ }
+
+ /// Require the use of Fat (true) or Slim (false) Teddy. Fat Teddy uses
+    /// 16 buckets, whereas Slim Teddy uses 8 buckets. More buckets are useful
+ /// for a larger set of literals.
+ ///
+ /// `None` is the default, which results in an automatic selection based
+ /// on the number of literals and available CPU features.
+ pub fn fat(&mut self, yes: Option<bool>) -> &mut Builder {
+ self.fat = yes;
+ self
+ }
+
+ /// Request the use of 256-bit vectors (true) or 128-bit vectors (false).
+ /// Generally, a larger vector size is better since it either permits
+ /// matching more patterns or matching more bytes in the haystack at once.
+ ///
+ /// `None` is the default, which results in an automatic selection based on
+ /// the number of literals and available CPU features.
+ pub fn avx(&mut self, yes: Option<bool>) -> &mut Builder {
+ self.avx = yes;
+ self
+ }
+
+ fn build_imp(&self, patterns: &Patterns) -> Option<Teddy> {
+ use packed::teddy::runtime;
+
+ // Most of the logic here is just about selecting the optimal settings,
+ // or perhaps even rejecting construction altogether. The choices
+ // we have are: fat (avx only) or not, ssse3 or avx2, and how many
+ // patterns we allow ourselves to search. Additionally, for testing
+ // and benchmarking, we permit callers to try to "force" a setting,
+ // and if the setting isn't allowed (e.g., forcing AVX when AVX isn't
+ // available), then we bail and return nothing.
+
+ if patterns.len() > 64 {
+ return None;
+ }
+ let has_ssse3 = is_x86_feature_detected!("ssse3");
+ let has_avx = is_x86_feature_detected!("avx2");
+ let avx = if self.avx == Some(true) {
+ if !has_avx {
+ return None;
+ }
+ true
+ } else if self.avx == Some(false) {
+ if !has_ssse3 {
+ return None;
+ }
+ false
+ } else if !has_ssse3 && !has_avx {
+ return None;
+ } else {
+ has_avx
+ };
+ let fat = match self.fat {
+ None => avx && patterns.len() > 32,
+ Some(false) => false,
+ Some(true) if !avx => return None,
+ Some(true) => true,
+ };
+
+ let mut compiler = Compiler::new(patterns, fat);
+ compiler.compile();
+ let Compiler { buckets, masks, .. } = compiler;
+ // SAFETY: It is required that the builder only produce Teddy matchers
+ // that are allowed to run on the current CPU, since we later assume
+ // that the presence of (for example) TeddySlim1Mask256 means it is
+ // safe to call functions marked with the `avx2` target feature.
+ match (masks.len(), avx, fat) {
+ (1, false, _) => Some(Teddy {
+ buckets,
+ max_pattern_id: patterns.max_pattern_id(),
+ exec: runtime::Exec::TeddySlim1Mask128(
+ runtime::TeddySlim1Mask128 {
+ mask1: runtime::Mask128::new(masks[0]),
+ },
+ ),
+ }),
+ (1, true, false) => Some(Teddy {
+ buckets,
+ max_pattern_id: patterns.max_pattern_id(),
+ exec: runtime::Exec::TeddySlim1Mask256(
+ runtime::TeddySlim1Mask256 {
+ mask1: runtime::Mask256::new(masks[0]),
+ },
+ ),
+ }),
+ (1, true, true) => Some(Teddy {
+ buckets,
+ max_pattern_id: patterns.max_pattern_id(),
+ exec: runtime::Exec::TeddyFat1Mask256(
+ runtime::TeddyFat1Mask256 {
+ mask1: runtime::Mask256::new(masks[0]),
+ },
+ ),
+ }),
+ (2, false, _) => Some(Teddy {
+ buckets,
+ max_pattern_id: patterns.max_pattern_id(),
+ exec: runtime::Exec::TeddySlim2Mask128(
+ runtime::TeddySlim2Mask128 {
+ mask1: runtime::Mask128::new(masks[0]),
+ mask2: runtime::Mask128::new(masks[1]),
+ },
+ ),
+ }),
+ (2, true, false) => Some(Teddy {
+ buckets,
+ max_pattern_id: patterns.max_pattern_id(),
+ exec: runtime::Exec::TeddySlim2Mask256(
+ runtime::TeddySlim2Mask256 {
+ mask1: runtime::Mask256::new(masks[0]),
+ mask2: runtime::Mask256::new(masks[1]),
+ },
+ ),
+ }),
+ (2, true, true) => Some(Teddy {
+ buckets,
+ max_pattern_id: patterns.max_pattern_id(),
+ exec: runtime::Exec::TeddyFat2Mask256(
+ runtime::TeddyFat2Mask256 {
+ mask1: runtime::Mask256::new(masks[0]),
+ mask2: runtime::Mask256::new(masks[1]),
+ },
+ ),
+ }),
+ (3, false, _) => Some(Teddy {
+ buckets,
+ max_pattern_id: patterns.max_pattern_id(),
+ exec: runtime::Exec::TeddySlim3Mask128(
+ runtime::TeddySlim3Mask128 {
+ mask1: runtime::Mask128::new(masks[0]),
+ mask2: runtime::Mask128::new(masks[1]),
+ mask3: runtime::Mask128::new(masks[2]),
+ },
+ ),
+ }),
+ (3, true, false) => Some(Teddy {
+ buckets,
+ max_pattern_id: patterns.max_pattern_id(),
+ exec: runtime::Exec::TeddySlim3Mask256(
+ runtime::TeddySlim3Mask256 {
+ mask1: runtime::Mask256::new(masks[0]),
+ mask2: runtime::Mask256::new(masks[1]),
+ mask3: runtime::Mask256::new(masks[2]),
+ },
+ ),
+ }),
+ (3, true, true) => Some(Teddy {
+ buckets,
+ max_pattern_id: patterns.max_pattern_id(),
+ exec: runtime::Exec::TeddyFat3Mask256(
+ runtime::TeddyFat3Mask256 {
+ mask1: runtime::Mask256::new(masks[0]),
+ mask2: runtime::Mask256::new(masks[1]),
+ mask3: runtime::Mask256::new(masks[2]),
+ },
+ ),
+ }),
+ _ => unreachable!(),
+ }
+ }
+}
+
+/// A compiler is in charge of allocating patterns into buckets and generating
+/// the masks necessary for searching.
+#[derive(Clone)]
+struct Compiler<'p> {
+ patterns: &'p Patterns,
+ buckets: Vec<Vec<PatternID>>,
+ masks: Vec<Mask>,
+}
+
+impl<'p> Compiler<'p> {
+ /// Create a new Teddy compiler for the given patterns. If `fat` is true,
+ /// then 16 buckets will be used instead of 8.
+ ///
+ /// This panics if any of the patterns given are empty.
+ fn new(patterns: &'p Patterns, fat: bool) -> Compiler<'p> {
+ let mask_len = cmp::min(3, patterns.minimum_len());
+ assert!(1 <= mask_len && mask_len <= 3);
+
+ Compiler {
+ patterns,
+ buckets: vec![vec![]; if fat { 16 } else { 8 }],
+ masks: vec![Mask::default(); mask_len],
+ }
+ }
+
+ /// Compile the patterns in this compiler into buckets and masks.
+ fn compile(&mut self) {
+ let mut lonibble_to_bucket: BTreeMap<Vec<u8>, usize> = BTreeMap::new();
+ for (id, pattern) in self.patterns.iter() {
+ // We try to be slightly clever in how we assign patterns into
+ // buckets. Generally speaking, we want patterns with the same
+ // prefix to be in the same bucket, since it minimizes the amount
+ // of time we spend churning through buckets in the verification
+ // step.
+ //
+ // So we could assign patterns with the same N-prefix (where N
+ // is the size of the mask, which is one of {1, 2, 3}) to the
+ // same bucket. However, case insensitive searches are fairly
+        // common, so, for example, we'd ideally want to treat `abc` and
+        // `ABC` as if they shared the same prefix. ASCII has the nice
+        // property that the lower 4 bits of `A` and `a` are the same, so we
+        // therefore group patterns with the same low-nybble-N-prefix into
+ // the same bucket.
+ //
+ // MOREOVER, this is actually necessary for correctness! In
+ // particular, by grouping patterns with the same prefix into the
+ // same bucket, we ensure that we preserve correct leftmost-first
+ // and leftmost-longest match semantics. In addition to the fact
+ // that `patterns.iter()` iterates in the correct order, this
+ // guarantees that all possible ambiguous matches will occur in
+ // the same bucket. The verification routine could be adjusted to
+ // support correct leftmost match semantics regardless of bucket
+ // allocation, but that results in a performance hit. It's much
+ // nicer to be able to just stop as soon as a match is found.
+ let lonybs = pattern.low_nybbles(self.masks.len());
+ if let Some(&bucket) = lonibble_to_bucket.get(&lonybs) {
+ self.buckets[bucket].push(id);
+ } else {
+ // N.B. We assign buckets in reverse because it shouldn't have
+ // any influence on performance, but it does make it harder to
+ // get leftmost match semantics accidentally correct.
+ let bucket = (self.buckets.len() - 1)
+ - (id as usize % self.buckets.len());
+ self.buckets[bucket].push(id);
+ lonibble_to_bucket.insert(lonybs, bucket);
+ }
+ }
+ for (bucket_index, bucket) in self.buckets.iter().enumerate() {
+ for &pat_id in bucket {
+ let pat = self.patterns.get(pat_id);
+ for (i, mask) in self.masks.iter_mut().enumerate() {
+ if self.buckets.len() == 8 {
+ mask.add_slim(bucket_index as u8, pat.bytes()[i]);
+ } else {
+ mask.add_fat(bucket_index as u8, pat.bytes()[i]);
+ }
+ }
+ }
+ }
+ }
+}
+
+impl<'p> fmt::Debug for Compiler<'p> {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ let mut buckets = vec![vec![]; self.buckets.len()];
+ for (i, bucket) in self.buckets.iter().enumerate() {
+ for &patid in bucket {
+ buckets[i].push(self.patterns.get(patid));
+ }
+ }
+ f.debug_struct("Compiler")
+ .field("buckets", &buckets)
+ .field("masks", &self.masks)
+ .finish()
+ }
+}
+
+/// Mask represents the low and high nybble masks that will be used during
+/// search. Each mask is 32 bytes wide, although only the first 16 bytes are
+/// used for the SSSE3 runtime.
+///
+/// Each byte in the mask corresponds to an 8-bit bitset, where bit `i` is set
+/// if and only if the corresponding nybble is in the ith bucket. The index of
+/// the byte (0-15, inclusive) corresponds to the nybble.
+///
+/// Each mask is used as the target of a shuffle, where the indices for the
+/// shuffle are taken from the haystack. AND'ing the shuffles for both the
+/// low and high masks together also results in 8-bit bitsets, but where bit
+/// `i` is set if and only if the corresponding *byte* is in the ith bucket.
+///
+/// During compilation, masks are just arrays. But during search, these masks
+/// are represented as 128-bit or 256-bit vectors.
+///
+/// (See the README in this directory for more details.)
+#[derive(Clone, Copy, Default)]
+pub struct Mask {
+ lo: [u8; 32],
+ hi: [u8; 32],
+}
+
+impl Mask {
+ /// Update this mask by adding the given byte to the given bucket. The
+ /// given bucket must be in the range 0-7.
+ ///
+ /// This is for "slim" Teddy, where there are only 8 buckets.
+ fn add_slim(&mut self, bucket: u8, byte: u8) {
+ assert!(bucket < 8);
+
+ let byte_lo = (byte & 0xF) as usize;
+ let byte_hi = ((byte >> 4) & 0xF) as usize;
+ // When using 256-bit vectors, we need to set this bucket assignment in
+ // the low and high 128-bit portions of the mask. This allows us to
+ // process 32 bytes at a time. Namely, AVX2 shuffles operate on each
+ // of the 128-bit lanes, rather than the full 256-bit vector at once.
+ self.lo[byte_lo] |= 1 << bucket;
+ self.lo[byte_lo + 16] |= 1 << bucket;
+ self.hi[byte_hi] |= 1 << bucket;
+ self.hi[byte_hi + 16] |= 1 << bucket;
+ }
+
+ /// Update this mask by adding the given byte to the given bucket. The
+ /// given bucket must be in the range 0-15.
+ ///
+ /// This is for "fat" Teddy, where there are 16 buckets.
+ fn add_fat(&mut self, bucket: u8, byte: u8) {
+ assert!(bucket < 16);
+
+ let byte_lo = (byte & 0xF) as usize;
+ let byte_hi = ((byte >> 4) & 0xF) as usize;
+ // Unlike slim teddy, fat teddy only works with AVX2. For fat teddy,
+ // the high 128 bits of our mask correspond to buckets 8-15, while the
+ // low 128 bits correspond to buckets 0-7.
+ if bucket < 8 {
+ self.lo[byte_lo] |= 1 << bucket;
+ self.hi[byte_hi] |= 1 << bucket;
+ } else {
+ self.lo[byte_lo + 16] |= 1 << (bucket % 8);
+ self.hi[byte_hi + 16] |= 1 << (bucket % 8);
+ }
+ }
+
+ /// Return the low 128 bits of the low-nybble mask.
+ pub fn lo128(&self) -> [u8; 16] {
+ let mut tmp = [0; 16];
+ tmp.copy_from_slice(&self.lo[..16]);
+ tmp
+ }
+
+ /// Return the full low-nybble mask.
+ pub fn lo256(&self) -> [u8; 32] {
+ self.lo
+ }
+
+ /// Return the low 128 bits of the high-nybble mask.
+ pub fn hi128(&self) -> [u8; 16] {
+ let mut tmp = [0; 16];
+ tmp.copy_from_slice(&self.hi[..16]);
+ tmp
+ }
+
+ /// Return the full high-nybble mask.
+ pub fn hi256(&self) -> [u8; 32] {
+ self.hi
+ }
+}
+
+impl fmt::Debug for Mask {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ let (mut parts_lo, mut parts_hi) = (vec![], vec![]);
+ for i in 0..32 {
+ parts_lo.push(format!("{:02}: {:08b}", i, self.lo[i]));
+ parts_hi.push(format!("{:02}: {:08b}", i, self.hi[i]));
+ }
+ f.debug_struct("Mask")
+ .field("lo", &parts_lo)
+ .field("hi", &parts_hi)
+ .finish()
+ }
+}
diff --git a/src/packed/teddy/mod.rs b/src/packed/teddy/mod.rs
new file mode 100644
index 0000000..b896023
--- /dev/null
+++ b/src/packed/teddy/mod.rs
@@ -0,0 +1,62 @@
+#[cfg(target_arch = "x86_64")]
+pub use packed::teddy::compile::Builder;
+#[cfg(not(target_arch = "x86_64"))]
+pub use packed::teddy::fallback::Builder;
+#[cfg(not(target_arch = "x86_64"))]
+pub use packed::teddy::fallback::Teddy;
+#[cfg(target_arch = "x86_64")]
+pub use packed::teddy::runtime::Teddy;
+
+#[cfg(target_arch = "x86_64")]
+mod compile;
+#[cfg(target_arch = "x86_64")]
+mod runtime;
+
+#[cfg(not(target_arch = "x86_64"))]
+mod fallback {
+ use packed::pattern::Patterns;
+ use Match;
+
+ #[derive(Clone, Debug, Default)]
+ pub struct Builder(());
+
+ impl Builder {
+ pub fn new() -> Builder {
+ Builder(())
+ }
+
+ pub fn build(&self, _: &Patterns) -> Option<Teddy> {
+ None
+ }
+
+ pub fn fat(&mut self, _: Option<bool>) -> &mut Builder {
+ self
+ }
+
+ pub fn avx(&mut self, _: Option<bool>) -> &mut Builder {
+ self
+ }
+ }
+
+ #[derive(Clone, Debug)]
+ pub struct Teddy(());
+
+ impl Teddy {
+ pub fn find_at(
+ &self,
+ _: &Patterns,
+ _: &[u8],
+ _: usize,
+ ) -> Option<Match> {
+ None
+ }
+
+ pub fn minimum_len(&self) -> usize {
+ 0
+ }
+
+ pub fn heap_bytes(&self) -> usize {
+ 0
+ }
+ }
+}
diff --git a/src/packed/teddy/runtime.rs b/src/packed/teddy/runtime.rs
new file mode 100644
index 0000000..a736948
--- /dev/null
+++ b/src/packed/teddy/runtime.rs
@@ -0,0 +1,1204 @@
+// See the README in this directory for an explanation of the Teddy algorithm.
+// It is strongly recommended to peruse the README before trying to grok this
+// code, as its use of SIMD is pretty opaque, although I tried to add comments
+// where appropriate.
+//
+// Moreover, while there is a lot of code in this file, most of it is
+// repeated variants of the same thing. Specifically, there are three Teddy
+// variants: Slim 128-bit Teddy (8 buckets), Slim 256-bit Teddy (8 buckets)
+// and Fat 256-bit Teddy (16 buckets). For each variant, there are three
+// implementations, corresponding to mask lengths of 1, 2 and 3, bringing the
+// total to nine variants. Each one is structured roughly the same:
+//
+// while at <= len(haystack) - CHUNK_SIZE:
+// let candidate = find_candidate_in_chunk(haystack, at)
+// if not all zeroes(candidate):
+// if match = verify(haystack, at, candidate):
+// return match
+//
+// For the most part, this remains unchanged. The parts that vary are the
+// verification routine (for slim vs fat Teddy) and the candidate extraction
+// (based on the number of masks).
+//
+// In the code below, a "candidate" corresponds to a single vector with 8-bit
+// lanes. Each lane is itself an 8-bit bitset, where the ith bit is set in the
+// jth lane if and only if the byte occurring at position `j` is in the
+// bucket `i` (where the `j`th position is the position in the current window
+// of the haystack, which is always 16 or 32 bytes). Note: be careful here, as
+// the ith bit and the jth lane correspond to the least significant bits of the
+// vector. So when visualizing how the current window of bytes is stored in a
+// vector, you often need to flip it around. For example, the text `abcd` in a
+// 4-byte vector would look like this:
+//
+// 01100100 01100011 01100010 01100001
+// d c b a
+//
+// When the mask length is 1, finding the candidate is straightforward: you
+// just apply the shuffle indices (from the haystack window) to
+// the masks, and then AND them together, as described in the README. But for
+// masks of length 2 and 3, you need to keep a little state. Specifically,
+// you need to store the final 1 (for mask length 2) or 2 (for mask length 3)
+// bytes of the candidate for use when searching the next window. This is for
+// handling matches that span two windows.
+//
+// With respect to the repeated code, it would likely be possible to reduce
+// the number of copies of code below using polymorphism, but I find this
+// formulation clearer instead of needing to reason through generics. However,
+// I admit, there may be a simpler generic construction that I'm missing.
+//
+// All variants are fairly heavily tested in src/packed/tests.rs.
+
+use std::arch::x86_64::*;
+use std::mem;
+
+use packed::pattern::{PatternID, Patterns};
+use packed::teddy::compile;
+use packed::vector::*;
+use Match;
+
+/// The Teddy runtime.
+///
+/// A Teddy runtime can be used to quickly search for occurrences of one or
+/// more patterns. While it does not scale to an arbitrary number of patterns
+/// like Aho-Corasick, it does find occurrences for a small set of patterns
+/// much more quickly than Aho-Corasick.
+///
+/// Teddy cannot run on small haystacks below a certain size, which is
+/// dependent on the type of matcher used. This size can be queried via the
+/// `minimum_len` method. Violating this will result in a panic.
+///
+/// Finally, when callers use a Teddy runtime, they must provide precisely the
+/// patterns used to construct the Teddy matcher. Violating this will result
+/// in either a panic or incorrect results, but will never sacrifice memory
+/// safety.
+#[derive(Clone, Debug)]
+pub struct Teddy {
+ /// The allocation of patterns in buckets. This only contains the IDs of
+ /// patterns. In order to do full verification, callers must provide the
+ /// actual patterns when using Teddy.
+ pub buckets: Vec<Vec<PatternID>>,
+ /// The maximum identifier of a pattern. This is used as a sanity check to
+ /// ensure that the patterns provided by the caller are the same as the
+ /// patterns that were used to compile the matcher. This sanity check
+ /// permits safely eliminating bounds checks regardless of what patterns
+ /// are provided by the caller.
+ ///
+ /// Note that users of the aho-corasick crate cannot get this wrong. Only
+    /// code internal to this crate can get it wrong, since neither the
+    /// `Patterns` type nor the Teddy runtime are public API items.
+ pub max_pattern_id: PatternID,
+ /// The actual runtime to use.
+ pub exec: Exec,
+}
+
+impl Teddy {
+ /// Return the first occurrence of a match in the given haystack after or
+ /// starting at `at`.
+ ///
+ /// The patterns provided must be precisely the same patterns given to the
+ /// Teddy builder, otherwise this may panic or produce incorrect results.
+ ///
+ /// All matches are consistent with the match semantics (leftmost-first or
+ /// leftmost-longest) set on `pats`.
+ pub fn find_at(
+ &self,
+ pats: &Patterns,
+ haystack: &[u8],
+ at: usize,
+ ) -> Option<Match> {
+ // This assert is a bit subtle, but it's an important guarantee.
+ // Namely, if the maximum pattern ID seen by Teddy is the same as the
+ // one in the patterns given, then we are guaranteed that every pattern
+ // ID in all Teddy buckets are valid indices into `pats`. While this
+ // is nominally true, there is no guarantee that callers provide the
+ // same `pats` to both the Teddy builder and the searcher, which would
+ // otherwise make `find_at` unsafe to call. But this assert lets us
+ // keep this routine safe and eliminate an important bounds check in
+ // verification.
+ assert_eq!(
+ self.max_pattern_id,
+ pats.max_pattern_id(),
+ "teddy must be called with same patterns it was built with",
+ );
+ // SAFETY: The haystack must have at least a minimum number of bytes
+ // for Teddy to be able to work. The minimum number varies depending on
+ // which matcher is used below. If this is violated, then it's possible
+ // for searching to do out-of-bounds writes.
+ assert!(haystack[at..].len() >= self.minimum_len());
+ // SAFETY: The various Teddy matchers are always safe to call because
+ // the Teddy builder guarantees that a particular Exec variant is
+        // built only when it can be run on the current CPU. That is, the Teddy
+        // builder will not produce an Exec::TeddySlim1Mask256 unless AVX2 is
+ // enabled. That is, our dynamic CPU feature detection is performed
+ // once in the builder, and we rely on the type system to avoid needing
+ // to do it again.
+ unsafe {
+ match self.exec {
+ Exec::TeddySlim1Mask128(ref e) => {
+ e.find_at(pats, self, haystack, at)
+ }
+ Exec::TeddySlim1Mask256(ref e) => {
+ e.find_at(pats, self, haystack, at)
+ }
+ Exec::TeddyFat1Mask256(ref e) => {
+ e.find_at(pats, self, haystack, at)
+ }
+ Exec::TeddySlim2Mask128(ref e) => {
+ e.find_at(pats, self, haystack, at)
+ }
+ Exec::TeddySlim2Mask256(ref e) => {
+ e.find_at(pats, self, haystack, at)
+ }
+ Exec::TeddyFat2Mask256(ref e) => {
+ e.find_at(pats, self, haystack, at)
+ }
+ Exec::TeddySlim3Mask128(ref e) => {
+ e.find_at(pats, self, haystack, at)
+ }
+ Exec::TeddySlim3Mask256(ref e) => {
+ e.find_at(pats, self, haystack, at)
+ }
+ Exec::TeddyFat3Mask256(ref e) => {
+ e.find_at(pats, self, haystack, at)
+ }
+ }
+ }
+ }
+
+ /// Returns the minimum length of a haystack that must be provided by
+ /// callers to this Teddy searcher. Providing a haystack shorter than this
+ /// will result in a panic, but will never violate memory safety.
+ pub fn minimum_len(&self) -> usize {
+ // SAFETY: These values must be correct in order to ensure safety.
+        // The Teddy runtime assumes its haystacks have at least these
+ // lengths. Violating this will sacrifice memory safety.
+ match self.exec {
+ Exec::TeddySlim1Mask128(_) => 16,
+ Exec::TeddySlim1Mask256(_) => 32,
+ Exec::TeddyFat1Mask256(_) => 16,
+ Exec::TeddySlim2Mask128(_) => 17,
+ Exec::TeddySlim2Mask256(_) => 33,
+ Exec::TeddyFat2Mask256(_) => 17,
+ Exec::TeddySlim3Mask128(_) => 18,
+ Exec::TeddySlim3Mask256(_) => 34,
+ Exec::TeddyFat3Mask256(_) => 34,
+ }
+ }
+
+ /// Returns the approximate total amount of heap used by this searcher, in
+ /// units of bytes.
+ pub fn heap_bytes(&self) -> usize {
+ let num_patterns = self.max_pattern_id as usize + 1;
+ self.buckets.len() * mem::size_of::<Vec<PatternID>>()
+ + num_patterns * mem::size_of::<PatternID>()
+ }
+
+ /// Runs the verification routine for Slim 128-bit Teddy.
+ ///
+ /// The candidate given should be a collection of 8-bit bitsets (one bitset
+ /// per lane), where the ith bit is set in the jth lane if and only if the
+ /// byte occurring at `at + j` in `haystack` is in the bucket `i`.
+ ///
+ /// This is not safe to call unless the SSSE3 target feature is enabled.
+ /// The `target_feature` attribute is not applied since this function is
+ /// always forcefully inlined.
+ #[inline(always)]
+ unsafe fn verify128(
+ &self,
+ pats: &Patterns,
+ haystack: &[u8],
+ at: usize,
+ cand: __m128i,
+ ) -> Option<Match> {
+ debug_assert!(!is_all_zeroes128(cand));
+ debug_assert_eq!(8, self.buckets.len());
+
+ // Convert the candidate into 64-bit chunks, and then verify each of
+ // those chunks.
+ let parts = unpack64x128(cand);
+ for (i, &part) in parts.iter().enumerate() {
+ let pos = at + i * 8;
+ if let Some(m) = self.verify64(pats, 8, haystack, pos, part) {
+ return Some(m);
+ }
+ }
+ None
+ }
+
+ /// Runs the verification routine for Slim 256-bit Teddy.
+ ///
+ /// The candidate given should be a collection of 8-bit bitsets (one bitset
+ /// per lane), where the ith bit is set in the jth lane if and only if the
+ /// byte occurring at `at + j` in `haystack` is in the bucket `i`.
+ ///
+ /// This is not safe to call unless the AVX2 target feature is enabled.
+ /// The `target_feature` attribute is not applied since this function is
+ /// always forcefully inlined.
+ #[inline(always)]
+ unsafe fn verify256(
+ &self,
+ pats: &Patterns,
+ haystack: &[u8],
+ at: usize,
+ cand: __m256i,
+ ) -> Option<Match> {
+ debug_assert!(!is_all_zeroes256(cand));
+ debug_assert_eq!(8, self.buckets.len());
+
+ // Convert the candidate into 64-bit chunks, and then verify each of
+ // those chunks.
+ let parts = unpack64x256(cand);
+ for (i, &part) in parts.iter().enumerate() {
+ let pos = at + i * 8;
+ if let Some(m) = self.verify64(pats, 8, haystack, pos, part) {
+ return Some(m);
+ }
+ }
+ None
+ }
+
+ /// Runs the verification routine for Fat 256-bit Teddy.
+ ///
+ /// The candidate given should be a collection of 8-bit bitsets (one bitset
+ /// per lane), where the ith bit is set in the jth lane if and only if the
+ /// byte occurring at `at + (j < 16 ? j : j - 16)` in `haystack` is in the
+ /// bucket `j < 16 ? i : i + 8`.
+ ///
+ /// This is not safe to call unless the AVX2 target feature is enabled.
+ /// The `target_feature` attribute is not applied since this function is
+ /// always forcefully inlined.
+ #[inline(always)]
+ unsafe fn verify_fat256(
+ &self,
+ pats: &Patterns,
+ haystack: &[u8],
+ at: usize,
+ cand: __m256i,
+ ) -> Option<Match> {
+ debug_assert!(!is_all_zeroes256(cand));
+ debug_assert_eq!(16, self.buckets.len());
+
+ // This is a bit tricky, but we basically want to convert our
+ // candidate, which looks like this
+ //
+ // a31 a30 ... a17 a16 a15 a14 ... a01 a00
+ //
+ // where each a(i) is an 8-bit bitset corresponding to the activated
+ // buckets, to this
+ //
+ // a31 a15 a30 a14 a29 a13 ... a18 a02 a17 a01 a16 a00
+ //
+ // Namely, for Fat Teddy, the high 128-bits of the candidate correspond
+ // to the same bytes in the haystack in the low 128-bits (so we only
+ // scan 16 bytes at a time), but are for buckets 8-15 instead of 0-7.
+ //
+ // The verification routine wants to look at all potentially matching
+        // buckets before moving on to the next lane. So, for example, a16
+        // and a00 both correspond to the first byte in our window; a00
+ // contains buckets 0-7 and a16 contains buckets 8-15. Specifically,
+ // a16 should be checked before a01. So the transformation shown above
+ // allows us to use our normal verification procedure with one small
+ // change: we treat each bitset as 16 bits instead of 8 bits.
+
+ // Swap the 128-bit lanes in the candidate vector.
+ let swap = _mm256_permute4x64_epi64(cand, 0x4E);
+ // Interleave the bytes from the low 128-bit lanes, starting with
+ // cand first.
+ let r1 = _mm256_unpacklo_epi8(cand, swap);
+ // Interleave the bytes from the high 128-bit lanes, starting with
+ // cand first.
+ let r2 = _mm256_unpackhi_epi8(cand, swap);
+ // Now just take the 2 low 64-bit integers from both r1 and r2. We
+ // can drop the high 64-bit integers because they are a mirror image
+ // of the low 64-bit integers. All we care about are the low 128-bit
+ // lanes of r1 and r2. Combined, they contain all our 16-bit bitsets
+ // laid out in the desired order, as described above.
+ let parts = unpacklo64x256(r1, r2);
+ for (i, &part) in parts.iter().enumerate() {
+ let pos = at + i * 4;
+ if let Some(m) = self.verify64(pats, 16, haystack, pos, part) {
+ return Some(m);
+ }
+ }
+ None
+ }
+
+ /// Verify whether there are any matches starting at or after `at` in the
+ /// given `haystack`. The candidate given should correspond to either 8-bit
+ /// (for 8 buckets) or 16-bit (16 buckets) bitsets.
+ #[inline(always)]
+ fn verify64(
+ &self,
+ pats: &Patterns,
+ bucket_count: usize,
+ haystack: &[u8],
+ at: usize,
+ mut cand: u64,
+ ) -> Option<Match> {
+ // N.B. While the bucket count is known from self.buckets.len(),
+ // requiring it as a parameter makes it easier for the optimizer to
+ // know its value, and thus produce more efficient codegen.
+ debug_assert!(bucket_count == 8 || bucket_count == 16);
+ while cand != 0 {
+ let bit = cand.trailing_zeros() as usize;
+ cand &= !(1 << bit);
+
+ let at = at + (bit / bucket_count);
+ let bucket = bit % bucket_count;
+ if let Some(m) = self.verify_bucket(pats, haystack, bucket, at) {
+ return Some(m);
+ }
+ }
+ None
+ }
+
+ /// Verify whether there are any matches starting at `at` in the given
+ /// `haystack` corresponding only to patterns in the given bucket.
+ #[inline(always)]
+ fn verify_bucket(
+ &self,
+ pats: &Patterns,
+ haystack: &[u8],
+ bucket: usize,
+ at: usize,
+ ) -> Option<Match> {
+ // Forcing this function to not inline and be "cold" seems to help
+ // the codegen for Teddy overall. Interestingly, this is good for a
+ // 16% boost in the sherlock/packed/teddy/name/alt1 benchmark (among
+ // others). Overall, this seems like a problem with codegen, since
+ // creating the Match itself is a very small amount of code.
+ #[cold]
+ #[inline(never)]
+ fn match_from_span(
+ pati: PatternID,
+ start: usize,
+ end: usize,
+ ) -> Match {
+ Match::from_span(pati as usize, start, end)
+ }
+
+ // N.B. The bounds check for this bucket lookup *should* be elided
+ // since we assert the number of buckets in each `find_at` routine,
+ // and the compiler can prove that the `% 8` (or `% 16`) in callers
+ // of this routine will always be in bounds.
+ for &pati in &self.buckets[bucket] {
+ // SAFETY: This is safe because we are guaranteed that every
+ // index in a Teddy bucket is a valid index into `pats`. This
+ // guarantee is upheld by the assert checking `max_pattern_id` in
+ // the beginning of `find_at` above.
+ //
+ // This explicit bounds check elision is (amazingly) good for a
+ // 25-50% boost in some benchmarks, particularly ones with a lot
+ // of short literals.
+ let pat = unsafe { pats.get_unchecked(pati) };
+ if pat.is_prefix(&haystack[at..]) {
+ return Some(match_from_span(pati, at, at + pat.len()));
+ }
+ }
+ None
+ }
+}
+
+/// Exec represents the different search strategies supported by the Teddy
+/// runtime.
+///
+/// This enum is an important safety abstraction. Namely, callers should only
+/// construct a variant in this enum if it is safe to execute its corresponding
+/// target features on the current CPU. The 128-bit searchers require SSSE3,
+/// while the 256-bit searchers require AVX2.
+#[derive(Clone, Debug)]
+pub enum Exec {
+ TeddySlim1Mask128(TeddySlim1Mask128),
+ TeddySlim1Mask256(TeddySlim1Mask256),
+ TeddyFat1Mask256(TeddyFat1Mask256),
+ TeddySlim2Mask128(TeddySlim2Mask128),
+ TeddySlim2Mask256(TeddySlim2Mask256),
+ TeddyFat2Mask256(TeddyFat2Mask256),
+ TeddySlim3Mask128(TeddySlim3Mask128),
+ TeddySlim3Mask256(TeddySlim3Mask256),
+ TeddyFat3Mask256(TeddyFat3Mask256),
+}
+
+// Most of the code below remains undocumented because it consists of
+// effectively repeated variants of the same routines. The general structure
+// is described in the
+// README and in the comments above.
+
+#[derive(Clone, Debug)]
+pub struct TeddySlim1Mask128 {
+ pub mask1: Mask128,
+}
+
+impl TeddySlim1Mask128 {
+ #[target_feature(enable = "ssse3")]
+ unsafe fn find_at(
+ &self,
+ pats: &Patterns,
+ teddy: &Teddy,
+ haystack: &[u8],
+ mut at: usize,
+ ) -> Option<Match> {
+ debug_assert!(haystack[at..].len() >= teddy.minimum_len());
+ // This assert helps eliminate bounds checks for bucket lookups in
+ // Teddy::verify_bucket, which has a small (3-4%) performance boost.
+ assert_eq!(8, teddy.buckets.len());
+
+ let len = haystack.len();
+ while at <= len - 16 {
+ let c = self.candidate(haystack, at);
+ if !is_all_zeroes128(c) {
+ if let Some(m) = teddy.verify128(pats, haystack, at, c) {
+ return Some(m);
+ }
+ }
+ at += 16;
+ }
+ if at < len {
+ at = len - 16;
+ let c = self.candidate(haystack, at);
+ if !is_all_zeroes128(c) {
+ if let Some(m) = teddy.verify128(pats, haystack, at, c) {
+ return Some(m);
+ }
+ }
+ }
+ None
+ }
+
+ #[inline(always)]
+ unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m128i {
+ debug_assert!(haystack[at..].len() >= 16);
+
+ let chunk = loadu128(haystack, at);
+ members1m128(chunk, self.mask1)
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct TeddySlim1Mask256 {
+ pub mask1: Mask256,
+}
+
+impl TeddySlim1Mask256 {
+ #[target_feature(enable = "avx2")]
+ unsafe fn find_at(
+ &self,
+ pats: &Patterns,
+ teddy: &Teddy,
+ haystack: &[u8],
+ mut at: usize,
+ ) -> Option<Match> {
+ debug_assert!(haystack[at..].len() >= teddy.minimum_len());
+ // This assert helps eliminate bounds checks for bucket lookups in
+ // Teddy::verify_bucket, which has a small (3-4%) performance boost.
+ assert_eq!(8, teddy.buckets.len());
+
+ let len = haystack.len();
+ while at <= len - 32 {
+ let c = self.candidate(haystack, at);
+ if !is_all_zeroes256(c) {
+ if let Some(m) = teddy.verify256(pats, haystack, at, c) {
+ return Some(m);
+ }
+ }
+ at += 32;
+ }
+ if at < len {
+ at = len - 32;
+ let c = self.candidate(haystack, at);
+ if !is_all_zeroes256(c) {
+ if let Some(m) = teddy.verify256(pats, haystack, at, c) {
+ return Some(m);
+ }
+ }
+ }
+ None
+ }
+
+ #[inline(always)]
+ unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m256i {
+ debug_assert!(haystack[at..].len() >= 32);
+
+ let chunk = loadu256(haystack, at);
+ members1m256(chunk, self.mask1)
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct TeddyFat1Mask256 {
+ pub mask1: Mask256,
+}
+
+impl TeddyFat1Mask256 {
+ #[target_feature(enable = "avx2")]
+ unsafe fn find_at(
+ &self,
+ pats: &Patterns,
+ teddy: &Teddy,
+ haystack: &[u8],
+ mut at: usize,
+ ) -> Option<Match> {
+ debug_assert!(haystack[at..].len() >= teddy.minimum_len());
+ // This assert helps eliminate bounds checks for bucket lookups in
+ // Teddy::verify_bucket, which has a small (3-4%) performance boost.
+ assert_eq!(16, teddy.buckets.len());
+
+ let len = haystack.len();
+ while at <= len - 16 {
+ let c = self.candidate(haystack, at);
+ if !is_all_zeroes256(c) {
+ if let Some(m) = teddy.verify_fat256(pats, haystack, at, c) {
+ return Some(m);
+ }
+ }
+ at += 16;
+ }
+ if at < len {
+ at = len - 16;
+ let c = self.candidate(haystack, at);
+ if !is_all_zeroes256(c) {
+ if let Some(m) = teddy.verify_fat256(pats, haystack, at, c) {
+ return Some(m);
+ }
+ }
+ }
+ None
+ }
+
+ #[inline(always)]
+ unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m256i {
+ debug_assert!(haystack[at..].len() >= 16);
+
+ let chunk = _mm256_broadcastsi128_si256(loadu128(haystack, at));
+ members1m256(chunk, self.mask1)
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct TeddySlim2Mask128 {
+ pub mask1: Mask128,
+ pub mask2: Mask128,
+}
+
+impl TeddySlim2Mask128 {
+ #[target_feature(enable = "ssse3")]
+ unsafe fn find_at(
+ &self,
+ pats: &Patterns,
+ teddy: &Teddy,
+ haystack: &[u8],
+ mut at: usize,
+ ) -> Option<Match> {
+ debug_assert!(haystack[at..].len() >= teddy.minimum_len());
+ // This assert helps eliminate bounds checks for bucket lookups in
+ // Teddy::verify_bucket, which has a small (3-4%) performance boost.
+ assert_eq!(8, teddy.buckets.len());
+
+ at += 1;
+ let len = haystack.len();
+ let mut prev0 = ones128();
+ while at <= len - 16 {
+ let c = self.candidate(haystack, at, &mut prev0);
+ if !is_all_zeroes128(c) {
+ if let Some(m) = teddy.verify128(pats, haystack, at - 1, c) {
+ return Some(m);
+ }
+ }
+ at += 16;
+ }
+ if at < len {
+ at = len - 16;
+ prev0 = ones128();
+
+ let c = self.candidate(haystack, at, &mut prev0);
+ if !is_all_zeroes128(c) {
+ if let Some(m) = teddy.verify128(pats, haystack, at - 1, c) {
+ return Some(m);
+ }
+ }
+ }
+ None
+ }
+
+ #[inline(always)]
+ unsafe fn candidate(
+ &self,
+ haystack: &[u8],
+ at: usize,
+ prev0: &mut __m128i,
+ ) -> __m128i {
+ debug_assert!(haystack[at..].len() >= 16);
+
+ let chunk = loadu128(haystack, at);
+ let (res0, res1) = members2m128(chunk, self.mask1, self.mask2);
+        let res0prev0 = _mm_alignr_epi8(res0, *prev0, 15);
+        let res = _mm_and_si128(res0prev0, res1);
+        // Save this window's first-byte candidates so that the next window
+        // can align against them (as the 256-bit variants below do).
+        *prev0 = res0;
+        res
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct TeddySlim2Mask256 {
+ pub mask1: Mask256,
+ pub mask2: Mask256,
+}
+
+impl TeddySlim2Mask256 {
+ #[target_feature(enable = "avx2")]
+ unsafe fn find_at(
+ &self,
+ pats: &Patterns,
+ teddy: &Teddy,
+ haystack: &[u8],
+ mut at: usize,
+ ) -> Option<Match> {
+ debug_assert!(haystack[at..].len() >= teddy.minimum_len());
+ // This assert helps eliminate bounds checks for bucket lookups in
+ // Teddy::verify_bucket, which has a small (3-4%) performance boost.
+ assert_eq!(8, teddy.buckets.len());
+
+ at += 1;
+ let len = haystack.len();
+ let mut prev0 = ones256();
+ while at <= len - 32 {
+ let c = self.candidate(haystack, at, &mut prev0);
+ if !is_all_zeroes256(c) {
+ if let Some(m) = teddy.verify256(pats, haystack, at - 1, c) {
+ return Some(m);
+ }
+ }
+ at += 32;
+ }
+ if at < len {
+ at = len - 32;
+ prev0 = ones256();
+
+ let c = self.candidate(haystack, at, &mut prev0);
+ if !is_all_zeroes256(c) {
+ if let Some(m) = teddy.verify256(pats, haystack, at - 1, c) {
+ return Some(m);
+ }
+ }
+ }
+ None
+ }
+
+ #[inline(always)]
+ unsafe fn candidate(
+ &self,
+ haystack: &[u8],
+ at: usize,
+ prev0: &mut __m256i,
+ ) -> __m256i {
+ debug_assert!(haystack[at..].len() >= 32);
+
+ let chunk = loadu256(haystack, at);
+ let (res0, res1) = members2m256(chunk, self.mask1, self.mask2);
+ let res0prev0 = alignr256_15(res0, *prev0);
+ let res = _mm256_and_si256(res0prev0, res1);
+ *prev0 = res0;
+ res
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct TeddyFat2Mask256 {
+ pub mask1: Mask256,
+ pub mask2: Mask256,
+}
+
+impl TeddyFat2Mask256 {
+ #[target_feature(enable = "avx2")]
+ unsafe fn find_at(
+ &self,
+ pats: &Patterns,
+ teddy: &Teddy,
+ haystack: &[u8],
+ mut at: usize,
+ ) -> Option<Match> {
+ debug_assert!(haystack[at..].len() >= teddy.minimum_len());
+ // This assert helps eliminate bounds checks for bucket lookups in
+ // Teddy::verify_bucket, which has a small (3-4%) performance boost.
+ assert_eq!(16, teddy.buckets.len());
+
+ at += 1;
+ let len = haystack.len();
+ let mut prev0 = ones256();
+ while at <= len - 16 {
+ let c = self.candidate(haystack, at, &mut prev0);
+ if !is_all_zeroes256(c) {
+ if let Some(m) = teddy.verify_fat256(pats, haystack, at - 1, c)
+ {
+ return Some(m);
+ }
+ }
+ at += 16;
+ }
+ if at < len {
+ at = len - 16;
+ prev0 = ones256();
+
+ let c = self.candidate(haystack, at, &mut prev0);
+ if !is_all_zeroes256(c) {
+ if let Some(m) = teddy.verify_fat256(pats, haystack, at - 1, c)
+ {
+ return Some(m);
+ }
+ }
+ }
+ None
+ }
+
+ #[inline(always)]
+ unsafe fn candidate(
+ &self,
+ haystack: &[u8],
+ at: usize,
+ prev0: &mut __m256i,
+ ) -> __m256i {
+ debug_assert!(haystack[at..].len() >= 16);
+
+ let chunk = _mm256_broadcastsi128_si256(loadu128(haystack, at));
+ let (res0, res1) = members2m256(chunk, self.mask1, self.mask2);
+ let res0prev0 = _mm256_alignr_epi8(res0, *prev0, 15);
+ let res = _mm256_and_si256(res0prev0, res1);
+ *prev0 = res0;
+ res
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct TeddySlim3Mask128 {
+ pub mask1: Mask128,
+ pub mask2: Mask128,
+ pub mask3: Mask128,
+}
+
+impl TeddySlim3Mask128 {
+ #[target_feature(enable = "ssse3")]
+ unsafe fn find_at(
+ &self,
+ pats: &Patterns,
+ teddy: &Teddy,
+ haystack: &[u8],
+ mut at: usize,
+ ) -> Option<Match> {
+ debug_assert!(haystack[at..].len() >= teddy.minimum_len());
+ // This assert helps eliminate bounds checks for bucket lookups in
+ // Teddy::verify_bucket, which has a small (3-4%) performance boost.
+ assert_eq!(8, teddy.buckets.len());
+
+ at += 2;
+ let len = haystack.len();
+ let (mut prev0, mut prev1) = (ones128(), ones128());
+ while at <= len - 16 {
+ let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
+ if !is_all_zeroes128(c) {
+ if let Some(m) = teddy.verify128(pats, haystack, at - 2, c) {
+ return Some(m);
+ }
+ }
+ at += 16;
+ }
+ if at < len {
+ at = len - 16;
+ prev0 = ones128();
+ prev1 = ones128();
+
+ let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
+ if !is_all_zeroes128(c) {
+ if let Some(m) = teddy.verify128(pats, haystack, at - 2, c) {
+ return Some(m);
+ }
+ }
+ }
+ None
+ }
+
+ #[inline(always)]
+ unsafe fn candidate(
+ &self,
+ haystack: &[u8],
+ at: usize,
+ prev0: &mut __m128i,
+ prev1: &mut __m128i,
+ ) -> __m128i {
+ debug_assert!(haystack[at..].len() >= 16);
+
+ let chunk = loadu128(haystack, at);
+ let (res0, res1, res2) =
+ members3m128(chunk, self.mask1, self.mask2, self.mask3);
+ let res0prev0 = _mm_alignr_epi8(res0, *prev0, 14);
+ let res1prev1 = _mm_alignr_epi8(res1, *prev1, 15);
+ let res = _mm_and_si128(_mm_and_si128(res0prev0, res1prev1), res2);
+ *prev0 = res0;
+ *prev1 = res1;
+ res
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct TeddySlim3Mask256 {
+ pub mask1: Mask256,
+ pub mask2: Mask256,
+ pub mask3: Mask256,
+}
+
+impl TeddySlim3Mask256 {
+ #[target_feature(enable = "avx2")]
+ unsafe fn find_at(
+ &self,
+ pats: &Patterns,
+ teddy: &Teddy,
+ haystack: &[u8],
+ mut at: usize,
+ ) -> Option<Match> {
+ debug_assert!(haystack[at..].len() >= teddy.minimum_len());
+ // This assert helps eliminate bounds checks for bucket lookups in
+ // Teddy::verify_bucket, which has a small (3-4%) performance boost.
+ assert_eq!(8, teddy.buckets.len());
+
+ at += 2;
+ let len = haystack.len();
+ let (mut prev0, mut prev1) = (ones256(), ones256());
+ while at <= len - 32 {
+ let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
+ if !is_all_zeroes256(c) {
+ if let Some(m) = teddy.verify256(pats, haystack, at - 2, c) {
+ return Some(m);
+ }
+ }
+ at += 32;
+ }
+ if at < len {
+ at = len - 32;
+ prev0 = ones256();
+ prev1 = ones256();
+
+ let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
+ if !is_all_zeroes256(c) {
+ if let Some(m) = teddy.verify256(pats, haystack, at - 2, c) {
+ return Some(m);
+ }
+ }
+ }
+ None
+ }
+
+ #[inline(always)]
+ unsafe fn candidate(
+ &self,
+ haystack: &[u8],
+ at: usize,
+ prev0: &mut __m256i,
+ prev1: &mut __m256i,
+ ) -> __m256i {
+ debug_assert!(haystack[at..].len() >= 32);
+
+ let chunk = loadu256(haystack, at);
+ let (res0, res1, res2) =
+ members3m256(chunk, self.mask1, self.mask2, self.mask3);
+ let res0prev0 = alignr256_14(res0, *prev0);
+ let res1prev1 = alignr256_15(res1, *prev1);
+ let res =
+ _mm256_and_si256(_mm256_and_si256(res0prev0, res1prev1), res2);
+ *prev0 = res0;
+ *prev1 = res1;
+ res
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct TeddyFat3Mask256 {
+ pub mask1: Mask256,
+ pub mask2: Mask256,
+ pub mask3: Mask256,
+}
+
+impl TeddyFat3Mask256 {
+ #[target_feature(enable = "avx2")]
+ unsafe fn find_at(
+ &self,
+ pats: &Patterns,
+ teddy: &Teddy,
+ haystack: &[u8],
+ mut at: usize,
+ ) -> Option<Match> {
+ debug_assert!(haystack[at..].len() >= teddy.minimum_len());
+ // This assert helps eliminate bounds checks for bucket lookups in
+ // Teddy::verify_bucket, which has a small (3-4%) performance boost.
+ assert_eq!(16, teddy.buckets.len());
+
+ at += 2;
+ let len = haystack.len();
+ let (mut prev0, mut prev1) = (ones256(), ones256());
+ while at <= len - 16 {
+ let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
+ if !is_all_zeroes256(c) {
+ if let Some(m) = teddy.verify_fat256(pats, haystack, at - 2, c)
+ {
+ return Some(m);
+ }
+ }
+ at += 16;
+ }
+ if at < len {
+ at = len - 16;
+ prev0 = ones256();
+ prev1 = ones256();
+
+ let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
+ if !is_all_zeroes256(c) {
+ if let Some(m) = teddy.verify_fat256(pats, haystack, at - 2, c)
+ {
+ return Some(m);
+ }
+ }
+ }
+ None
+ }
+
+ #[inline(always)]
+ unsafe fn candidate(
+ &self,
+ haystack: &[u8],
+ at: usize,
+ prev0: &mut __m256i,
+ prev1: &mut __m256i,
+ ) -> __m256i {
+ debug_assert!(haystack[at..].len() >= 16);
+
+ let chunk = _mm256_broadcastsi128_si256(loadu128(haystack, at));
+ let (res0, res1, res2) =
+ members3m256(chunk, self.mask1, self.mask2, self.mask3);
+ let res0prev0 = _mm256_alignr_epi8(res0, *prev0, 14);
+ let res1prev1 = _mm256_alignr_epi8(res1, *prev1, 15);
+ let res =
+ _mm256_and_si256(_mm256_and_si256(res0prev0, res1prev1), res2);
+ *prev0 = res0;
+ *prev1 = res1;
+ res
+ }
+}
+
+/// A 128-bit mask for the low and high nybbles in a set of patterns. Each
+/// lane `j` corresponds to a bitset where the `i`th bit is set if and only if
+/// the nybble `j` is in the bucket `i` at a particular position.
+#[derive(Clone, Copy, Debug)]
+pub struct Mask128 {
+ lo: __m128i,
+ hi: __m128i,
+}
+
+impl Mask128 {
+ /// Create a new SIMD mask from the mask produced by the Teddy builder.
+ pub fn new(mask: compile::Mask) -> Mask128 {
+ // SAFETY: This is safe since [u8; 16] has the same representation
+ // as __m128i.
+ unsafe {
+ Mask128 {
+ lo: mem::transmute(mask.lo128()),
+ hi: mem::transmute(mask.hi128()),
+ }
+ }
+ }
+}
+
+/// A 256-bit mask for the low and high nybbles in a set of patterns. Each
+/// lane `j` corresponds to a bitset where the `i`th bit is set if and only if
+/// the nybble `j` is in the bucket `i` at a particular position.
+///
+/// This is slightly tweaked depending on whether Slim or Fat Teddy is being
+/// used. For Slim Teddy, the bitsets in the lower 128-bits are the same as
+/// the bitsets in the higher 128-bits, so that we can search 32 bytes at a
+/// time. (Remember, the nybbles in the haystack are used as indices into these
+/// masks, and 256-bit shuffles only operate on 128-bit lanes.)
+///
+/// For Fat Teddy, the bitsets are not repeated, but instead, the high 128
+/// bits correspond to buckets 8-15. So a bitset `00100010` has buckets
+/// 1 and 5 set if it's in the lower 128 bits, but has buckets 9 and 13 set
+/// if it's in the higher 128 bits.
+#[derive(Clone, Copy, Debug)]
+pub struct Mask256 {
+ lo: __m256i,
+ hi: __m256i,
+}
+
+impl Mask256 {
+ /// Create a new SIMD mask from the mask produced by the Teddy builder.
+ pub fn new(mask: compile::Mask) -> Mask256 {
+ // SAFETY: This is safe since [u8; 32] has the same representation
+ // as __m256i.
+ unsafe {
+ Mask256 {
+ lo: mem::transmute(mask.lo256()),
+ hi: mem::transmute(mask.hi256()),
+ }
+ }
+ }
+}
+
+// The "members" routines below are responsible for taking a chunk of bytes,
+// a number of nybble masks and returning the result of using the masks to
+// lookup bytes in the chunk. The results of the high and low nybble masks are
+// AND'ed together, such that each candidate returned is a vector, with byte
+// sized lanes, and where each lane is an 8-bit bitset corresponding to the
+// buckets that contain the corresponding byte.
+//
+// In the case of masks of length greater than 1, callers will need to keep
+// the results from the previous haystack's window, and then shift the vectors
+// so that they all line up. Then they can be AND'ed together.
+
+/// Return a candidate for Slim 128-bit Teddy, where `chunk` corresponds to a
+/// 16-byte window of the haystack (where the least significant byte
+/// corresponds to the start of the window), and `mask1` corresponds to a
+/// low/high mask for the first byte of all patterns that are being searched.
+#[target_feature(enable = "ssse3")]
+unsafe fn members1m128(chunk: __m128i, mask1: Mask128) -> __m128i {
+ let lomask = _mm_set1_epi8(0xF);
+ let hlo = _mm_and_si128(chunk, lomask);
+ let hhi = _mm_and_si128(_mm_srli_epi16(chunk, 4), lomask);
+ _mm_and_si128(
+ _mm_shuffle_epi8(mask1.lo, hlo),
+ _mm_shuffle_epi8(mask1.hi, hhi),
+ )
+}
+
+/// Return a candidate for Slim 256-bit Teddy, where `chunk` corresponds to a
+/// 32-byte window of the haystack (where the least significant byte
+/// corresponds to the start of the window), and `mask1` corresponds to a
+/// low/high mask for the first byte of all patterns that are being searched.
+///
+/// Note that this can also be used for Fat Teddy, where the high 128 bits in
+/// `chunk` are the same as the low 128 bits, which corresponds to a 16 byte
+/// window in the haystack.
+#[target_feature(enable = "avx2")]
+unsafe fn members1m256(chunk: __m256i, mask1: Mask256) -> __m256i {
+ let lomask = _mm256_set1_epi8(0xF);
+ let hlo = _mm256_and_si256(chunk, lomask);
+ let hhi = _mm256_and_si256(_mm256_srli_epi16(chunk, 4), lomask);
+ _mm256_and_si256(
+ _mm256_shuffle_epi8(mask1.lo, hlo),
+ _mm256_shuffle_epi8(mask1.hi, hhi),
+ )
+}
+
+/// Return candidates for Slim 128-bit Teddy, where `chunk` corresponds
+/// to a 16-byte window of the haystack (where the least significant byte
+/// corresponds to the start of the window), and the masks correspond to a
+/// low/high mask for the first and second bytes of all patterns that are being
+/// searched. The vectors returned correspond to candidates for the first and
+/// second bytes in the patterns represented by the masks.
+#[target_feature(enable = "ssse3")]
+unsafe fn members2m128(
+ chunk: __m128i,
+ mask1: Mask128,
+ mask2: Mask128,
+) -> (__m128i, __m128i) {
+ let lomask = _mm_set1_epi8(0xF);
+ let hlo = _mm_and_si128(chunk, lomask);
+ let hhi = _mm_and_si128(_mm_srli_epi16(chunk, 4), lomask);
+ let res0 = _mm_and_si128(
+ _mm_shuffle_epi8(mask1.lo, hlo),
+ _mm_shuffle_epi8(mask1.hi, hhi),
+ );
+ let res1 = _mm_and_si128(
+ _mm_shuffle_epi8(mask2.lo, hlo),
+ _mm_shuffle_epi8(mask2.hi, hhi),
+ );
+ (res0, res1)
+}
+
+/// Return candidates for Slim 256-bit Teddy, where `chunk` corresponds
+/// to a 32-byte window of the haystack (where the least significant byte
+/// corresponds to the start of the window), and the masks correspond to a
+/// low/high mask for the first and second bytes of all patterns that are being
+/// searched. The vectors returned correspond to candidates for the first and
+/// second bytes in the patterns represented by the masks.
+///
+/// Note that this can also be used for Fat Teddy, where the high 128 bits in
+/// `chunk` are the same as the low 128 bits, which corresponds to a 16 byte
+/// window in the haystack.
+#[target_feature(enable = "avx2")]
+unsafe fn members2m256(
+ chunk: __m256i,
+ mask1: Mask256,
+ mask2: Mask256,
+) -> (__m256i, __m256i) {
+ let lomask = _mm256_set1_epi8(0xF);
+ let hlo = _mm256_and_si256(chunk, lomask);
+ let hhi = _mm256_and_si256(_mm256_srli_epi16(chunk, 4), lomask);
+ let res0 = _mm256_and_si256(
+ _mm256_shuffle_epi8(mask1.lo, hlo),
+ _mm256_shuffle_epi8(mask1.hi, hhi),
+ );
+ let res1 = _mm256_and_si256(
+ _mm256_shuffle_epi8(mask2.lo, hlo),
+ _mm256_shuffle_epi8(mask2.hi, hhi),
+ );
+ (res0, res1)
+}
+
+/// Return candidates for Slim 128-bit Teddy, where `chunk` corresponds
+/// to a 16-byte window of the haystack (where the least significant byte
+/// corresponds to the start of the window), and the masks correspond to a
+/// low/high mask for the first, second and third bytes of all patterns that
+/// are being searched. The vectors returned correspond to candidates for the
+/// first, second and third bytes in the patterns represented by the masks.
+#[target_feature(enable = "ssse3")]
+unsafe fn members3m128(
+ chunk: __m128i,
+ mask1: Mask128,
+ mask2: Mask128,
+ mask3: Mask128,
+) -> (__m128i, __m128i, __m128i) {
+ let lomask = _mm_set1_epi8(0xF);
+ let hlo = _mm_and_si128(chunk, lomask);
+ let hhi = _mm_and_si128(_mm_srli_epi16(chunk, 4), lomask);
+ let res0 = _mm_and_si128(
+ _mm_shuffle_epi8(mask1.lo, hlo),
+ _mm_shuffle_epi8(mask1.hi, hhi),
+ );
+ let res1 = _mm_and_si128(
+ _mm_shuffle_epi8(mask2.lo, hlo),
+ _mm_shuffle_epi8(mask2.hi, hhi),
+ );
+ let res2 = _mm_and_si128(
+ _mm_shuffle_epi8(mask3.lo, hlo),
+ _mm_shuffle_epi8(mask3.hi, hhi),
+ );
+ (res0, res1, res2)
+}
+
+/// Return candidates for Slim 256-bit Teddy, where `chunk` corresponds
+/// to a 32-byte window of the haystack (where the least significant byte
+/// corresponds to the start of the window), and the masks correspond to a
+/// low/high mask for the first, second and third bytes of all patterns that
+/// are being searched. The vectors returned correspond to candidates for the
+/// first, second and third bytes in the patterns represented by the masks.
+///
+/// Note that this can also be used for Fat Teddy, where the high 128 bits in
+/// `chunk` are the same as the low 128 bits, which corresponds to a 16-byte
+/// window in the haystack.
+#[target_feature(enable = "avx2")]
+unsafe fn members3m256(
+ chunk: __m256i,
+ mask1: Mask256,
+ mask2: Mask256,
+ mask3: Mask256,
+) -> (__m256i, __m256i, __m256i) {
+ let lomask = _mm256_set1_epi8(0xF);
+ let hlo = _mm256_and_si256(chunk, lomask);
+ let hhi = _mm256_and_si256(_mm256_srli_epi16(chunk, 4), lomask);
+ let res0 = _mm256_and_si256(
+ _mm256_shuffle_epi8(mask1.lo, hlo),
+ _mm256_shuffle_epi8(mask1.hi, hhi),
+ );
+ let res1 = _mm256_and_si256(
+ _mm256_shuffle_epi8(mask2.lo, hlo),
+ _mm256_shuffle_epi8(mask2.hi, hhi),
+ );
+ let res2 = _mm256_and_si256(
+ _mm256_shuffle_epi8(mask3.lo, hlo),
+ _mm256_shuffle_epi8(mask3.hi, hhi),
+ );
+ (res0, res1, res2)
+}
diff --git a/src/packed/tests.rs b/src/packed/tests.rs
new file mode 100644
index 0000000..a384396
--- /dev/null
+++ b/src/packed/tests.rs
@@ -0,0 +1,568 @@
+use std::collections::HashMap;
+use std::usize;
+
+use packed::{Config, MatchKind};
+use Match;
+
+/// A description of a single test against a multi-pattern searcher.
+///
+/// A single test may not necessarily pass on every configuration of a
+/// searcher. The tests are categorized and grouped appropriately below.
+#[derive(Clone, Debug, Eq, PartialEq)]
+struct SearchTest {
+ /// The name of this test, for debugging.
+ name: &'static str,
+ /// The patterns to search for.
+ patterns: &'static [&'static str],
+ /// The text to search.
+ haystack: &'static str,
+ /// Each match is a triple of (pattern_index, start, end), where
+ /// pattern_index is an index into `patterns` and `start`/`end` are indices
+ /// into `haystack`.
+ matches: &'static [(usize, usize, usize)],
+}
+
+struct SearchTestOwned {
+ offset: usize,
+ name: String,
+ patterns: Vec<String>,
+ haystack: String,
+ matches: Vec<(usize, usize, usize)>,
+}
+
+impl SearchTest {
+ fn variations(&self) -> Vec<SearchTestOwned> {
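+        // Pad each test with 'Z' bytes at varying offsets (as a prefix, as a
+        // suffix and as both) so that searches run over many different
+        // haystack lengths and alignments. This helps exercise boundary
+        // handling, particularly in the vectorized Teddy searchers.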
+ let mut tests = vec![];
+ for i in 0..=260 {
+ tests.push(self.offset_prefix(i));
+ tests.push(self.offset_suffix(i));
+ tests.push(self.offset_both(i));
+ }
+ tests
+ }
+
+ fn offset_both(&self, off: usize) -> SearchTestOwned {
+ SearchTestOwned {
+ offset: off,
+ name: self.name.to_string(),
+ patterns: self.patterns.iter().map(|s| s.to_string()).collect(),
+ haystack: format!(
+ "{}{}{}",
+ "Z".repeat(off),
+ self.haystack,
+ "Z".repeat(off)
+ ),
+ matches: self
+ .matches
+ .iter()
+ .map(|&(id, s, e)| (id, s + off, e + off))
+ .collect(),
+ }
+ }
+
+ fn offset_prefix(&self, off: usize) -> SearchTestOwned {
+ SearchTestOwned {
+ offset: off,
+ name: self.name.to_string(),
+ patterns: self.patterns.iter().map(|s| s.to_string()).collect(),
+ haystack: format!("{}{}", "Z".repeat(off), self.haystack),
+ matches: self
+ .matches
+ .iter()
+ .map(|&(id, s, e)| (id, s + off, e + off))
+ .collect(),
+ }
+ }
+
+ fn offset_suffix(&self, off: usize) -> SearchTestOwned {
+ SearchTestOwned {
+ offset: off,
+ name: self.name.to_string(),
+ patterns: self.patterns.iter().map(|s| s.to_string()).collect(),
+ haystack: format!("{}{}", self.haystack, "Z".repeat(off)),
+ matches: self.matches.to_vec(),
+ }
+ }
+
+ // fn to_owned(&self) -> SearchTestOwned {
+ // SearchTestOwned {
+ // name: self.name.to_string(),
+ // patterns: self.patterns.iter().map(|s| s.to_string()).collect(),
+ // haystack: self.haystack.to_string(),
+ // matches: self.matches.iter().cloned().collect(),
+ // }
+ // }
+}
+
+/// Short-hand constructor for SearchTest. We use it a lot below.
+macro_rules! t {
+ ($name:ident, $patterns:expr, $haystack:expr, $matches:expr) => {
+ SearchTest {
+ name: stringify!($name),
+ patterns: $patterns,
+ haystack: $haystack,
+ matches: $matches,
+ }
+ };
+}
+
+/// A collection of test groups.
+type TestCollection = &'static [&'static [SearchTest]];
+
+// Define several collections corresponding to the different types of match
+// semantics supported. These collections have some overlap, but each
+// collection should have some tests that no other collection has.
+
+/// Tests for leftmost-first match semantics.
+const PACKED_LEFTMOST_FIRST: TestCollection =
+ &[BASICS, LEFTMOST, LEFTMOST_FIRST, REGRESSION, TEDDY];
+
+/// Tests for leftmost-longest match semantics.
+const PACKED_LEFTMOST_LONGEST: TestCollection =
+ &[BASICS, LEFTMOST, LEFTMOST_LONGEST, REGRESSION, TEDDY];
+
+// Now define the individual tests that make up the collections above.
+
+/// A collection of tests that should always be true regardless of
+/// match semantics. That is, all combinations of leftmost-{first, longest}
+/// should produce the same answer.
+const BASICS: &'static [SearchTest] = &[
+ t!(basic001, &["a"], "", &[]),
+ t!(basic010, &["a"], "a", &[(0, 0, 1)]),
+ t!(basic020, &["a"], "aa", &[(0, 0, 1), (0, 1, 2)]),
+ t!(basic030, &["a"], "aaa", &[(0, 0, 1), (0, 1, 2), (0, 2, 3)]),
+ t!(basic040, &["a"], "aba", &[(0, 0, 1), (0, 2, 3)]),
+ t!(basic050, &["a"], "bba", &[(0, 2, 3)]),
+ t!(basic060, &["a"], "bbb", &[]),
+ t!(basic070, &["a"], "bababbbba", &[(0, 1, 2), (0, 3, 4), (0, 8, 9)]),
+ t!(basic100, &["aa"], "", &[]),
+ t!(basic110, &["aa"], "aa", &[(0, 0, 2)]),
+ t!(basic120, &["aa"], "aabbaa", &[(0, 0, 2), (0, 4, 6)]),
+ t!(basic130, &["aa"], "abbab", &[]),
+ t!(basic140, &["aa"], "abbabaa", &[(0, 5, 7)]),
+ t!(basic150, &["aaa"], "aaa", &[(0, 0, 3)]),
+ t!(basic200, &["abc"], "abc", &[(0, 0, 3)]),
+ t!(basic210, &["abc"], "zazabzabcz", &[(0, 6, 9)]),
+ t!(basic220, &["abc"], "zazabczabcz", &[(0, 3, 6), (0, 7, 10)]),
+ t!(basic300, &["a", "b"], "", &[]),
+ t!(basic310, &["a", "b"], "z", &[]),
+ t!(basic320, &["a", "b"], "b", &[(1, 0, 1)]),
+ t!(basic330, &["a", "b"], "a", &[(0, 0, 1)]),
+ t!(
+ basic340,
+ &["a", "b"],
+ "abba",
+ &[(0, 0, 1), (1, 1, 2), (1, 2, 3), (0, 3, 4),]
+ ),
+ t!(
+ basic350,
+ &["b", "a"],
+ "abba",
+ &[(1, 0, 1), (0, 1, 2), (0, 2, 3), (1, 3, 4),]
+ ),
+ t!(basic360, &["abc", "bc"], "xbc", &[(1, 1, 3),]),
+ t!(basic400, &["foo", "bar"], "", &[]),
+ t!(basic410, &["foo", "bar"], "foobar", &[(0, 0, 3), (1, 3, 6),]),
+ t!(basic420, &["foo", "bar"], "barfoo", &[(1, 0, 3), (0, 3, 6),]),
+ t!(basic430, &["foo", "bar"], "foofoo", &[(0, 0, 3), (0, 3, 6),]),
+ t!(basic440, &["foo", "bar"], "barbar", &[(1, 0, 3), (1, 3, 6),]),
+ t!(basic450, &["foo", "bar"], "bafofoo", &[(0, 4, 7),]),
+ t!(basic460, &["bar", "foo"], "bafofoo", &[(1, 4, 7),]),
+ t!(basic470, &["foo", "bar"], "fobabar", &[(1, 4, 7),]),
+ t!(basic480, &["bar", "foo"], "fobabar", &[(0, 4, 7),]),
+ t!(basic700, &["yabcdef", "abcdezghi"], "yabcdefghi", &[(0, 0, 7),]),
+ t!(basic710, &["yabcdef", "abcdezghi"], "yabcdezghi", &[(1, 1, 10),]),
+ t!(
+ basic720,
+ &["yabcdef", "bcdeyabc", "abcdezghi"],
+ "yabcdezghi",
+ &[(2, 1, 10),]
+ ),
+ t!(basic810, &["abcd", "bcd", "cd"], "abcd", &[(0, 0, 4),]),
+ t!(basic820, &["bcd", "cd", "abcd"], "abcd", &[(2, 0, 4),]),
+ t!(basic830, &["abc", "bc"], "zazabcz", &[(0, 3, 6),]),
+ t!(
+ basic840,
+ &["ab", "ba"],
+ "abababa",
+ &[(0, 0, 2), (0, 2, 4), (0, 4, 6),]
+ ),
+ t!(basic850, &["foo", "foo"], "foobarfoo", &[(0, 0, 3), (0, 6, 9),]),
+];
+
+/// Tests for leftmost match semantics. These should pass for both
+/// leftmost-first and leftmost-longest match kinds. Stated differently, among
+/// ambiguous matches, the longest match and the match that appeared first when
+/// constructing the automaton should always be the same.
+const LEFTMOST: &'static [SearchTest] = &[
+ t!(leftmost000, &["ab", "ab"], "abcd", &[(0, 0, 2)]),
+ t!(leftmost030, &["a", "ab"], "aa", &[(0, 0, 1), (0, 1, 2)]),
+ t!(leftmost031, &["ab", "a"], "aa", &[(1, 0, 1), (1, 1, 2)]),
+ t!(leftmost032, &["ab", "a"], "xayabbbz", &[(1, 1, 2), (0, 3, 5)]),
+ t!(leftmost300, &["abcd", "bce", "b"], "abce", &[(1, 1, 4)]),
+ t!(leftmost310, &["abcd", "ce", "bc"], "abce", &[(2, 1, 3)]),
+ t!(leftmost320, &["abcd", "bce", "ce", "b"], "abce", &[(1, 1, 4)]),
+ t!(leftmost330, &["abcd", "bce", "cz", "bc"], "abcz", &[(3, 1, 3)]),
+ t!(leftmost340, &["bce", "cz", "bc"], "bcz", &[(2, 0, 2)]),
+ t!(leftmost350, &["abc", "bd", "ab"], "abd", &[(2, 0, 2)]),
+ t!(
+ leftmost360,
+ &["abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(2, 0, 8),]
+ ),
+ t!(
+ leftmost370,
+ &["abcdefghi", "cde", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8),]
+ ),
+ t!(
+ leftmost380,
+ &["abcdefghi", "hz", "abcdefgh", "a"],
+ "abcdefghz",
+ &[(2, 0, 8),]
+ ),
+ t!(
+ leftmost390,
+ &["b", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8),]
+ ),
+ t!(
+ leftmost400,
+ &["h", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8),]
+ ),
+ t!(
+ leftmost410,
+ &["z", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8), (0, 8, 9),]
+ ),
+];
+
+/// Tests for non-overlapping leftmost-first match semantics. These tests
+/// should generally be specific to leftmost-first, which means they should
+/// generally fail under leftmost-longest semantics.
+const LEFTMOST_FIRST: &'static [SearchTest] = &[
+ t!(leftfirst000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]),
+ t!(leftfirst020, &["abcd", "ab"], "abcd", &[(0, 0, 4)]),
+ t!(leftfirst030, &["ab", "ab"], "abcd", &[(0, 0, 2)]),
+ t!(leftfirst040, &["a", "ab"], "xayabbbz", &[(0, 1, 2), (0, 3, 4)]),
+ t!(leftfirst100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[(1, 1, 5)]),
+ t!(leftfirst110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[(1, 1, 6)]),
+ t!(leftfirst300, &["abcd", "b", "bce"], "abce", &[(1, 1, 2)]),
+ t!(
+ leftfirst310,
+ &["abcd", "b", "bce", "ce"],
+ "abce",
+ &[(1, 1, 2), (3, 2, 4),]
+ ),
+ t!(
+ leftfirst320,
+ &["a", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(0, 0, 1), (2, 7, 9),]
+ ),
+ t!(leftfirst330, &["a", "abab"], "abab", &[(0, 0, 1), (0, 2, 3)]),
+ t!(
+ leftfirst340,
+ &["abcdef", "x", "x", "x", "x", "x", "x", "abcde"],
+ "abcdef",
+ &[(0, 0, 6)]
+ ),
+];
+
+/// Tests for non-overlapping leftmost-longest match semantics. These tests
+/// should generally be specific to leftmost-longest, which means they should
+/// generally fail under leftmost-first semantics.
+const LEFTMOST_LONGEST: &'static [SearchTest] = &[
+ t!(leftlong000, &["ab", "abcd"], "abcd", &[(1, 0, 4)]),
+ t!(leftlong010, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4),]),
+ t!(leftlong040, &["a", "ab"], "a", &[(0, 0, 1)]),
+ t!(leftlong050, &["a", "ab"], "ab", &[(1, 0, 2)]),
+ t!(leftlong060, &["ab", "a"], "a", &[(1, 0, 1)]),
+ t!(leftlong070, &["ab", "a"], "ab", &[(0, 0, 2)]),
+ t!(leftlong100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[(2, 1, 6)]),
+ t!(leftlong110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[(1, 1, 6)]),
+ t!(leftlong300, &["abcd", "b", "bce"], "abce", &[(2, 1, 4)]),
+ t!(
+ leftlong310,
+ &["a", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8),]
+ ),
+ t!(leftlong320, &["a", "abab"], "abab", &[(1, 0, 4)]),
+ t!(leftlong330, &["abcd", "b", "ce"], "abce", &[(1, 1, 2), (2, 2, 4),]),
+ t!(leftlong340, &["a", "ab"], "xayabbbz", &[(0, 1, 2), (1, 3, 5)]),
+];
+
+/// Regression tests that are applied to all combinations.
+///
+/// If regression tests are needed for specific match semantics, then add them
+/// to the appropriate group above.
+const REGRESSION: &'static [SearchTest] = &[
+ t!(regression010, &["inf", "ind"], "infind", &[(0, 0, 3), (1, 3, 6),]),
+ t!(regression020, &["ind", "inf"], "infind", &[(1, 0, 3), (0, 3, 6),]),
+ t!(
+ regression030,
+ &["libcore/", "libstd/"],
+ "libcore/char/methods.rs",
+ &[(0, 0, 8),]
+ ),
+ t!(
+ regression040,
+ &["libstd/", "libcore/"],
+ "libcore/char/methods.rs",
+ &[(1, 0, 8),]
+ ),
+ t!(
+ regression050,
+ &["\x00\x00\x01", "\x00\x00\x00"],
+ "\x00\x00\x00",
+ &[(1, 0, 3),]
+ ),
+ t!(
+ regression060,
+ &["\x00\x00\x00", "\x00\x00\x01"],
+ "\x00\x00\x00",
+ &[(0, 0, 3),]
+ ),
+];
+
+const TEDDY: &'static [SearchTest] = &[
+ t!(
+ teddy010,
+ &["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"],
+ "abcdefghijk",
+ &[
+ (0, 0, 1),
+ (1, 1, 2),
+ (2, 2, 3),
+ (3, 3, 4),
+ (4, 4, 5),
+ (5, 5, 6),
+ (6, 6, 7),
+ (7, 7, 8),
+ (8, 8, 9),
+ (9, 9, 10),
+ (10, 10, 11)
+ ]
+ ),
+ t!(
+ teddy020,
+ &["ab", "bc", "cd", "de", "ef", "fg", "gh", "hi", "ij", "jk", "kl"],
+ "abcdefghijk",
+ &[(0, 0, 2), (2, 2, 4), (4, 4, 6), (6, 6, 8), (8, 8, 10),]
+ ),
+ t!(
+ teddy030,
+ &["abc"],
+ "abcdefghijklmnopqrstuvwxyzabcdefghijk",
+ &[(0, 0, 3), (0, 26, 29)]
+ ),
+];
+
+// Now define a test for each combination of things above that we want to run.
+// Since there are a few different combinations for each collection of tests,
+// we define a macro to avoid repetition drudgery. The testconfig macro
+// constructs the searcher for a given match kind and runs the search tests
+// one-by-one over the given collection. The `with` parameter allows one to
+// adjust the Config with additional settings before the searcher is built.
+// Each testconfig invocation below turns a different knob on Config.
+
+macro_rules! testconfig {
+ ($name:ident, $collection:expr, $with:expr) => {
+ #[test]
+ fn $name() {
+ run_search_tests($collection, |test| {
+ let mut config = Config::new();
+ $with(&mut config);
+ config
+ .builder()
+ .extend(test.patterns.iter().map(|p| p.as_bytes()))
+ .build()
+ .unwrap()
+ .find_iter(&test.haystack)
+ .collect()
+ });
+ }
+ };
+}
+
+#[cfg(target_arch = "x86_64")]
+testconfig!(
+ search_default_leftmost_first,
+ PACKED_LEFTMOST_FIRST,
+ |_: &mut Config| {}
+);
+
+#[cfg(target_arch = "x86_64")]
+testconfig!(
+ search_default_leftmost_longest,
+ PACKED_LEFTMOST_LONGEST,
+ |c: &mut Config| {
+ c.match_kind(MatchKind::LeftmostLongest);
+ }
+);
+
+#[cfg(target_arch = "x86_64")]
+testconfig!(
+ search_teddy_leftmost_first,
+ PACKED_LEFTMOST_FIRST,
+ |c: &mut Config| {
+ c.force_teddy(true);
+ }
+);
+
+#[cfg(target_arch = "x86_64")]
+testconfig!(
+ search_teddy_leftmost_longest,
+ PACKED_LEFTMOST_LONGEST,
+ |c: &mut Config| {
+ c.force_teddy(true).match_kind(MatchKind::LeftmostLongest);
+ }
+);
+
+#[cfg(target_arch = "x86_64")]
+testconfig!(
+ search_teddy_ssse3_leftmost_first,
+ PACKED_LEFTMOST_FIRST,
+ |c: &mut Config| {
+ c.force_teddy(true);
+ if is_x86_feature_detected!("ssse3") {
+ c.force_avx(Some(false));
+ }
+ }
+);
+
+#[cfg(target_arch = "x86_64")]
+testconfig!(
+ search_teddy_ssse3_leftmost_longest,
+ PACKED_LEFTMOST_LONGEST,
+ |c: &mut Config| {
+ c.force_teddy(true).match_kind(MatchKind::LeftmostLongest);
+ if is_x86_feature_detected!("ssse3") {
+ c.force_avx(Some(false));
+ }
+ }
+);
+
+#[cfg(target_arch = "x86_64")]
+testconfig!(
+ search_teddy_avx2_leftmost_first,
+ PACKED_LEFTMOST_FIRST,
+ |c: &mut Config| {
+ c.force_teddy(true);
+ if is_x86_feature_detected!("avx2") {
+ c.force_avx(Some(true));
+ }
+ }
+);
+
+#[cfg(target_arch = "x86_64")]
+testconfig!(
+ search_teddy_avx2_leftmost_longest,
+ PACKED_LEFTMOST_LONGEST,
+ |c: &mut Config| {
+ c.force_teddy(true).match_kind(MatchKind::LeftmostLongest);
+ if is_x86_feature_detected!("avx2") {
+ c.force_avx(Some(true));
+ }
+ }
+);
+
+#[cfg(target_arch = "x86_64")]
+testconfig!(
+ search_teddy_fat_leftmost_first,
+ PACKED_LEFTMOST_FIRST,
+ |c: &mut Config| {
+ c.force_teddy(true);
+ if is_x86_feature_detected!("avx2") {
+ c.force_teddy_fat(Some(true));
+ }
+ }
+);
+
+#[cfg(target_arch = "x86_64")]
+testconfig!(
+ search_teddy_fat_leftmost_longest,
+ PACKED_LEFTMOST_LONGEST,
+ |c: &mut Config| {
+ c.force_teddy(true).match_kind(MatchKind::LeftmostLongest);
+ if is_x86_feature_detected!("avx2") {
+ c.force_teddy_fat(Some(true));
+ }
+ }
+);
+
+testconfig!(
+ search_rabinkarp_leftmost_first,
+ PACKED_LEFTMOST_FIRST,
+ |c: &mut Config| {
+ c.force_rabin_karp(true);
+ }
+);
+
+testconfig!(
+ search_rabinkarp_leftmost_longest,
+ PACKED_LEFTMOST_LONGEST,
+ |c: &mut Config| {
+ c.force_rabin_karp(true).match_kind(MatchKind::LeftmostLongest);
+ }
+);
+
+#[test]
+fn search_tests_have_unique_names() {
+ let assert = |constname, tests: &[SearchTest]| {
+ let mut seen = HashMap::new(); // map from test name to position
+ for (i, test) in tests.iter().enumerate() {
+ if !seen.contains_key(test.name) {
+ seen.insert(test.name, i);
+ } else {
+ let last = seen[test.name];
+ panic!(
+ "{} tests have duplicate names at positions {} and {}",
+ constname, last, i
+ );
+ }
+ }
+ };
+ assert("BASICS", BASICS);
+ assert("LEFTMOST", LEFTMOST);
+ assert("LEFTMOST_FIRST", LEFTMOST_FIRST);
+ assert("LEFTMOST_LONGEST", LEFTMOST_LONGEST);
+ assert("REGRESSION", REGRESSION);
+ assert("TEDDY", TEDDY);
+}
+
+fn run_search_tests<F: FnMut(&SearchTestOwned) -> Vec<Match>>(
+ which: TestCollection,
+ mut f: F,
+) {
+ let get_match_triples =
+ |matches: Vec<Match>| -> Vec<(usize, usize, usize)> {
+ matches
+ .into_iter()
+ .map(|m| (m.pattern(), m.start(), m.end()))
+ .collect()
+ };
+ for &tests in which {
+ for spec in tests {
+ for test in spec.variations() {
+ assert_eq!(
+ test.matches,
+ get_match_triples(f(&test)).as_slice(),
+ "test: {}, patterns: {:?}, haystack: {:?}, offset: {:?}",
+ test.name,
+ test.patterns,
+ test.haystack,
+ test.offset,
+ );
+ }
+ }
+ }
+}
diff --git a/src/packed/vector.rs b/src/packed/vector.rs
new file mode 100644
index 0000000..ca6c2b0
--- /dev/null
+++ b/src/packed/vector.rs
@@ -0,0 +1,181 @@
+// This file contains a set of fairly generic utility functions when working
+// with SIMD vectors.
+//
+// SAFETY: All of the routines below are unsafe to call because they assume
+// the necessary CPU target features in order to use particular vendor
+// intrinsics. Calling these routines when the underlying CPU does not support
+// the appropriate target features is NOT safe. Callers must ensure this
+// themselves.
+//
+// Note that it may not look like this safety invariant is being upheld when
+// these routines are called. Namely, the CPU feature check is typically pretty
+// far away from when these routines are used. Instead, we rely on the fact
+// that certain types serve as a guaranteed receipt that pertinent target
+// features are enabled. For example, the only way TeddySlim3Mask256 can be
+// constructed is if the AVX2 CPU feature is available. Thus, any code running
+// inside of TeddySlim3Mask256 can use any of the functions below without any
+// additional checks: its very existence *is* the check.
+
+use std::arch::x86_64::*;
+
+/// Shift `a` to the left by two bytes (removing its two most significant
+/// bytes), and concatenate it with the two most significant bytes of `b`.
+#[target_feature(enable = "avx2")]
+pub unsafe fn alignr256_14(a: __m256i, b: __m256i) -> __m256i {
+ // Credit goes to jneem for figuring this out:
+ // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184
+ //
+ // TL;DR avx2's PALIGNR instruction is actually just two 128-bit PALIGNR
+ // instructions, which is not what we want, so we need to do some extra
+ // shuffling.
+
+ // This permute gives us the low 16 bytes of a concatenated with the high
+ // 16 bytes of b, in order of most significant to least significant. So
+ // `v = a[15:0] b[31:16]`.
+ let v = _mm256_permute2x128_si256(b, a, 0x21);
+ // This effectively does this (where we deal in terms of byte-indexing
+ // and byte-shifting, and use inclusive ranges):
+ //
+ // ret[15:0] := ((a[15:0] << 16) | v[15:0]) >> 14
+ // = ((a[15:0] << 16) | b[31:16]) >> 14
+ // ret[31:16] := ((a[31:16] << 16) | v[31:16]) >> 14
+ // = ((a[31:16] << 16) | a[15:0]) >> 14
+ //
+ // Which therefore results in:
+ //
+ // ret[31:0] := a[29:16] a[15:14] a[13:0] b[31:30]
+ //
+ // The end result is that we've effectively done this:
+ //
+ // (a << 2) | (b >> 30)
+ //
+ // When `A` and `B` are strings---where the beginning of the string is in
+ // the least significant bits---we effectively result in the following
+ // semantic operation:
+ //
+ // (A >> 2) | (B << 30)
+ //
+ // The reversal being attributed to the fact that we are in little-endian.
+ _mm256_alignr_epi8(a, v, 14)
+}
+
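+// A byte-oriented sketch of the operation above, modeling the 256-bit vectors
+// as 32-byte arrays where index 0 is the least significant (start-of-string)
+// byte: the result is the last two bytes of `b` followed by the first 30
+// bytes of `a`. This exists only to illustrate the semantics and is not used
+// at runtime.
+#[allow(dead_code)]
+fn alignr256_14_scalar(a: &[u8; 32], b: &[u8; 32]) -> [u8; 32] {
+    let mut out = [0u8; 32];
+    out[0] = b[30];
+    out[1] = b[31];
+    out[2..].copy_from_slice(&a[..30]);
+    out
+}
+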
+/// Shift `a` to the left by one byte (removing its most significant byte), and
+/// concatenate it with the most significant byte of `b`.
+#[target_feature(enable = "avx2")]
+pub unsafe fn alignr256_15(a: __m256i, b: __m256i) -> __m256i {
+ // For explanation, see alignr256_14.
+ let v = _mm256_permute2x128_si256(b, a, 0x21);
+ _mm256_alignr_epi8(a, v, 15)
+}
+
+/// Unpack the given 128-bit vector into its 64-bit components. The first
+/// element of the array returned corresponds to the least significant 64-bit
+/// lane in `a`.
+#[target_feature(enable = "ssse3")]
+pub unsafe fn unpack64x128(a: __m128i) -> [u64; 2] {
+ [
+ _mm_cvtsi128_si64(a) as u64,
+ _mm_cvtsi128_si64(_mm_srli_si128(a, 8)) as u64,
+ ]
+}
+
+/// Unpack the given 256-bit vector into its 64-bit components. The first
+/// element of the array returned corresponds to the least significant 64-bit
+/// lane in `a`.
+#[target_feature(enable = "avx2")]
+pub unsafe fn unpack64x256(a: __m256i) -> [u64; 4] {
+ // Using transmute here is precisely equivalent, but actually slower. It's
+ // not quite clear why.
+ let lo = _mm256_extracti128_si256(a, 0);
+ let hi = _mm256_extracti128_si256(a, 1);
+ [
+ _mm_cvtsi128_si64(lo) as u64,
+ _mm_cvtsi128_si64(_mm_srli_si128(lo, 8)) as u64,
+ _mm_cvtsi128_si64(hi) as u64,
+ _mm_cvtsi128_si64(_mm_srli_si128(hi, 8)) as u64,
+ ]
+}
+
+/// Unpack the low 128 bits of `a` and `b`, and return them as 4 64-bit
+/// integers.
+///
+/// More precisely, if a = a4 a3 a2 a1 and b = b4 b3 b2 b1, where each element
+/// is a 64-bit integer and a1/b1 correspond to the least significant 64 bits,
+/// then the return value is `b2 b1 a2 a1`.
+#[target_feature(enable = "avx2")]
+pub unsafe fn unpacklo64x256(a: __m256i, b: __m256i) -> [u64; 4] {
+ let lo = _mm256_castsi256_si128(a);
+ let hi = _mm256_castsi256_si128(b);
+ [
+ _mm_cvtsi128_si64(lo) as u64,
+ _mm_cvtsi128_si64(_mm_srli_si128(lo, 8)) as u64,
+ _mm_cvtsi128_si64(hi) as u64,
+ _mm_cvtsi128_si64(_mm_srli_si128(hi, 8)) as u64,
+ ]
+}
+
+/// Returns true if and only if all bits in the given 128-bit vector are 0.
+#[target_feature(enable = "ssse3")]
+pub unsafe fn is_all_zeroes128(a: __m128i) -> bool {
+ let cmp = _mm_cmpeq_epi8(a, zeroes128());
+ _mm_movemask_epi8(cmp) as u32 == 0xFFFF
+}
+
+/// Returns true if and only if all bits in the given 256-bit vector are 0.
+#[target_feature(enable = "avx2")]
+pub unsafe fn is_all_zeroes256(a: __m256i) -> bool {
+ let cmp = _mm256_cmpeq_epi8(a, zeroes256());
+ _mm256_movemask_epi8(cmp) as u32 == 0xFFFFFFFF
+}
+
+/// Load a 128-bit vector from the given slice at the given position. The
+/// slice does not need to be aligned.
+///
+/// Since this code assumes little-endian (there is no big-endian x86), the
+/// bytes starting in `slice[at..]` will be at the least significant bits of
+/// the returned vector. This is important for the surrounding code, since for
+/// example, shifting the resulting vector right is equivalent to logically
+/// shifting the bytes in `slice` left.
+#[target_feature(enable = "sse2")]
+pub unsafe fn loadu128(slice: &[u8], at: usize) -> __m128i {
+ let ptr = slice.get_unchecked(at..).as_ptr();
+ _mm_loadu_si128(ptr as *const u8 as *const __m128i)
+}
+
+/// Load a 256-bit vector from the given slice at the given position. The
+/// slice does not need to be aligned.
+///
+/// Since this code assumes little-endian (there is no big-endian x86), the
+/// bytes starting in `slice[at..]` will be at the least significant bits of
+/// the returned vector. This is important for the surrounding code, since for
+/// example, shifting the resulting vector right is equivalent to logically
+/// shifting the bytes in `slice` left.
+#[target_feature(enable = "avx2")]
+pub unsafe fn loadu256(slice: &[u8], at: usize) -> __m256i {
+ let ptr = slice.get_unchecked(at..).as_ptr();
+ _mm256_loadu_si256(ptr as *const u8 as *const __m256i)
+}
+
+/// Returns a 128-bit vector with all bits set to 0.
+#[target_feature(enable = "sse2")]
+pub unsafe fn zeroes128() -> __m128i {
+ _mm_set1_epi8(0)
+}
+
+/// Returns a 256-bit vector with all bits set to 0.
+#[target_feature(enable = "avx2")]
+pub unsafe fn zeroes256() -> __m256i {
+ _mm256_set1_epi8(0)
+}
+
+/// Returns a 128-bit vector with all bits set to 1.
+#[target_feature(enable = "sse2")]
+pub unsafe fn ones128() -> __m128i {
+ _mm_set1_epi8(0xFF as u8 as i8)
+}
+
+/// Returns a 256-bit vector with all bits set to 1.
+#[target_feature(enable = "avx2")]
+pub unsafe fn ones256() -> __m256i {
+ _mm256_set1_epi8(0xFF as u8 as i8)
+}
diff --git a/src/prefilter.rs b/src/prefilter.rs
new file mode 100644
index 0000000..bda215d
--- /dev/null
+++ b/src/prefilter.rs
@@ -0,0 +1,997 @@
+use std::cmp;
+use std::fmt;
+use std::panic::{RefUnwindSafe, UnwindSafe};
+use std::u8;
+
+use memchr::{memchr, memchr2, memchr3};
+
+use ahocorasick::MatchKind;
+use packed;
+use Match;
+
+/// A candidate is the result of running a prefilter on a haystack at a
+/// particular position. The result is either no match, a confirmed match or
+/// a possible match.
+///
+/// When no match is returned, the prefilter is guaranteeing that no possible
+/// match can be found in the haystack, and the caller may trust this. That is,
+/// all correct prefilters must never report false negatives.
+///
+/// In some cases, a prefilter can confirm a match very quickly, in which case,
+/// the caller may use this to stop what it's doing and report the match. In
+/// this case, prefilter implementations must never report a false positive.
+/// In other cases, the prefilter can only report a potential match, in which
+/// case the callers must attempt to confirm the match. In this case, prefilter
+/// implementations are permitted to return false positives.
+#[derive(Clone, Debug)]
+pub enum Candidate {
+ None,
+ Match(Match),
+ PossibleStartOfMatch(usize),
+}
+
+impl Candidate {
+ /// Convert this candidate into an option. This is useful when callers
+ /// do not distinguish between true positives and false positives (i.e.,
+ /// the caller must always confirm the match in order to update some other
+ /// state).
+ pub fn into_option(self) -> Option<usize> {
+ match self {
+ Candidate::None => None,
+ Candidate::Match(ref m) => Some(m.start()),
+ Candidate::PossibleStartOfMatch(start) => Some(start),
+ }
+ }
+}
+
+/// A prefilter describes the behavior of fast literal scanners for quickly
+/// skipping past bytes in the haystack that we know cannot possibly
+/// participate in a match.
+pub trait Prefilter:
+ Send + Sync + RefUnwindSafe + UnwindSafe + fmt::Debug
+{
+ /// Returns the next possible match candidate. This may yield false
+ /// positives, so callers must confirm a match starting at the position
+ /// returned. This, however, must never produce false negatives. That is,
+ /// this must, at minimum, return the starting position of the next match
+ /// in the given haystack after or at the given position.
+ fn next_candidate(
+ &self,
+ state: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ ) -> Candidate;
+
+    /// A method for cloning a prefilter, to work around the fact that Clone
+ /// is not object-safe.
+ fn clone_prefilter(&self) -> Box<dyn Prefilter>;
+
+ /// Returns the approximate total amount of heap used by this prefilter, in
+ /// units of bytes.
+ fn heap_bytes(&self) -> usize;
+
+ /// Returns true if and only if this prefilter never returns false
+ /// positives. This is useful for completely avoiding the automaton
+ /// when the prefilter can quickly confirm its own matches.
+ ///
+ /// By default, this returns true, which is conservative; it is always
+ /// correct to return `true`. Returning `false` here and reporting a false
+ /// positive will result in incorrect searches.
+ fn reports_false_positives(&self) -> bool {
+ true
+ }
+}
+
+impl<'a, P: Prefilter + ?Sized> Prefilter for &'a P {
+ #[inline]
+ fn next_candidate(
+ &self,
+ state: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ ) -> Candidate {
+ (**self).next_candidate(state, haystack, at)
+ }
+
+ fn clone_prefilter(&self) -> Box<dyn Prefilter> {
+ (**self).clone_prefilter()
+ }
+
+ fn heap_bytes(&self) -> usize {
+ (**self).heap_bytes()
+ }
+
+ fn reports_false_positives(&self) -> bool {
+ (**self).reports_false_positives()
+ }
+}
+
+/// A convenience object for representing any type that implements Prefilter
+/// and is cloneable.
+#[derive(Debug)]
+pub struct PrefilterObj(Box<dyn Prefilter>);
+
+impl Clone for PrefilterObj {
+ fn clone(&self) -> Self {
+ PrefilterObj(self.0.clone_prefilter())
+ }
+}
+
+impl PrefilterObj {
+ /// Create a new prefilter object.
+ pub fn new<T: Prefilter + 'static>(t: T) -> PrefilterObj {
+ PrefilterObj(Box::new(t))
+ }
+
+ /// Return the underlying prefilter trait object.
+ pub fn as_ref(&self) -> &dyn Prefilter {
+ &*self.0
+ }
+}
+
+/// PrefilterState tracks state associated with the effectiveness of a
+/// prefilter. It is used to track how many bytes, on average, are skipped by
+/// the prefilter. If this average dips below a certain threshold over time,
+/// then the state renders the prefilter inert and stops using it.
+///
+/// A prefilter state should be created for each search. (Where creating an
+/// iterator via, e.g., `find_iter`, is treated as a single search.)
+#[derive(Clone, Debug)]
+pub struct PrefilterState {
+ /// The number of skips that has been executed.
+ skips: usize,
+ /// The total number of bytes that have been skipped.
+ skipped: usize,
+ /// The maximum length of a match. This is used to help determine how many
+ /// bytes on average should be skipped in order for a prefilter to be
+ /// effective.
+ max_match_len: usize,
+    /// Once the prefilter has been deemed permanently ineffective, it will be
+    /// inert throughout the rest of its lifetime. This field serves as a cheap
+    /// way to check inertness.
+ inert: bool,
+    /// The last (absolute) position to which a prefilter scanned.
+ /// Prefilters can use this position to determine whether to re-scan or
+ /// not.
+ ///
+ /// Unlike other things that impact effectiveness, this is a fleeting
+ /// condition. That is, a prefilter can be considered ineffective if it is
+ /// at a position before `last_scan_at`, but can become effective again
+ /// once the search moves past `last_scan_at`.
+ ///
+ /// The utility of this is to both avoid additional overhead from calling
+ /// the prefilter and to avoid quadratic behavior. This ensures that a
+ /// prefilter will scan any particular byte at most once. (Note that some
+ /// prefilters, like the start-byte prefilter, do not need to use this
+ /// field at all, since it only looks for starting bytes.)
+ last_scan_at: usize,
+}
+
+impl PrefilterState {
+ /// The minimum number of skip attempts to try before considering whether
+ /// a prefilter is effective or not.
+ const MIN_SKIPS: usize = 40;
+
+    /// The minimum number of bytes that skips must average, expressed as a
+    /// multiple of the maximum length of a possible match.
+    ///
+    /// That is, after MIN_SKIPS have occurred, if the average number of bytes
+    /// skipped ever falls below MIN_AVG_FACTOR * max-match-length, then the
+    /// prefilter ought to be rendered inert.
+ const MIN_AVG_FACTOR: usize = 2;
+
+ /// Create a fresh prefilter state.
+ pub fn new(max_match_len: usize) -> PrefilterState {
+ PrefilterState {
+ skips: 0,
+ skipped: 0,
+ max_match_len,
+ inert: false,
+ last_scan_at: 0,
+ }
+ }
+
+ /// Update this state with the number of bytes skipped on the last
+ /// invocation of the prefilter.
+ #[inline]
+ fn update_skipped_bytes(&mut self, skipped: usize) {
+ self.skips += 1;
+ self.skipped += skipped;
+ }
+
+ /// Updates the position at which the last scan stopped. This may be
+ /// greater than the position of the last candidate reported. For example,
+ /// searching for the "rare" byte `z` in `abczdef` for the pattern `abcz`
+ /// will report a candidate at position `0`, but the end of its last scan
+ /// will be at position `3`.
+ ///
+ /// This position factors into the effectiveness of this prefilter. If the
+ /// current position is less than the last position at which a scan ended,
+ /// then the prefilter should not be re-run until the search moves past
+ /// that position.
+ #[inline]
+ fn update_at(&mut self, at: usize) {
+ if at > self.last_scan_at {
+ self.last_scan_at = at;
+ }
+ }
+
+ /// Return true if and only if this state indicates that a prefilter is
+ /// still effective.
+ ///
+    /// The given position should correspond to the current starting position
+    /// of the search.
+ #[inline]
+ pub fn is_effective(&mut self, at: usize) -> bool {
+ if self.inert {
+ return false;
+ }
+ if at < self.last_scan_at {
+ return false;
+ }
+ if self.skips < PrefilterState::MIN_SKIPS {
+ return true;
+ }
+
+ let min_avg = PrefilterState::MIN_AVG_FACTOR * self.max_match_len;
+ if self.skipped >= min_avg * self.skips {
+ return true;
+ }
+
+ // We're inert.
+ self.inert = true;
+ false
+ }
+}
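+
+// A minimal sketch of the effectiveness heuristic above: with a maximum match
+// length of 8, a prefilter must average at least MIN_AVG_FACTOR * 8 = 16
+// skipped bytes per skip once MIN_SKIPS (40) skips have occurred; otherwise
+// it is rendered inert.
+#[cfg(test)]
+mod prefilter_state_sketch {
+    use super::PrefilterState;
+
+    #[test]
+    fn low_average_skips_render_the_prefilter_inert() {
+        let mut state = PrefilterState::new(8);
+        for _ in 0..40 {
+            // 15 bytes per skip is just below the 16 byte threshold.
+            state.update_skipped_bytes(15);
+        }
+        assert!(!state.is_effective(1_000_000));
+    }
+}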
+
+/// A builder for constructing the best possible prefilter. When constructed,
+/// this builder will heuristically select the best prefilter it can build,
+/// if any, and discard the rest.
+#[derive(Debug)]
+pub struct Builder {
+ count: usize,
+ ascii_case_insensitive: bool,
+ start_bytes: StartBytesBuilder,
+ rare_bytes: RareBytesBuilder,
+ packed: Option<packed::Builder>,
+}
+
+impl Builder {
+ /// Create a new builder for constructing the best possible prefilter.
+ pub fn new(kind: MatchKind) -> Builder {
+ let pbuilder = kind
+ .as_packed()
+ .map(|kind| packed::Config::new().match_kind(kind).builder());
+ Builder {
+ count: 0,
+ ascii_case_insensitive: false,
+ start_bytes: StartBytesBuilder::new(),
+ rare_bytes: RareBytesBuilder::new(),
+ packed: pbuilder,
+ }
+ }
+
+ /// Enable ASCII case insensitivity. When set, byte strings added to this
+ /// builder will be interpreted without respect to ASCII case.
+ pub fn ascii_case_insensitive(mut self, yes: bool) -> Builder {
+ self.ascii_case_insensitive = yes;
+ self.start_bytes = self.start_bytes.ascii_case_insensitive(yes);
+ self.rare_bytes = self.rare_bytes.ascii_case_insensitive(yes);
+ self
+ }
+
+ /// Return a prefilter suitable for quickly finding potential matches.
+ ///
+ /// All patterns added to an Aho-Corasick automaton should be added to this
+ /// builder before attempting to construct the prefilter.
+ pub fn build(&self) -> Option<PrefilterObj> {
+ match (self.start_bytes.build(), self.rare_bytes.build()) {
+ // If we could build both start and rare prefilters, then there are
+ // a few cases in which we'd want to use the start-byte prefilter
+ // over the rare-byte prefilter, since the former has lower
+ // overhead.
+ (prestart @ Some(_), prerare @ Some(_)) => {
+ // If the start-byte prefilter can scan for a smaller number
+ // of bytes than the rare-byte prefilter, then it's probably
+ // faster.
+ let has_fewer_bytes =
+ self.start_bytes.count < self.rare_bytes.count;
+ // Otherwise, if the combined frequency rank of the detected
+ // bytes in the start-byte prefilter is "close" to the combined
+ // frequency rank of the rare-byte prefilter, then we pick
+ // the start-byte prefilter even if the rare-byte prefilter
+ // heuristically searches for rare bytes. This is because the
+ // rare-byte prefilter has higher constant costs, so we tend to
+ // prefer the start-byte prefilter when we can.
+ let has_rarer_bytes =
+ self.start_bytes.rank_sum <= self.rare_bytes.rank_sum + 50;
+ if has_fewer_bytes || has_rarer_bytes {
+ prestart
+ } else {
+ prerare
+ }
+ }
+ (prestart @ Some(_), None) => prestart,
+ (None, prerare @ Some(_)) => prerare,
+ (None, None) if self.ascii_case_insensitive => None,
+ (None, None) => self
+ .packed
+ .as_ref()
+ .and_then(|b| b.build())
+ .map(|s| PrefilterObj::new(Packed(s))),
+ }
+ }
+
+ /// Add a literal string to this prefilter builder.
+ pub fn add(&mut self, bytes: &[u8]) {
+ self.count += 1;
+ self.start_bytes.add(bytes);
+ self.rare_bytes.add(bytes);
+ if let Some(ref mut pbuilder) = self.packed {
+ pbuilder.add(bytes);
+ }
+ }
+}
+
+/// A type that wraps a packed searcher and implements the `Prefilter`
+/// interface.
+#[derive(Clone, Debug)]
+struct Packed(packed::Searcher);
+
+impl Prefilter for Packed {
+ fn next_candidate(
+ &self,
+ _state: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ ) -> Candidate {
+ self.0.find_at(haystack, at).map_or(Candidate::None, Candidate::Match)
+ }
+
+ fn clone_prefilter(&self) -> Box<dyn Prefilter> {
+ Box::new(self.clone())
+ }
+
+ fn heap_bytes(&self) -> usize {
+ self.0.heap_bytes()
+ }
+
+ fn reports_false_positives(&self) -> bool {
+ false
+ }
+}
+
+/// A builder for constructing a rare byte prefilter.
+///
+/// A rare byte prefilter attempts to pick out a small set of rare bytes that
+/// occur in the patterns, and then quickly scan for occurrences of those rare
+/// bytes.
+#[derive(Clone, Debug)]
+struct RareBytesBuilder {
+ /// Whether this prefilter should account for ASCII case insensitivity or
+ /// not.
+ ascii_case_insensitive: bool,
+ /// A set of rare bytes, indexed by byte value.
+ rare_set: ByteSet,
+ /// A set of byte offsets associated with bytes in a pattern. An entry
+    /// corresponds to a particular byte (its index) and is only non-zero if
+ /// the byte occurred at an offset greater than 0 in at least one pattern.
+ ///
+ /// If a byte's offset is not representable in 8 bits, then the rare bytes
+ /// prefilter becomes inert.
+ byte_offsets: RareByteOffsets,
+ /// Whether this is available as a prefilter or not. This can be set to
+ /// false during construction if a condition is seen that invalidates the
+ /// use of the rare-byte prefilter.
+ available: bool,
+ /// The number of bytes set to an active value in `byte_offsets`.
+ count: usize,
+ /// The sum of frequency ranks for the rare bytes detected. This is
+ /// intended to give a heuristic notion of how rare the bytes are.
+ rank_sum: u16,
+}
+
+/// A set of bytes.
+#[derive(Clone, Copy)]
+struct ByteSet([bool; 256]);
+
+impl ByteSet {
+ fn empty() -> ByteSet {
+ ByteSet([false; 256])
+ }
+
+ fn insert(&mut self, b: u8) -> bool {
+ let new = !self.contains(b);
+ self.0[b as usize] = true;
+ new
+ }
+
+ fn contains(&self, b: u8) -> bool {
+ self.0[b as usize]
+ }
+}
+
+impl fmt::Debug for ByteSet {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ let mut bytes = vec![];
+ for b in 0..=255 {
+ if self.contains(b) {
+ bytes.push(b);
+ }
+ }
+ f.debug_struct("ByteSet").field("set", &bytes).finish()
+ }
+}
+
+/// A set of byte offsets, keyed by byte.
+#[derive(Clone, Copy)]
+struct RareByteOffsets {
+ /// Each entry corresponds to the maximum offset of the corresponding
+ /// byte across all patterns seen.
+ set: [RareByteOffset; 256],
+}
+
+impl RareByteOffsets {
+ /// Create a new empty set of rare byte offsets.
+ pub fn empty() -> RareByteOffsets {
+ RareByteOffsets { set: [RareByteOffset::default(); 256] }
+ }
+
+    /// Add the given offset for the given byte to this set. If the given
+    /// offset is greater than the offset currently recorded for that byte,
+    /// then it overwrites the previous value; otherwise, the existing
+    /// (larger) offset is kept.
+ pub fn set(&mut self, byte: u8, off: RareByteOffset) {
+ self.set[byte as usize].max =
+ cmp::max(self.set[byte as usize].max, off.max);
+ }
+}
+
+impl fmt::Debug for RareByteOffsets {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ let mut offsets = vec![];
+ for off in self.set.iter() {
+ if off.max > 0 {
+ offsets.push(off);
+ }
+ }
+ f.debug_struct("RareByteOffsets").field("set", &offsets).finish()
+ }
+}
+
+/// Offsets associated with an occurrence of a "rare" byte in any of the
+/// patterns used to construct a single Aho-Corasick automaton.
+#[derive(Clone, Copy, Debug)]
+struct RareByteOffset {
+ /// The maximum offset at which a particular byte occurs from the start
+ /// of any pattern. This is used as a shift amount. That is, when an
+ /// occurrence of this byte is found, the candidate position reported by
+ /// the prefilter is `position_of_byte - max`, such that the automaton
+ /// will begin its search at a position that is guaranteed to observe a
+ /// match.
+ ///
+ /// To avoid accidentally quadratic behavior, a prefilter is considered
+ /// ineffective when it is asked to start scanning from a position that it
+ /// has already scanned past.
+ ///
+ /// Using a `u8` here means that if we ever see a pattern that's longer
+ /// than 255 bytes, then the entire rare byte prefilter is disabled.
+ max: u8,
+}
+
+impl Default for RareByteOffset {
+ fn default() -> RareByteOffset {
+ RareByteOffset { max: 0 }
+ }
+}
+
+impl RareByteOffset {
+ /// Create a new rare byte offset. If the given offset is too big, then
+ /// None is returned. In that case, callers should render the rare bytes
+ /// prefilter inert.
+ fn new(max: usize) -> Option<RareByteOffset> {
+ if max > u8::MAX as usize {
+ None
+ } else {
+ Some(RareByteOffset { max: max as u8 })
+ }
+ }
+}
+
+impl RareBytesBuilder {
+ /// Create a new builder for constructing a rare byte prefilter.
+ fn new() -> RareBytesBuilder {
+ RareBytesBuilder {
+ ascii_case_insensitive: false,
+ rare_set: ByteSet::empty(),
+ byte_offsets: RareByteOffsets::empty(),
+ available: true,
+ count: 0,
+ rank_sum: 0,
+ }
+ }
+
+ /// Enable ASCII case insensitivity. When set, byte strings added to this
+ /// builder will be interpreted without respect to ASCII case.
+ fn ascii_case_insensitive(mut self, yes: bool) -> RareBytesBuilder {
+ self.ascii_case_insensitive = yes;
+ self
+ }
+
+ /// Build the rare bytes prefilter.
+ ///
+    /// If there are more than 3 distinct rare bytes, or if heuristics
+ /// otherwise determine that this prefilter should not be used, then `None`
+ /// is returned.
+ fn build(&self) -> Option<PrefilterObj> {
+ if !self.available || self.count > 3 {
+ return None;
+ }
+ let (mut bytes, mut len) = ([0; 3], 0);
+ for b in 0..=255 {
+ if self.rare_set.contains(b) {
+ bytes[len] = b as u8;
+ len += 1;
+ }
+ }
+ match len {
+ 0 => None,
+ 1 => Some(PrefilterObj::new(RareBytesOne {
+ byte1: bytes[0],
+ offset: self.byte_offsets.set[bytes[0] as usize],
+ })),
+ 2 => Some(PrefilterObj::new(RareBytesTwo {
+ offsets: self.byte_offsets,
+ byte1: bytes[0],
+ byte2: bytes[1],
+ })),
+ 3 => Some(PrefilterObj::new(RareBytesThree {
+ offsets: self.byte_offsets,
+ byte1: bytes[0],
+ byte2: bytes[1],
+ byte3: bytes[2],
+ })),
+ _ => unreachable!(),
+ }
+ }
+
+ /// Add a byte string to this builder.
+ ///
+ /// All patterns added to an Aho-Corasick automaton should be added to this
+ /// builder before attempting to construct the prefilter.
+ fn add(&mut self, bytes: &[u8]) {
+ // If we've already given up, then do nothing.
+ if !self.available {
+ return;
+ }
+ // If we've already blown our budget, then don't waste time looking
+ // for more rare bytes.
+ if self.count > 3 {
+ self.available = false;
+ return;
+ }
+ // If the pattern is too long, then our offset table is bunk, so
+ // give up.
+ if bytes.len() >= 256 {
+ self.available = false;
+ return;
+ }
+ let mut rarest = match bytes.get(0) {
+ None => return,
+ Some(&b) => (b, freq_rank(b)),
+ };
+ // The idea here is to look for the rarest byte in each pattern, and
+ // add that to our set. As a special exception, if we see a byte that
+ // we've already added, then we immediately stop and choose that byte,
+ // even if there's another rare byte in the pattern. This helps us
+ // apply the rare byte optimization in more cases by attempting to pick
+ // bytes that are in common between patterns. So for example, if we
+ // were searching for `Sherlock` and `lockjaw`, then this would pick
+ // `k` for both patterns, resulting in the use of `memchr` instead of
+ // `memchr2` for `k` and `j`.
+ let mut found = false;
+ for (pos, &b) in bytes.iter().enumerate() {
+ self.set_offset(pos, b);
+ if found {
+ continue;
+ }
+ if self.rare_set.contains(b) {
+ found = true;
+ continue;
+ }
+ let rank = freq_rank(b);
+ if rank < rarest.1 {
+ rarest = (b, rank);
+ }
+ }
+ if !found {
+ self.add_rare_byte(rarest.0);
+ }
+ }
+
+ fn set_offset(&mut self, pos: usize, byte: u8) {
+ // This unwrap is OK because pos is never bigger than our max.
+ let offset = RareByteOffset::new(pos).unwrap();
+ self.byte_offsets.set(byte, offset);
+ if self.ascii_case_insensitive {
+ self.byte_offsets.set(opposite_ascii_case(byte), offset);
+ }
+ }
+
+ fn add_rare_byte(&mut self, byte: u8) {
+ self.add_one_rare_byte(byte);
+ if self.ascii_case_insensitive {
+ self.add_one_rare_byte(opposite_ascii_case(byte));
+ }
+ }
+
+ fn add_one_rare_byte(&mut self, byte: u8) {
+ if self.rare_set.insert(byte) {
+ self.count += 1;
+ self.rank_sum += freq_rank(byte) as u16;
+ }
+ }
+}
+
+/// A prefilter for scanning for a single "rare" byte.
+#[derive(Clone, Debug)]
+struct RareBytesOne {
+ byte1: u8,
+ offset: RareByteOffset,
+}
+
+impl Prefilter for RareBytesOne {
+ fn next_candidate(
+ &self,
+ state: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ ) -> Candidate {
+ memchr(self.byte1, &haystack[at..])
+ .map(|i| {
+ let pos = at + i;
+ state.last_scan_at = pos;
+ cmp::max(at, pos.saturating_sub(self.offset.max as usize))
+ })
+ .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
+ }
+
+ fn clone_prefilter(&self) -> Box<dyn Prefilter> {
+ Box::new(self.clone())
+ }
+
+ fn heap_bytes(&self) -> usize {
+ 0
+ }
+}
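+
+// A small illustrative test of the shift described by `RareByteOffset`:
+// finding the rare byte `z` with a maximum pattern offset of 3 at haystack
+// position 10 reports a candidate starting at 10 - 3 = 7.
+#[cfg(test)]
+mod rare_byte_shift_sketch {
+    use super::{Candidate, Prefilter, PrefilterState, RareByteOffset, RareBytesOne};
+
+    #[test]
+    fn candidate_is_shifted_back_by_the_byte_offset() {
+        let pre =
+            RareBytesOne { byte1: b'z', offset: RareByteOffset { max: 3 } };
+        let mut state = PrefilterState::new(4);
+        match pre.next_candidate(&mut state, b"aaaaaaaaaazbcd", 0) {
+            Candidate::PossibleStartOfMatch(start) => assert_eq!(start, 7),
+            cand => panic!("unexpected candidate: {:?}", cand),
+        }
+    }
+}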
+
+/// A prefilter for scanning for two "rare" bytes.
+#[derive(Clone, Debug)]
+struct RareBytesTwo {
+ offsets: RareByteOffsets,
+ byte1: u8,
+ byte2: u8,
+}
+
+impl Prefilter for RareBytesTwo {
+ fn next_candidate(
+ &self,
+ state: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ ) -> Candidate {
+ memchr2(self.byte1, self.byte2, &haystack[at..])
+ .map(|i| {
+ let pos = at + i;
+ state.update_at(pos);
+ let offset = self.offsets.set[haystack[pos] as usize].max;
+ cmp::max(at, pos.saturating_sub(offset as usize))
+ })
+ .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
+ }
+
+ fn clone_prefilter(&self) -> Box<dyn Prefilter> {
+ Box::new(self.clone())
+ }
+
+ fn heap_bytes(&self) -> usize {
+ 0
+ }
+}
+
+/// A prefilter for scanning for three "rare" bytes.
+#[derive(Clone, Debug)]
+struct RareBytesThree {
+ offsets: RareByteOffsets,
+ byte1: u8,
+ byte2: u8,
+ byte3: u8,
+}
+
+impl Prefilter for RareBytesThree {
+ fn next_candidate(
+ &self,
+ state: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ ) -> Candidate {
+ memchr3(self.byte1, self.byte2, self.byte3, &haystack[at..])
+ .map(|i| {
+ let pos = at + i;
+ state.update_at(pos);
+ let offset = self.offsets.set[haystack[pos] as usize].max;
+ cmp::max(at, pos.saturating_sub(offset as usize))
+ })
+ .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
+ }
+
+ fn clone_prefilter(&self) -> Box<dyn Prefilter> {
+ Box::new(self.clone())
+ }
+
+ fn heap_bytes(&self) -> usize {
+ 0
+ }
+}
+
+/// A builder for constructing a starting byte prefilter.
+///
+/// A starting byte prefilter is a simplistic prefilter that looks for possible
+/// matches by reporting all positions corresponding to a particular byte. This
+/// generally only takes effect when there are at most 3 distinct possible
+/// starting bytes. e.g., the patterns `foo`, `bar`, and `baz` have two
+/// distinct starting bytes (`f` and `b`), and this prefilter returns all
+/// occurrences of either `f` or `b`.
+///
+/// In some cases, a heuristic frequency analysis may determine that it would
+/// be better not to use this prefilter even when there are 3 or fewer distinct
+/// starting bytes.
+#[derive(Clone, Debug)]
+struct StartBytesBuilder {
+ /// Whether this prefilter should account for ASCII case insensitivity or
+ /// not.
+ ascii_case_insensitive: bool,
+ /// The set of starting bytes observed.
+ byteset: Vec<bool>,
+ /// The number of bytes set to true in `byteset`.
+ count: usize,
+ /// The sum of frequency ranks for the rare bytes detected. This is
+ /// intended to give a heuristic notion of how rare the bytes are.
+ rank_sum: u16,
+}
+
+impl StartBytesBuilder {
+ /// Create a new builder for constructing a start byte prefilter.
+ fn new() -> StartBytesBuilder {
+ StartBytesBuilder {
+ ascii_case_insensitive: false,
+ byteset: vec![false; 256],
+ count: 0,
+ rank_sum: 0,
+ }
+ }
+
+ /// Enable ASCII case insensitivity. When set, byte strings added to this
+ /// builder will be interpreted without respect to ASCII case.
+ fn ascii_case_insensitive(mut self, yes: bool) -> StartBytesBuilder {
+ self.ascii_case_insensitive = yes;
+ self
+ }
+
+ /// Build the starting bytes prefilter.
+ ///
+ /// If there are more than 3 distinct starting bytes, or if heuristics
+ /// otherwise determine that this prefilter should not be used, then `None`
+ /// is returned.
+ fn build(&self) -> Option<PrefilterObj> {
+ if self.count > 3 {
+ return None;
+ }
+ let (mut bytes, mut len) = ([0; 3], 0);
+ for b in 0..256 {
+ if !self.byteset[b] {
+ continue;
+ }
+ // We don't handle non-ASCII bytes for now. Getting non-ASCII
+ // bytes right is trickier, since we generally don't want to put
+ // a leading UTF-8 code unit into a prefilter that isn't ASCII,
+            // since they can occur frequently. Instead, it would be better to use a
+ // continuation byte, but this requires more sophisticated analysis
+ // of the automaton and a richer prefilter API.
+ if b > 0x7F {
+ return None;
+ }
+ bytes[len] = b as u8;
+ len += 1;
+ }
+ match len {
+ 0 => None,
+ 1 => Some(PrefilterObj::new(StartBytesOne { byte1: bytes[0] })),
+ 2 => Some(PrefilterObj::new(StartBytesTwo {
+ byte1: bytes[0],
+ byte2: bytes[1],
+ })),
+ 3 => Some(PrefilterObj::new(StartBytesThree {
+ byte1: bytes[0],
+ byte2: bytes[1],
+ byte3: bytes[2],
+ })),
+ _ => unreachable!(),
+ }
+ }
+
+ /// Add a byte string to this builder.
+ ///
+ /// All patterns added to an Aho-Corasick automaton should be added to this
+ /// builder before attempting to construct the prefilter.
+ fn add(&mut self, bytes: &[u8]) {
+ if self.count > 3 {
+ return;
+ }
+ if let Some(&byte) = bytes.get(0) {
+ self.add_one_byte(byte);
+ if self.ascii_case_insensitive {
+ self.add_one_byte(opposite_ascii_case(byte));
+ }
+ }
+ }
+
+ fn add_one_byte(&mut self, byte: u8) {
+ if !self.byteset[byte as usize] {
+ self.byteset[byte as usize] = true;
+ self.count += 1;
+ self.rank_sum += freq_rank(byte) as u16;
+ }
+ }
+}
+
+/// A prefilter for scanning for a single starting byte.
+#[derive(Clone, Debug)]
+struct StartBytesOne {
+ byte1: u8,
+}
+
+impl Prefilter for StartBytesOne {
+ fn next_candidate(
+ &self,
+ _state: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ ) -> Candidate {
+ memchr(self.byte1, &haystack[at..])
+ .map(|i| at + i)
+ .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
+ }
+
+ fn clone_prefilter(&self) -> Box<dyn Prefilter> {
+ Box::new(self.clone())
+ }
+
+ fn heap_bytes(&self) -> usize {
+ 0
+ }
+}
+
+/// A prefilter for scanning for two starting bytes.
+#[derive(Clone, Debug)]
+struct StartBytesTwo {
+ byte1: u8,
+ byte2: u8,
+}
+
+impl Prefilter for StartBytesTwo {
+ fn next_candidate(
+ &self,
+ _state: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ ) -> Candidate {
+ memchr2(self.byte1, self.byte2, &haystack[at..])
+ .map(|i| at + i)
+ .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
+ }
+
+ fn clone_prefilter(&self) -> Box<dyn Prefilter> {
+ Box::new(self.clone())
+ }
+
+ fn heap_bytes(&self) -> usize {
+ 0
+ }
+}
+
+/// A prefilter for scanning for three starting bytes.
+#[derive(Clone, Debug)]
+struct StartBytesThree {
+ byte1: u8,
+ byte2: u8,
+ byte3: u8,
+}
+
+impl Prefilter for StartBytesThree {
+ fn next_candidate(
+ &self,
+ _state: &mut PrefilterState,
+ haystack: &[u8],
+ at: usize,
+ ) -> Candidate {
+ memchr3(self.byte1, self.byte2, self.byte3, &haystack[at..])
+ .map(|i| at + i)
+ .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
+ }
+
+ fn clone_prefilter(&self) -> Box<dyn Prefilter> {
+ Box::new(self.clone())
+ }
+
+ fn heap_bytes(&self) -> usize {
+ 0
+ }
+}
+
+/// Return the next candidate reported by the given prefilter while
+/// simultaneously updating the given prestate.
+///
+/// The caller is responsible for checking the prestate before deciding whether
+/// to initiate a search.
+#[inline]
+pub fn next<P: Prefilter>(
+ prestate: &mut PrefilterState,
+ prefilter: P,
+ haystack: &[u8],
+ at: usize,
+) -> Candidate {
+ let cand = prefilter.next_candidate(prestate, haystack, at);
+ match cand {
+ Candidate::None => {
+ prestate.update_skipped_bytes(haystack.len() - at);
+ }
+ Candidate::Match(ref m) => {
+ prestate.update_skipped_bytes(m.start() - at);
+ }
+ Candidate::PossibleStartOfMatch(i) => {
+ prestate.update_skipped_bytes(i - at);
+ }
+ }
+ cand
+}
+
+/// If the given byte is an ASCII letter, then return it in the opposite case.
+/// e.g., Given `b'A'`, this returns `b'a'`, and given `b'a'`, this returns
+/// `b'A'`. If the given byte is not an ASCII letter, then it is returned
+/// unchanged.
+pub fn opposite_ascii_case(b: u8) -> u8 {
+ if b'A' <= b && b <= b'Z' {
+ b.to_ascii_lowercase()
+ } else if b'a' <= b && b <= b'z' {
+ b.to_ascii_uppercase()
+ } else {
+ b
+ }
+}
+
+/// Return the frequency rank of the given byte. The higher the rank, the more
+/// common the byte (heuristically speaking).
+fn freq_rank(b: u8) -> u8 {
+ use byte_frequencies::BYTE_FREQUENCIES;
+ BYTE_FREQUENCIES[b as usize]
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn scratch() {
+ let mut b = Builder::new(MatchKind::LeftmostFirst);
+ b.add(b"Sherlock");
+ b.add(b"locjaw");
+ // b.add(b"Sherlock");
+ // b.add(b"Holmes");
+ // b.add(b"Watson");
+ // b.add("Шерлок Холмс".as_bytes());
+ // b.add("Джон Уотсон".as_bytes());
+
+ let s = b.build().unwrap();
+ println!("{:?}", s);
+ }
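+
+    // A minimal illustrative check of `opposite_ascii_case`: ASCII letters
+    // flip case and every other byte is returned unchanged.
+    #[test]
+    fn opposite_ascii_case_flips_letters_only() {
+        assert_eq!(opposite_ascii_case(b'A'), b'a');
+        assert_eq!(opposite_ascii_case(b'a'), b'A');
+        assert_eq!(opposite_ascii_case(b'3'), b'3');
+    }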
+}
diff --git a/src/state_id.rs b/src/state_id.rs
new file mode 100644
index 0000000..22f6be2
--- /dev/null
+++ b/src/state_id.rs
@@ -0,0 +1,192 @@
+use std::fmt::Debug;
+use std::hash::Hash;
+
+use error::{Error, Result};
+
+// NOTE: Most of this code was copied from regex-automata, but without the
+// (de)serialization specific stuff.
+
+/// Check that the premultiplication of the given state identifier can
+/// fit into the representation indicated by `S`. If it cannot, or if it
+/// overflows `usize` itself, then an error is returned.
+pub fn premultiply_overflow_error<S: StateID>(
+ last_state: S,
+ alphabet_len: usize,
+) -> Result<()> {
+ let requested = match last_state.to_usize().checked_mul(alphabet_len) {
+ Some(requested) => requested,
+ None => return Err(Error::premultiply_overflow(0, 0)),
+ };
+ if requested > S::max_id() {
+ return Err(Error::premultiply_overflow(S::max_id(), requested));
+ }
+ Ok(())
+}
+
+/// Convert the given `usize` to the chosen state identifier
+/// representation. If the given value cannot fit in the chosen
+/// representation, then an error is returned.
+pub fn usize_to_state_id<S: StateID>(value: usize) -> Result<S> {
+ if value > S::max_id() {
+ Err(Error::state_id_overflow(S::max_id()))
+ } else {
+ Ok(S::from_usize(value))
+ }
+}
+
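+// A minimal sketch of the overflow check above: a value that does not fit in
+// the chosen representation is rejected rather than silently truncated.
+#[cfg(test)]
+mod state_id_sketch {
+    use super::usize_to_state_id;
+
+    #[test]
+    fn conversions_that_do_not_fit_are_errors() {
+        assert!(usize_to_state_id::<u8>(255).is_ok());
+        assert!(usize_to_state_id::<u8>(256).is_err());
+    }
+}
+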
+/// Return the unique identifier for an automaton's fail state in the chosen
+/// representation indicated by `S`.
+pub fn fail_id<S: StateID>() -> S {
+ S::from_usize(0)
+}
+
+/// Return the unique identifier for an automaton's dead state in the chosen
+/// representation indicated by `S`.
+pub fn dead_id<S: StateID>() -> S {
+ S::from_usize(1)
+}
+
+mod private {
+ /// Sealed stops crates other than aho-corasick from implementing any
+ /// traits that use it.
+ pub trait Sealed {}
+ impl Sealed for u8 {}
+ impl Sealed for u16 {}
+ impl Sealed for u32 {}
+ impl Sealed for u64 {}
+ impl Sealed for usize {}
+}
+
+/// A trait describing the representation of an automaton's state identifier.
+///
+/// The purpose of this trait is to safely express both the possible state
+/// identifier representations that can be used in an automaton and to convert
+/// between state identifier representations and types that can be used to
+/// efficiently index memory (such as `usize`).
+///
+/// In general, one should not need to implement this trait explicitly. Indeed,
+/// for now, this trait is sealed such that it cannot be implemented by any
+/// other type. In particular, this crate provides implementations for `u8`,
+/// `u16`, `u32`, `u64` and `usize`. (`u32` and `u64` are only provided for
+/// targets that can represent all corresponding values in a `usize`.)
+pub trait StateID:
+ private::Sealed
+ + Clone
+ + Copy
+ + Debug
+ + Eq
+ + Hash
+ + PartialEq
+ + PartialOrd
+ + Ord
+{
+ /// Convert from a `usize` to this implementation's representation.
+ ///
+ /// Implementors may assume that `n <= Self::max_id`. That is, implementors
+ /// do not need to check whether `n` can fit inside this implementation's
+ /// representation.
+ fn from_usize(n: usize) -> Self;
+
+ /// Convert this implementation's representation to a `usize`.
+ ///
+ /// Implementors must not return a `usize` value greater than
+ /// `Self::max_id` and must not permit overflow when converting between the
+ /// implementor's representation and `usize`. In general, the preferred
+ /// way for implementors to achieve this is to simply not provide
+ /// implementations of `StateID` that cannot fit into the target platform's
+ /// `usize`.
+ fn to_usize(self) -> usize;
+
+ /// Return the maximum state identifier supported by this representation.
+ ///
+ /// Implementors must return a correct bound. Doing otherwise may result
+ /// in unspecified behavior (but will not violate memory safety).
+ fn max_id() -> usize;
+}
+
+impl StateID for usize {
+ #[inline]
+ fn from_usize(n: usize) -> usize {
+ n
+ }
+
+ #[inline]
+ fn to_usize(self) -> usize {
+ self
+ }
+
+ #[inline]
+ fn max_id() -> usize {
+ ::std::usize::MAX
+ }
+}
+
+impl StateID for u8 {
+ #[inline]
+ fn from_usize(n: usize) -> u8 {
+ n as u8
+ }
+
+ #[inline]
+ fn to_usize(self) -> usize {
+ self as usize
+ }
+
+ #[inline]
+ fn max_id() -> usize {
+ ::std::u8::MAX as usize
+ }
+}
+
+impl StateID for u16 {
+ #[inline]
+ fn from_usize(n: usize) -> u16 {
+ n as u16
+ }
+
+ #[inline]
+ fn to_usize(self) -> usize {
+ self as usize
+ }
+
+ #[inline]
+ fn max_id() -> usize {
+ ::std::u16::MAX as usize
+ }
+}
+
+#[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
+impl StateID for u32 {
+ #[inline]
+ fn from_usize(n: usize) -> u32 {
+ n as u32
+ }
+
+ #[inline]
+ fn to_usize(self) -> usize {
+ self as usize
+ }
+
+ #[inline]
+ fn max_id() -> usize {
+ ::std::u32::MAX as usize
+ }
+}
+
+#[cfg(target_pointer_width = "64")]
+impl StateID for u64 {
+ #[inline]
+ fn from_usize(n: usize) -> u64 {
+ n as u64
+ }
+
+ #[inline]
+ fn to_usize(self) -> usize {
+ self as usize
+ }
+
+ #[inline]
+ fn max_id() -> usize {
+ ::std::u64::MAX as usize
+ }
+}
diff --git a/src/tests.rs b/src/tests.rs
new file mode 100644
index 0000000..0ae31f0
--- /dev/null
+++ b/src/tests.rs
@@ -0,0 +1,1152 @@
+use std::collections::HashMap;
+use std::io;
+use std::usize;
+
+use {AhoCorasickBuilder, Match, MatchKind};
+
+/// A description of a single test against an Aho-Corasick automaton.
+///
+/// A single test may not necessarily pass on every configuration of an
+/// Aho-Corasick automaton. The tests are categorized and grouped appropriately
+/// below.
+#[derive(Clone, Debug, Eq, PartialEq)]
+struct SearchTest {
+ /// The name of this test, for debugging.
+ name: &'static str,
+ /// The patterns to search for.
+ patterns: &'static [&'static str],
+ /// The text to search.
+ haystack: &'static str,
+ /// Each match is a triple of (pattern_index, start, end), where
+ /// pattern_index is an index into `patterns` and `start`/`end` are indices
+ /// into `haystack`.
+ matches: &'static [(usize, usize, usize)],
+}
+
+/// Short-hand constructor for SearchTest. We use it a lot below.
+macro_rules! t {
+ ($name:ident, $patterns:expr, $haystack:expr, $matches:expr) => {
+ SearchTest {
+ name: stringify!($name),
+ patterns: $patterns,
+ haystack: $haystack,
+ matches: $matches,
+ }
+ };
+}
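+// For example, `t!(basic010, &["a"], "a", &[(0, 0, 1)])` below expands to
+// `SearchTest { name: "basic010", patterns: &["a"], haystack: "a",
+// matches: &[(0, 0, 1)] }`.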
+
+/// A collection of test groups.
+type TestCollection = &'static [&'static [SearchTest]];
+
+// Define several collections corresponding to the different types of match
+// semantics supported by Aho-Corasick. These collections have some overlap,
+// but each collection should have some tests that no other collection has.
+
+/// Tests for Aho-Corasick's standard non-overlapping match semantics.
+const AC_STANDARD_NON_OVERLAPPING: TestCollection =
+ &[BASICS, NON_OVERLAPPING, STANDARD, REGRESSION];
+
+/// Tests for Aho-Corasick's anchored standard non-overlapping match semantics.
+const AC_STANDARD_ANCHORED_NON_OVERLAPPING: TestCollection =
+ &[ANCHORED_BASICS, ANCHORED_NON_OVERLAPPING, STANDARD_ANCHORED];
+
+/// Tests for Aho-Corasick's standard overlapping match semantics.
+const AC_STANDARD_OVERLAPPING: TestCollection =
+ &[BASICS, OVERLAPPING, REGRESSION];
+
+/// Tests for Aho-Corasick's anchored standard overlapping match semantics.
+const AC_STANDARD_ANCHORED_OVERLAPPING: TestCollection =
+ &[ANCHORED_BASICS, ANCHORED_OVERLAPPING];
+
+/// Tests for Aho-Corasick's leftmost-first match semantics.
+const AC_LEFTMOST_FIRST: TestCollection =
+ &[BASICS, NON_OVERLAPPING, LEFTMOST, LEFTMOST_FIRST, REGRESSION];
+
+/// Tests for Aho-Corasick's anchored leftmost-first match semantics.
+const AC_LEFTMOST_FIRST_ANCHORED: TestCollection = &[
+ ANCHORED_BASICS,
+ ANCHORED_NON_OVERLAPPING,
+ ANCHORED_LEFTMOST,
+ ANCHORED_LEFTMOST_FIRST,
+];
+
+/// Tests for Aho-Corasick's leftmost-longest match semantics.
+const AC_LEFTMOST_LONGEST: TestCollection =
+ &[BASICS, NON_OVERLAPPING, LEFTMOST, LEFTMOST_LONGEST, REGRESSION];
+
+/// Tests for Aho-Corasick's anchored leftmost-longest match semantics.
+const AC_LEFTMOST_LONGEST_ANCHORED: TestCollection = &[
+ ANCHORED_BASICS,
+ ANCHORED_NON_OVERLAPPING,
+ ANCHORED_LEFTMOST,
+ ANCHORED_LEFTMOST_LONGEST,
+];
+
+// Now define the individual tests that make up the collections above.
+
+/// A collection of tests for the Aho-Corasick algorithm that should always be
+/// true regardless of match semantics. That is, all combinations of
+/// leftmost-{shortest, first, longest} x {overlapping, non-overlapping}
+/// should produce the same answer.
+const BASICS: &'static [SearchTest] = &[
+ t!(basic000, &[], "", &[]),
+ t!(basic001, &["a"], "", &[]),
+ t!(basic010, &["a"], "a", &[(0, 0, 1)]),
+ t!(basic020, &["a"], "aa", &[(0, 0, 1), (0, 1, 2)]),
+ t!(basic030, &["a"], "aaa", &[(0, 0, 1), (0, 1, 2), (0, 2, 3)]),
+ t!(basic040, &["a"], "aba", &[(0, 0, 1), (0, 2, 3)]),
+ t!(basic050, &["a"], "bba", &[(0, 2, 3)]),
+ t!(basic060, &["a"], "bbb", &[]),
+ t!(basic070, &["a"], "bababbbba", &[(0, 1, 2), (0, 3, 4), (0, 8, 9)]),
+ t!(basic100, &["aa"], "", &[]),
+ t!(basic110, &["aa"], "aa", &[(0, 0, 2)]),
+ t!(basic120, &["aa"], "aabbaa", &[(0, 0, 2), (0, 4, 6)]),
+ t!(basic130, &["aa"], "abbab", &[]),
+ t!(basic140, &["aa"], "abbabaa", &[(0, 5, 7)]),
+ t!(basic200, &["abc"], "abc", &[(0, 0, 3)]),
+ t!(basic210, &["abc"], "zazabzabcz", &[(0, 6, 9)]),
+ t!(basic220, &["abc"], "zazabczabcz", &[(0, 3, 6), (0, 7, 10)]),
+ t!(basic300, &["a", "b"], "", &[]),
+ t!(basic310, &["a", "b"], "z", &[]),
+ t!(basic320, &["a", "b"], "b", &[(1, 0, 1)]),
+ t!(basic330, &["a", "b"], "a", &[(0, 0, 1)]),
+ t!(
+ basic340,
+ &["a", "b"],
+ "abba",
+ &[(0, 0, 1), (1, 1, 2), (1, 2, 3), (0, 3, 4),]
+ ),
+ t!(
+ basic350,
+ &["b", "a"],
+ "abba",
+ &[(1, 0, 1), (0, 1, 2), (0, 2, 3), (1, 3, 4),]
+ ),
+ t!(basic360, &["abc", "bc"], "xbc", &[(1, 1, 3),]),
+ t!(basic400, &["foo", "bar"], "", &[]),
+ t!(basic410, &["foo", "bar"], "foobar", &[(0, 0, 3), (1, 3, 6),]),
+ t!(basic420, &["foo", "bar"], "barfoo", &[(1, 0, 3), (0, 3, 6),]),
+ t!(basic430, &["foo", "bar"], "foofoo", &[(0, 0, 3), (0, 3, 6),]),
+ t!(basic440, &["foo", "bar"], "barbar", &[(1, 0, 3), (1, 3, 6),]),
+ t!(basic450, &["foo", "bar"], "bafofoo", &[(0, 4, 7),]),
+ t!(basic460, &["bar", "foo"], "bafofoo", &[(1, 4, 7),]),
+ t!(basic470, &["foo", "bar"], "fobabar", &[(1, 4, 7),]),
+ t!(basic480, &["bar", "foo"], "fobabar", &[(0, 4, 7),]),
+ t!(basic600, &[""], "", &[(0, 0, 0)]),
+ t!(basic610, &[""], "a", &[(0, 0, 0), (0, 1, 1)]),
+ t!(basic620, &[""], "abc", &[(0, 0, 0), (0, 1, 1), (0, 2, 2), (0, 3, 3)]),
+ t!(basic700, &["yabcdef", "abcdezghi"], "yabcdefghi", &[(0, 0, 7),]),
+ t!(basic710, &["yabcdef", "abcdezghi"], "yabcdezghi", &[(1, 1, 10),]),
+ t!(
+ basic720,
+ &["yabcdef", "bcdeyabc", "abcdezghi"],
+ "yabcdezghi",
+ &[(2, 1, 10),]
+ ),
+];
+
+/// A collection of *anchored* tests for the Aho-Corasick algorithm that should
+/// always be true regardless of match semantics. That is, all combinations of
+/// leftmost-{shortest, first, longest} x {overlapping, non-overlapping} should
+/// produce the same answer.
+const ANCHORED_BASICS: &'static [SearchTest] = &[
+ t!(abasic000, &[], "", &[]),
+ t!(abasic010, &[""], "", &[(0, 0, 0)]),
+ t!(abasic020, &[""], "a", &[(0, 0, 0)]),
+ t!(abasic030, &[""], "abc", &[(0, 0, 0)]),
+ t!(abasic100, &["a"], "a", &[(0, 0, 1)]),
+ t!(abasic110, &["a"], "aa", &[(0, 0, 1)]),
+ t!(abasic120, &["a", "b"], "ab", &[(0, 0, 1)]),
+ t!(abasic130, &["a", "b"], "ba", &[(1, 0, 1)]),
+ t!(abasic140, &["foo", "foofoo"], "foo", &[(0, 0, 3)]),
+ t!(abasic150, &["foofoo", "foo"], "foo", &[(1, 0, 3)]),
+];
+
+/// Tests for non-overlapping standard match semantics.
+///
+/// These tests generally shouldn't pass for leftmost-{first,longest}, although
+/// some do in order to write clearer tests. For example, standard000 will
+/// pass with leftmost-first semantics, but standard010 will not. We write
+/// both to emphasize how the match semantics work.
+const STANDARD: &'static [SearchTest] = &[
+ t!(standard000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]),
+ t!(standard010, &["abcd", "ab"], "abcd", &[(1, 0, 2)]),
+ t!(standard020, &["abcd", "ab", "abc"], "abcd", &[(1, 0, 2)]),
+ t!(standard030, &["abcd", "abc", "ab"], "abcd", &[(2, 0, 2)]),
+ t!(standard040, &["a", ""], "a", &[(1, 0, 0), (1, 1, 1)]),
+ t!(
+ standard400,
+ &["abcd", "bcd", "cd", "b"],
+ "abcd",
+ &[(3, 1, 2), (2, 2, 4),]
+ ),
+ t!(standard410, &["", "a"], "a", &[(0, 0, 0), (0, 1, 1),]),
+ t!(standard420, &["", "a"], "aa", &[(0, 0, 0), (0, 1, 1), (0, 2, 2),]),
+ t!(standard430, &["", "a", ""], "a", &[(0, 0, 0), (0, 1, 1),]),
+ t!(standard440, &["a", "", ""], "a", &[(1, 0, 0), (1, 1, 1),]),
+ t!(standard450, &["", "", "a"], "a", &[(0, 0, 0), (0, 1, 1),]),
+];
+
+/// Like STANDARD, but for anchored searches.
+const STANDARD_ANCHORED: &'static [SearchTest] = &[
+ t!(astandard000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]),
+ t!(astandard010, &["abcd", "ab"], "abcd", &[(1, 0, 2)]),
+ t!(astandard020, &["abcd", "ab", "abc"], "abcd", &[(1, 0, 2)]),
+ t!(astandard030, &["abcd", "abc", "ab"], "abcd", &[(2, 0, 2)]),
+ t!(astandard040, &["a", ""], "a", &[(1, 0, 0)]),
+ t!(astandard050, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4)]),
+ t!(astandard410, &["", "a"], "a", &[(0, 0, 0)]),
+ t!(astandard420, &["", "a"], "aa", &[(0, 0, 0)]),
+ t!(astandard430, &["", "a", ""], "a", &[(0, 0, 0)]),
+ t!(astandard440, &["a", "", ""], "a", &[(1, 0, 0)]),
+ t!(astandard450, &["", "", "a"], "a", &[(0, 0, 0)]),
+];
+
+/// Tests for non-overlapping leftmost match semantics. These should pass for
+/// both leftmost-first and leftmost-longest match kinds. Stated differently,
+/// among ambiguous matches, the longest match and the match that appeared
+/// first when constructing the automaton should always be the same.
+const LEFTMOST: &'static [SearchTest] = &[
+ t!(leftmost000, &["ab", "ab"], "abcd", &[(0, 0, 2)]),
+ t!(leftmost010, &["a", ""], "a", &[(0, 0, 1), (1, 1, 1)]),
+ t!(leftmost020, &["", ""], "a", &[(0, 0, 0), (0, 1, 1)]),
+ t!(leftmost030, &["a", "ab"], "aa", &[(0, 0, 1), (0, 1, 2)]),
+ t!(leftmost031, &["ab", "a"], "aa", &[(1, 0, 1), (1, 1, 2)]),
+ t!(leftmost032, &["ab", "a"], "xayabbbz", &[(1, 1, 2), (0, 3, 5)]),
+ t!(leftmost300, &["abcd", "bce", "b"], "abce", &[(1, 1, 4)]),
+ t!(leftmost310, &["abcd", "ce", "bc"], "abce", &[(2, 1, 3)]),
+ t!(leftmost320, &["abcd", "bce", "ce", "b"], "abce", &[(1, 1, 4)]),
+ t!(leftmost330, &["abcd", "bce", "cz", "bc"], "abcz", &[(3, 1, 3)]),
+ t!(leftmost340, &["bce", "cz", "bc"], "bcz", &[(2, 0, 2)]),
+ t!(leftmost350, &["abc", "bd", "ab"], "abd", &[(2, 0, 2)]),
+ t!(
+ leftmost360,
+ &["abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(2, 0, 8),]
+ ),
+ t!(
+ leftmost370,
+ &["abcdefghi", "cde", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8),]
+ ),
+ t!(
+ leftmost380,
+ &["abcdefghi", "hz", "abcdefgh", "a"],
+ "abcdefghz",
+ &[(2, 0, 8),]
+ ),
+ t!(
+ leftmost390,
+ &["b", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8),]
+ ),
+ t!(
+ leftmost400,
+ &["h", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8),]
+ ),
+ t!(
+ leftmost410,
+ &["z", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8), (0, 8, 9),]
+ ),
+];
+
+/// Like LEFTMOST, but for anchored searches.
+const ANCHORED_LEFTMOST: &'static [SearchTest] = &[
+ t!(aleftmost000, &["ab", "ab"], "abcd", &[(0, 0, 2)]),
+ t!(aleftmost010, &["a", ""], "a", &[(0, 0, 1)]),
+ t!(aleftmost020, &["", ""], "a", &[(0, 0, 0)]),
+ t!(aleftmost030, &["a", "ab"], "aa", &[(0, 0, 1)]),
+ t!(aleftmost031, &["ab", "a"], "aa", &[(1, 0, 1)]),
+ t!(aleftmost032, &["ab", "a"], "xayabbbz", &[]),
+ t!(aleftmost300, &["abcd", "bce", "b"], "abce", &[]),
+ t!(aleftmost310, &["abcd", "ce", "bc"], "abce", &[]),
+ t!(aleftmost320, &["abcd", "bce", "ce", "b"], "abce", &[]),
+ t!(aleftmost330, &["abcd", "bce", "cz", "bc"], "abcz", &[]),
+ t!(aleftmost340, &["bce", "cz", "bc"], "bcz", &[(2, 0, 2)]),
+ t!(aleftmost350, &["abc", "bd", "ab"], "abd", &[(2, 0, 2)]),
+ t!(
+ aleftmost360,
+ &["abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(2, 0, 8),]
+ ),
+ t!(
+ aleftmost370,
+ &["abcdefghi", "cde", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8),]
+ ),
+ t!(
+ aleftmost380,
+ &["abcdefghi", "hz", "abcdefgh", "a"],
+ "abcdefghz",
+ &[(2, 0, 8),]
+ ),
+ t!(
+ aleftmost390,
+ &["b", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8),]
+ ),
+ t!(
+ aleftmost400,
+ &["h", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8),]
+ ),
+ t!(
+ aleftmost410,
+ &["z", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8)]
+ ),
+];
+
+/// Tests for non-overlapping leftmost-first match semantics. These tests
+/// should generally be specific to leftmost-first, which means they should
+/// generally fail under leftmost-longest semantics.
+const LEFTMOST_FIRST: &'static [SearchTest] = &[
+ t!(leftfirst000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]),
+ t!(leftfirst010, &["", "a"], "a", &[(0, 0, 0), (0, 1, 1)]),
+ t!(leftfirst011, &["", "a", ""], "a", &[(0, 0, 0), (0, 1, 1),]),
+ t!(leftfirst012, &["a", "", ""], "a", &[(0, 0, 1), (1, 1, 1),]),
+ t!(leftfirst013, &["", "", "a"], "a", &[(0, 0, 0), (0, 1, 1),]),
+ t!(leftfirst020, &["abcd", "ab"], "abcd", &[(0, 0, 4)]),
+ t!(leftfirst030, &["ab", "ab"], "abcd", &[(0, 0, 2)]),
+ t!(leftfirst040, &["a", "ab"], "xayabbbz", &[(0, 1, 2), (0, 3, 4)]),
+ t!(leftfirst100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[(1, 1, 5)]),
+ t!(leftfirst110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[(1, 1, 6)]),
+ t!(leftfirst300, &["abcd", "b", "bce"], "abce", &[(1, 1, 2)]),
+ t!(
+ leftfirst310,
+ &["abcd", "b", "bce", "ce"],
+ "abce",
+ &[(1, 1, 2), (3, 2, 4),]
+ ),
+ t!(
+ leftfirst320,
+ &["a", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(0, 0, 1), (2, 7, 9),]
+ ),
+ t!(leftfirst330, &["a", "abab"], "abab", &[(0, 0, 1), (0, 2, 3)]),
+];
+
+/// Like LEFTMOST_FIRST, but for anchored searches.
+const ANCHORED_LEFTMOST_FIRST: &'static [SearchTest] = &[
+ t!(aleftfirst000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]),
+ t!(aleftfirst010, &["", "a"], "a", &[(0, 0, 0)]),
+ t!(aleftfirst011, &["", "a", ""], "a", &[(0, 0, 0)]),
+ t!(aleftfirst012, &["a", "", ""], "a", &[(0, 0, 1)]),
+ t!(aleftfirst013, &["", "", "a"], "a", &[(0, 0, 0)]),
+ t!(aleftfirst020, &["abcd", "ab"], "abcd", &[(0, 0, 4)]),
+ t!(aleftfirst030, &["ab", "ab"], "abcd", &[(0, 0, 2)]),
+ t!(aleftfirst040, &["a", "ab"], "xayabbbz", &[]),
+ t!(aleftfirst100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[]),
+ t!(aleftfirst110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[]),
+ t!(aleftfirst300, &["abcd", "b", "bce"], "abce", &[]),
+ t!(aleftfirst310, &["abcd", "b", "bce", "ce"], "abce", &[]),
+ t!(
+ aleftfirst320,
+ &["a", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(0, 0, 1)]
+ ),
+ t!(aleftfirst330, &["a", "abab"], "abab", &[(0, 0, 1)]),
+];
+
+/// Tests for non-overlapping leftmost-longest match semantics. These tests
+/// should generally be specific to leftmost-longest, which means they should
+/// generally fail under leftmost-first semantics.
+const LEFTMOST_LONGEST: &'static [SearchTest] = &[
+ t!(leftlong000, &["ab", "abcd"], "abcd", &[(1, 0, 4)]),
+ t!(leftlong010, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4),]),
+ t!(leftlong020, &["", "a"], "a", &[(1, 0, 1), (0, 1, 1),]),
+ t!(leftlong021, &["", "a", ""], "a", &[(1, 0, 1), (0, 1, 1),]),
+ t!(leftlong022, &["a", "", ""], "a", &[(0, 0, 1), (1, 1, 1),]),
+ t!(leftlong023, &["", "", "a"], "a", &[(2, 0, 1), (0, 1, 1),]),
+ t!(leftlong030, &["", "a"], "aa", &[(1, 0, 1), (1, 1, 2), (0, 2, 2),]),
+ t!(leftlong040, &["a", "ab"], "a", &[(0, 0, 1)]),
+ t!(leftlong050, &["a", "ab"], "ab", &[(1, 0, 2)]),
+ t!(leftlong060, &["ab", "a"], "a", &[(1, 0, 1)]),
+ t!(leftlong070, &["ab", "a"], "ab", &[(0, 0, 2)]),
+ t!(leftlong100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[(2, 1, 6)]),
+ t!(leftlong110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[(1, 1, 6)]),
+ t!(leftlong300, &["abcd", "b", "bce"], "abce", &[(2, 1, 4)]),
+ t!(
+ leftlong310,
+ &["a", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8),]
+ ),
+ t!(leftlong320, &["a", "abab"], "abab", &[(1, 0, 4)]),
+ t!(leftlong330, &["abcd", "b", "ce"], "abce", &[(1, 1, 2), (2, 2, 4),]),
+ t!(leftlong340, &["a", "ab"], "xayabbbz", &[(0, 1, 2), (1, 3, 5)]),
+];
+
+/// Like LEFTMOST_LONGEST, but for anchored searches.
+const ANCHORED_LEFTMOST_LONGEST: &'static [SearchTest] = &[
+ t!(aleftlong000, &["ab", "abcd"], "abcd", &[(1, 0, 4)]),
+ t!(aleftlong010, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4),]),
+ t!(aleftlong020, &["", "a"], "a", &[(1, 0, 1)]),
+ t!(aleftlong021, &["", "a", ""], "a", &[(1, 0, 1)]),
+ t!(aleftlong022, &["a", "", ""], "a", &[(0, 0, 1)]),
+ t!(aleftlong023, &["", "", "a"], "a", &[(2, 0, 1)]),
+ t!(aleftlong030, &["", "a"], "aa", &[(1, 0, 1)]),
+ t!(aleftlong040, &["a", "ab"], "a", &[(0, 0, 1)]),
+ t!(aleftlong050, &["a", "ab"], "ab", &[(1, 0, 2)]),
+ t!(aleftlong060, &["ab", "a"], "a", &[(1, 0, 1)]),
+ t!(aleftlong070, &["ab", "a"], "ab", &[(0, 0, 2)]),
+ t!(aleftlong100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[]),
+ t!(aleftlong110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[]),
+ t!(aleftlong300, &["abcd", "b", "bce"], "abce", &[]),
+ t!(
+ aleftlong310,
+ &["a", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8),]
+ ),
+ t!(aleftlong320, &["a", "abab"], "abab", &[(1, 0, 4)]),
+ t!(aleftlong330, &["abcd", "b", "ce"], "abce", &[]),
+ t!(aleftlong340, &["a", "ab"], "xayabbbz", &[]),
+];
+
+/// Tests for non-overlapping match semantics.
+///
+/// Generally these tests shouldn't pass when using overlapping semantics.
+/// These should pass for both standard and leftmost match semantics.
+const NON_OVERLAPPING: &'static [SearchTest] = &[
+ t!(nover010, &["abcd", "bcd", "cd"], "abcd", &[(0, 0, 4),]),
+ t!(nover020, &["bcd", "cd", "abcd"], "abcd", &[(2, 0, 4),]),
+ t!(nover030, &["abc", "bc"], "zazabcz", &[(0, 3, 6),]),
+ t!(
+ nover100,
+ &["ab", "ba"],
+ "abababa",
+ &[(0, 0, 2), (0, 2, 4), (0, 4, 6),]
+ ),
+ t!(nover200, &["foo", "foo"], "foobarfoo", &[(0, 0, 3), (0, 6, 9),]),
+ t!(nover300, &["", ""], "", &[(0, 0, 0),]),
+ t!(nover310, &["", ""], "a", &[(0, 0, 0), (0, 1, 1),]),
+];
+
+/// Like NON_OVERLAPPING, but for anchored searches.
+const ANCHORED_NON_OVERLAPPING: &'static [SearchTest] = &[
+ t!(anover010, &["abcd", "bcd", "cd"], "abcd", &[(0, 0, 4),]),
+ t!(anover020, &["bcd", "cd", "abcd"], "abcd", &[(2, 0, 4),]),
+ t!(anover030, &["abc", "bc"], "zazabcz", &[]),
+ t!(anover100, &["ab", "ba"], "abababa", &[(0, 0, 2)]),
+ t!(anover200, &["foo", "foo"], "foobarfoo", &[(0, 0, 3)]),
+ t!(anover300, &["", ""], "", &[(0, 0, 0),]),
+ t!(anover310, &["", ""], "a", &[(0, 0, 0)]),
+];
+
+/// Tests for overlapping match semantics.
+///
+/// This only supports standard match semantics, since leftmost-{first,longest}
+/// do not support overlapping matches.
+const OVERLAPPING: &'static [SearchTest] = &[
+ t!(
+ over000,
+ &["abcd", "bcd", "cd", "b"],
+ "abcd",
+ &[(3, 1, 2), (0, 0, 4), (1, 1, 4), (2, 2, 4),]
+ ),
+ t!(
+ over010,
+ &["bcd", "cd", "b", "abcd"],
+ "abcd",
+ &[(2, 1, 2), (3, 0, 4), (0, 1, 4), (1, 2, 4),]
+ ),
+ t!(
+ over020,
+ &["abcd", "bcd", "cd"],
+ "abcd",
+ &[(0, 0, 4), (1, 1, 4), (2, 2, 4),]
+ ),
+ t!(
+ over030,
+ &["bcd", "abcd", "cd"],
+ "abcd",
+ &[(1, 0, 4), (0, 1, 4), (2, 2, 4),]
+ ),
+ t!(
+ over040,
+ &["bcd", "cd", "abcd"],
+ "abcd",
+ &[(2, 0, 4), (0, 1, 4), (1, 2, 4),]
+ ),
+ t!(over050, &["abc", "bc"], "zazabcz", &[(0, 3, 6), (1, 4, 6),]),
+ t!(
+ over100,
+ &["ab", "ba"],
+ "abababa",
+ &[(0, 0, 2), (1, 1, 3), (0, 2, 4), (1, 3, 5), (0, 4, 6), (1, 5, 7),]
+ ),
+ t!(
+ over200,
+ &["foo", "foo"],
+ "foobarfoo",
+ &[(0, 0, 3), (1, 0, 3), (0, 6, 9), (1, 6, 9),]
+ ),
+ t!(over300, &["", ""], "", &[(0, 0, 0), (1, 0, 0),]),
+ t!(
+ over310,
+ &["", ""],
+ "a",
+ &[(0, 0, 0), (1, 0, 0), (0, 1, 1), (1, 1, 1),]
+ ),
+ t!(over320, &["", "a"], "a", &[(0, 0, 0), (1, 0, 1), (0, 1, 1),]),
+ t!(
+ over330,
+ &["", "a", ""],
+ "a",
+ &[(0, 0, 0), (2, 0, 0), (1, 0, 1), (0, 1, 1), (2, 1, 1),]
+ ),
+ t!(
+ over340,
+ &["a", "", ""],
+ "a",
+ &[(1, 0, 0), (2, 0, 0), (0, 0, 1), (1, 1, 1), (2, 1, 1),]
+ ),
+ t!(
+ over350,
+ &["", "", "a"],
+ "a",
+ &[(0, 0, 0), (1, 0, 0), (2, 0, 1), (0, 1, 1), (1, 1, 1),]
+ ),
+ t!(
+ over360,
+ &["foo", "foofoo"],
+ "foofoo",
+ &[(0, 0, 3), (1, 0, 6), (0, 3, 6)]
+ ),
+];
+
+/// Like OVERLAPPING, but for anchored searches.
+const ANCHORED_OVERLAPPING: &'static [SearchTest] = &[
+ t!(aover000, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4)]),
+ t!(aover010, &["bcd", "cd", "b", "abcd"], "abcd", &[(3, 0, 4)]),
+ t!(aover020, &["abcd", "bcd", "cd"], "abcd", &[(0, 0, 4)]),
+ t!(aover030, &["bcd", "abcd", "cd"], "abcd", &[(1, 0, 4)]),
+ t!(aover040, &["bcd", "cd", "abcd"], "abcd", &[(2, 0, 4)]),
+ t!(aover050, &["abc", "bc"], "zazabcz", &[]),
+ t!(aover100, &["ab", "ba"], "abababa", &[(0, 0, 2)]),
+ t!(aover200, &["foo", "foo"], "foobarfoo", &[(0, 0, 3), (1, 0, 3)]),
+ t!(aover300, &["", ""], "", &[(0, 0, 0), (1, 0, 0),]),
+ t!(aover310, &["", ""], "a", &[(0, 0, 0), (1, 0, 0)]),
+ t!(aover320, &["", "a"], "a", &[(0, 0, 0), (1, 0, 1)]),
+ t!(aover330, &["", "a", ""], "a", &[(0, 0, 0), (2, 0, 0), (1, 0, 1)]),
+ t!(aover340, &["a", "", ""], "a", &[(1, 0, 0), (2, 0, 0), (0, 0, 1)]),
+ t!(aover350, &["", "", "a"], "a", &[(0, 0, 0), (1, 0, 0), (2, 0, 1)]),
+ t!(aover360, &["foo", "foofoo"], "foofoo", &[(0, 0, 3), (1, 0, 6)]),
+];
+
+/// Tests for ASCII case insensitivity.
+///
+/// These tests should all have the same behavior regardless of match semantics
+/// or whether the search is overlapping.
+const ASCII_CASE_INSENSITIVE: &'static [SearchTest] = &[
+ t!(acasei000, &["a"], "A", &[(0, 0, 1)]),
+ t!(acasei010, &["Samwise"], "SAMWISE", &[(0, 0, 7)]),
+ t!(acasei011, &["Samwise"], "SAMWISE.abcd", &[(0, 0, 7)]),
+ t!(acasei020, &["fOoBaR"], "quux foobar baz", &[(0, 5, 11)]),
+];
+
+/// Like ASCII_CASE_INSENSITIVE, but specifically for non-overlapping tests.
+const ASCII_CASE_INSENSITIVE_NON_OVERLAPPING: &'static [SearchTest] = &[
+ t!(acasei000, &["foo", "FOO"], "fOo", &[(0, 0, 3)]),
+    t!(acasei001, &["FOO", "foo"], "fOo", &[(0, 0, 3)]),
+];
+
+/// Like ASCII_CASE_INSENSITIVE, but specifically for overlapping tests.
+const ASCII_CASE_INSENSITIVE_OVERLAPPING: &'static [SearchTest] = &[
+ t!(acasei000, &["foo", "FOO"], "fOo", &[(0, 0, 3), (1, 0, 3)]),
+ t!(acasei001, &["FOO", "foo"], "fOo", &[(0, 0, 3), (1, 0, 3)]),
+];
+
+/// Regression tests that are applied to all Aho-Corasick combinations.
+///
+/// If regression tests are needed for specific match semantics, then add them
+/// to the appropriate group above.
+const REGRESSION: &'static [SearchTest] = &[
+ t!(regression010, &["inf", "ind"], "infind", &[(0, 0, 3), (1, 3, 6),]),
+ t!(regression020, &["ind", "inf"], "infind", &[(1, 0, 3), (0, 3, 6),]),
+ t!(
+ regression030,
+ &["libcore/", "libstd/"],
+ "libcore/char/methods.rs",
+ &[(0, 0, 8),]
+ ),
+ t!(
+ regression040,
+ &["libstd/", "libcore/"],
+ "libcore/char/methods.rs",
+ &[(1, 0, 8),]
+ ),
+ t!(
+ regression050,
+ &["\x00\x00\x01", "\x00\x00\x00"],
+ "\x00\x00\x00",
+ &[(1, 0, 3),]
+ ),
+ t!(
+ regression060,
+ &["\x00\x00\x00", "\x00\x00\x01"],
+ "\x00\x00\x00",
+ &[(0, 0, 3),]
+ ),
+];
+
+// Now define a test for each combination of things above that we want to run.
+// Since there are a few different combinations for each collection of tests,
+// we define a couple of macros to avoid repetition drudgery. The testconfig
+// macro constructs the automaton from a given match kind, and runs the search
+// tests one-by-one over the given collection. The `with` parameter allows one
+// to configure the builder with additional parameters. The testcombo macro
+// invokes testconfig in precisely this way: it sets up several tests where
+// each one turns a different knob on AhoCorasickBuilder.
+
+macro_rules! testconfig {
+ (overlapping, $name:ident, $collection:expr, $kind:ident, $with:expr) => {
+ #[test]
+ fn $name() {
+ run_search_tests($collection, |test| {
+ let mut builder = AhoCorasickBuilder::new();
+ $with(&mut builder);
+ builder
+ .match_kind(MatchKind::$kind)
+ .build(test.patterns)
+ .find_overlapping_iter(test.haystack)
+ .collect()
+ });
+ }
+ };
+ (stream, $name:ident, $collection:expr, $kind:ident, $with:expr) => {
+ #[test]
+ fn $name() {
+ run_search_tests($collection, |test| {
+ let buf =
+ io::BufReader::with_capacity(1, test.haystack.as_bytes());
+ let mut builder = AhoCorasickBuilder::new();
+ $with(&mut builder);
+ builder
+ .match_kind(MatchKind::$kind)
+ .build(test.patterns)
+ .stream_find_iter(buf)
+ .map(|result| result.unwrap())
+ .collect()
+ });
+ }
+ };
+ ($name:ident, $collection:expr, $kind:ident, $with:expr) => {
+ #[test]
+ fn $name() {
+ run_search_tests($collection, |test| {
+ let mut builder = AhoCorasickBuilder::new();
+ $with(&mut builder);
+ builder
+ .match_kind(MatchKind::$kind)
+ .build(test.patterns)
+ .find_iter(test.haystack)
+ .collect()
+ });
+ }
+ };
+}
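+// For example, an invocation like
+// `testconfig!(nfa_default, AC_STANDARD_NON_OVERLAPPING, Standard, |_| ())`
+// (as generated by testcombo below) produces a `#[test] fn nfa_default()`
+// that builds a `MatchKind::Standard` automaton for each test's patterns and
+// collects the matches reported by `find_iter` over its haystack.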
+
+macro_rules! testcombo {
+ ($name:ident, $collection:expr, $kind:ident) => {
+ mod $name {
+ use super::*;
+
+ testconfig!(nfa_default, $collection, $kind, |_| ());
+ testconfig!(
+ nfa_no_prefilter,
+ $collection,
+ $kind,
+ |b: &mut AhoCorasickBuilder| {
+ b.prefilter(false);
+ }
+ );
+ testconfig!(
+ nfa_all_sparse,
+ $collection,
+ $kind,
+ |b: &mut AhoCorasickBuilder| {
+ b.dense_depth(0);
+ }
+ );
+ testconfig!(
+ nfa_all_dense,
+ $collection,
+ $kind,
+ |b: &mut AhoCorasickBuilder| {
+ b.dense_depth(usize::MAX);
+ }
+ );
+ testconfig!(
+ dfa_default,
+ $collection,
+ $kind,
+ |b: &mut AhoCorasickBuilder| {
+ b.dfa(true);
+ }
+ );
+ testconfig!(
+ dfa_no_prefilter,
+ $collection,
+ $kind,
+ |b: &mut AhoCorasickBuilder| {
+ b.dfa(true).prefilter(false);
+ }
+ );
+ testconfig!(
+ dfa_all_sparse,
+ $collection,
+ $kind,
+ |b: &mut AhoCorasickBuilder| {
+ b.dfa(true).dense_depth(0);
+ }
+ );
+ testconfig!(
+ dfa_all_dense,
+ $collection,
+ $kind,
+ |b: &mut AhoCorasickBuilder| {
+ b.dfa(true).dense_depth(usize::MAX);
+ }
+ );
+ testconfig!(
+ dfa_no_byte_class,
+ $collection,
+ $kind,
+ |b: &mut AhoCorasickBuilder| {
+ b.dfa(true).byte_classes(false);
+ }
+ );
+ testconfig!(
+ dfa_no_premultiply,
+ $collection,
+ $kind,
+ |b: &mut AhoCorasickBuilder| {
+ b.dfa(true).premultiply(false);
+ }
+ );
+ testconfig!(
+ dfa_no_byte_class_no_premultiply,
+ $collection,
+ $kind,
+ |b: &mut AhoCorasickBuilder| {
+ b.dfa(true).byte_classes(false).premultiply(false);
+ }
+ );
+ }
+ };
+}
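+// For example, `testcombo!(search_leftmost_first, AC_LEFTMOST_FIRST,
+// LeftmostFirst)` below produces a `search_leftmost_first` module with one
+// `#[test]` per builder configuration listed above, from `nfa_default`
+// through `dfa_no_byte_class_no_premultiply`.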
+
+// Write out the combinations.
+testcombo!(search_leftmost_longest, AC_LEFTMOST_LONGEST, LeftmostLongest);
+testcombo!(search_leftmost_first, AC_LEFTMOST_FIRST, LeftmostFirst);
+testcombo!(
+ search_standard_nonoverlapping,
+ AC_STANDARD_NON_OVERLAPPING,
+ Standard
+);
+
+// Write out the overlapping combo by hand since there is only one of them.
+testconfig!(
+ overlapping,
+ search_standard_overlapping_nfa_default,
+ AC_STANDARD_OVERLAPPING,
+ Standard,
+ |_| ()
+);
+testconfig!(
+ overlapping,
+ search_standard_overlapping_nfa_all_sparse,
+ AC_STANDARD_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.dense_depth(0);
+ }
+);
+testconfig!(
+ overlapping,
+ search_standard_overlapping_nfa_all_dense,
+ AC_STANDARD_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.dense_depth(usize::MAX);
+ }
+);
+testconfig!(
+ overlapping,
+ search_standard_overlapping_dfa_default,
+ AC_STANDARD_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.dfa(true);
+ }
+);
+testconfig!(
+ overlapping,
+ search_standard_overlapping_dfa_all_sparse,
+ AC_STANDARD_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.dfa(true).dense_depth(0);
+ }
+);
+testconfig!(
+ overlapping,
+ search_standard_overlapping_dfa_all_dense,
+ AC_STANDARD_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.dfa(true).dense_depth(usize::MAX);
+ }
+);
+testconfig!(
+ overlapping,
+ search_standard_overlapping_dfa_no_byte_class,
+ AC_STANDARD_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.dfa(true).byte_classes(false);
+ }
+);
+testconfig!(
+ overlapping,
+ search_standard_overlapping_dfa_no_premultiply,
+ AC_STANDARD_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.dfa(true).premultiply(false);
+ }
+);
+testconfig!(
+ overlapping,
+ search_standard_overlapping_dfa_no_byte_class_no_premultiply,
+ AC_STANDARD_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.dfa(true).byte_classes(false).premultiply(false);
+ }
+);
+
+// Also write out tests manually for streams, since we only test the standard
+// match semantics. We also don't bother testing different automaton
+// configurations, since those are well covered by tests above.
+testconfig!(
+ stream,
+ search_standard_stream_nfa_default,
+ AC_STANDARD_NON_OVERLAPPING,
+ Standard,
+ |_| ()
+);
+testconfig!(
+ stream,
+ search_standard_stream_dfa_default,
+ AC_STANDARD_NON_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.dfa(true);
+ }
+);
+
+// Same thing for anchored searches. Write them out manually.
+testconfig!(
+ search_standard_anchored_nfa_default,
+ AC_STANDARD_ANCHORED_NON_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.anchored(true);
+ }
+);
+testconfig!(
+ search_standard_anchored_dfa_default,
+ AC_STANDARD_ANCHORED_NON_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.anchored(true).dfa(true);
+ }
+);
+testconfig!(
+ overlapping,
+ search_standard_anchored_overlapping_nfa_default,
+ AC_STANDARD_ANCHORED_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.anchored(true);
+ }
+);
+testconfig!(
+ overlapping,
+ search_standard_anchored_overlapping_dfa_default,
+ AC_STANDARD_ANCHORED_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.anchored(true).dfa(true);
+ }
+);
+testconfig!(
+ search_leftmost_first_anchored_nfa_default,
+ AC_LEFTMOST_FIRST_ANCHORED,
+ LeftmostFirst,
+ |b: &mut AhoCorasickBuilder| {
+ b.anchored(true);
+ }
+);
+testconfig!(
+ search_leftmost_first_anchored_dfa_default,
+ AC_LEFTMOST_FIRST_ANCHORED,
+ LeftmostFirst,
+ |b: &mut AhoCorasickBuilder| {
+ b.anchored(true).dfa(true);
+ }
+);
+testconfig!(
+ search_leftmost_longest_anchored_nfa_default,
+ AC_LEFTMOST_LONGEST_ANCHORED,
+ LeftmostLongest,
+ |b: &mut AhoCorasickBuilder| {
+ b.anchored(true);
+ }
+);
+testconfig!(
+ search_leftmost_longest_anchored_dfa_default,
+ AC_LEFTMOST_LONGEST_ANCHORED,
+ LeftmostLongest,
+ |b: &mut AhoCorasickBuilder| {
+ b.anchored(true).dfa(true);
+ }
+);
+
+// And also write out the test combinations for ASCII case insensitivity.
+testconfig!(
+ acasei_standard_nfa_default,
+ &[ASCII_CASE_INSENSITIVE],
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.prefilter(false).ascii_case_insensitive(true);
+ }
+);
+testconfig!(
+ acasei_standard_dfa_default,
+ &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING],
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.ascii_case_insensitive(true).dfa(true);
+ }
+);
+testconfig!(
+ overlapping,
+ acasei_standard_overlapping_nfa_default,
+ &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_OVERLAPPING],
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.ascii_case_insensitive(true);
+ }
+);
+testconfig!(
+ overlapping,
+ acasei_standard_overlapping_dfa_default,
+ &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_OVERLAPPING],
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.ascii_case_insensitive(true).dfa(true);
+ }
+);
+testconfig!(
+ acasei_leftmost_first_nfa_default,
+ &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING],
+ LeftmostFirst,
+ |b: &mut AhoCorasickBuilder| {
+ b.ascii_case_insensitive(true);
+ }
+);
+testconfig!(
+ acasei_leftmost_first_dfa_default,
+ &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING],
+ LeftmostFirst,
+ |b: &mut AhoCorasickBuilder| {
+ b.ascii_case_insensitive(true).dfa(true);
+ }
+);
+testconfig!(
+ acasei_leftmost_longest_nfa_default,
+ &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING],
+ LeftmostLongest,
+ |b: &mut AhoCorasickBuilder| {
+ b.ascii_case_insensitive(true);
+ }
+);
+testconfig!(
+ acasei_leftmost_longest_dfa_default,
+ &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING],
+ LeftmostLongest,
+ |b: &mut AhoCorasickBuilder| {
+ b.ascii_case_insensitive(true).dfa(true);
+ }
+);
+
+#[test]
+fn search_tests_have_unique_names() {
+ let assert = |constname, tests: &[SearchTest]| {
+ let mut seen = HashMap::new(); // map from test name to position
+ for (i, test) in tests.iter().enumerate() {
+ if !seen.contains_key(test.name) {
+ seen.insert(test.name, i);
+ } else {
+ let last = seen[test.name];
+ panic!(
+ "{} tests have duplicate names at positions {} and {}",
+ constname, last, i
+ );
+ }
+ }
+ };
+ assert("BASICS", BASICS);
+ assert("STANDARD", STANDARD);
+ assert("LEFTMOST", LEFTMOST);
+ assert("LEFTMOST_FIRST", LEFTMOST_FIRST);
+ assert("LEFTMOST_LONGEST", LEFTMOST_LONGEST);
+ assert("NON_OVERLAPPING", NON_OVERLAPPING);
+ assert("OVERLAPPING", OVERLAPPING);
+ assert("REGRESSION", REGRESSION);
+}
+
+#[test]
+#[should_panic]
+fn stream_not_allowed_leftmost_first() {
+ let fsm = AhoCorasickBuilder::new()
+ .match_kind(MatchKind::LeftmostFirst)
+ .build(None::<String>);
+ assert_eq!(fsm.stream_find_iter(&b""[..]).count(), 0);
+}
+
+#[test]
+#[should_panic]
+fn stream_not_allowed_leftmost_longest() {
+ let fsm = AhoCorasickBuilder::new()
+ .match_kind(MatchKind::LeftmostLongest)
+ .build(None::<String>);
+ assert_eq!(fsm.stream_find_iter(&b""[..]).count(), 0);
+}
+
+#[test]
+#[should_panic]
+fn overlapping_not_allowed_leftmost_first() {
+ let fsm = AhoCorasickBuilder::new()
+ .match_kind(MatchKind::LeftmostFirst)
+ .build(None::<String>);
+ assert_eq!(fsm.find_overlapping_iter("").count(), 0);
+}
+
+#[test]
+#[should_panic]
+fn overlapping_not_allowed_leftmost_longest() {
+ let fsm = AhoCorasickBuilder::new()
+ .match_kind(MatchKind::LeftmostLongest)
+ .build(None::<String>);
+ assert_eq!(fsm.find_overlapping_iter("").count(), 0);
+}
+
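+// The loops below generate 25 * 25 * 25 = 15_625 three-byte patterns; the
+// resulting automaton needs far more than `u8::MAX` states, so building with
+// `u8` state identifiers must fail.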
+#[test]
+fn state_id_too_small() {
+ let mut patterns = vec![];
+ for c1 in (b'a'..b'z').map(|b| b as char) {
+ for c2 in (b'a'..b'z').map(|b| b as char) {
+ for c3 in (b'a'..b'z').map(|b| b as char) {
+ patterns.push(format!("{}{}{}", c1, c2, c3));
+ }
+ }
+ }
+ let result =
+ AhoCorasickBuilder::new().build_with_size::<u8, _, _>(&patterns);
+ assert!(result.is_err());
+}
+
+// See: https://github.com/BurntSushi/aho-corasick/issues/44
+//
+// In short, this test ensures that enabling ASCII case insensitivity does not
+// visit an exponential number of states when filling in failure transitions.
+#[test]
+fn regression_ascii_case_insensitive_no_exponential() {
+ let ac = AhoCorasickBuilder::new()
+ .ascii_case_insensitive(true)
+ .build(&["Tsubaki House-Triple Shot Vol01校花三姐妹"]);
+ assert!(ac.find("").is_none());
+}
+
+// See: https://github.com/BurntSushi/aho-corasick/issues/53
+//
+// This test ensures that the rare byte prefilter works in a particular corner
+// case. In particular, the shift offset detected for '/' in the patterns below
+// was incorrect, leading to a false negative.
+#[test]
+fn regression_rare_byte_prefilter() {
+ use AhoCorasick;
+
+ let ac = AhoCorasick::new_auto_configured(&["ab/j/", "x/"]);
+ assert!(ac.is_match("ab/j/"));
+}
+
+#[test]
+fn regression_case_insensitive_prefilter() {
+ use AhoCorasickBuilder;
+
+ for c in b'a'..b'z' {
+ for c2 in b'a'..b'z' {
+ let c = c as char;
+ let c2 = c2 as char;
+ let needle = format!("{}{}", c, c2).to_lowercase();
+ let haystack = needle.to_uppercase();
+ let ac = AhoCorasickBuilder::new()
+ .ascii_case_insensitive(true)
+ .prefilter(true)
+ .build(&[&needle]);
+ assert_eq!(
+ 1,
+ ac.find_iter(&haystack).count(),
+ "failed to find {:?} in {:?}\n\nautomaton:\n{:?}",
+ needle,
+ haystack,
+ ac,
+ );
+ }
+ }
+}
+
+fn run_search_tests<F: FnMut(&SearchTest) -> Vec<Match>>(
+ which: TestCollection,
+ mut f: F,
+) {
+ let get_match_triples =
+ |matches: Vec<Match>| -> Vec<(usize, usize, usize)> {
+ matches
+ .into_iter()
+ .map(|m| (m.pattern(), m.start(), m.end()))
+ .collect()
+ };
+ for &tests in which {
+ for test in tests {
+ assert_eq!(
+ test.matches,
+ get_match_triples(f(&test)).as_slice(),
+ "test: {}, patterns: {:?}, haystack: {:?}",
+ test.name,
+ test.patterns,
+ test.haystack
+ );
+ }
+ }
+}