author     Jeff Vander Stoep <jeffv@google.com>  2022-12-06 12:10:25 +0000
committer  Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>  2022-12-06 12:10:25 +0000
commit     2a437970c66be0021d198d6e1d51de4311c13826 (patch)
tree       85cb4fbb79a503703b53a8c1c8968516a225b60b
parent     89440fc6e6e3e15736cdacf5d2a557df9916641d (diff)
parent     303ec9ecf9091f98337b68621b8504de08d85568 (diff)
download   aho-corasick-2a437970c66be0021d198d6e1d51de4311c13826.tar.gz
Upgrade aho-corasick to 0.7.20 am: 13ae88ee2a am: 303ec9ecf9
Original change: https://android-review.googlesource.com/c/platform/external/rust/crates/aho-corasick/+/2327633

Change-Id: I9b90e796082fe265465b012655eb988925dbd91f
Signed-off-by: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>
-rw-r--r--  .cargo_vcs_info.json        |   7
-rw-r--r--  .github/workflows/ci.yml    |  15
-rw-r--r--  Android.bp                  |   6
-rw-r--r--  Cargo.toml                  |  25
-rw-r--r--  Cargo.toml.orig             |   8
-rw-r--r--  DESIGN.md                   |   8
-rw-r--r--  METADATA                    |  14
-rw-r--r--  README.md                   |  10
-rw-r--r--  src/ahocorasick.rs          |  24
-rw-r--r--  src/automaton.rs            |  14
-rw-r--r--  src/lib.rs                  |   6
-rw-r--r--  src/nfa.rs                  | 321
-rw-r--r--  src/packed/api.rs           |   9
-rw-r--r--  src/packed/mod.rs           |   2
-rw-r--r--  src/packed/rabinkarp.rs     |   2
-rw-r--r--  src/packed/teddy/README.md  |  20
-rw-r--r--  src/tests.rs                |   2

17 files changed, 171 insertions, 322 deletions
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json
index f618c96..873e57f 100644
--- a/.cargo_vcs_info.json
+++ b/.cargo_vcs_info.json
@@ -1,5 +1,6 @@
{
"git": {
- "sha1": "1b116376d6a8f920adf5add27a689b2134eec8d0"
- }
-}
+ "sha1": "7e231db4b4ac192ebc674078f2b03cd37b9ed5b9"
+ },
+ "path_in_vcs": ""
+}
\ No newline at end of file
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 82f1216..f419055 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -64,17 +64,17 @@ jobs:
with:
fetch-depth: 1
- name: Install Rust
- uses: actions-rs/toolchain@v1
+ uses: dtolnay/rust-toolchain@master
with:
toolchain: ${{ matrix.rust }}
- profile: minimal
- override: true
- name: Use Cross
if: matrix.target != ''
run: |
- # FIXME: to work around bugs in latest cross release, install master.
- # See: https://github.com/rust-embedded/cross/issues/357
- cargo install --git https://github.com/rust-embedded/cross
+ # We used to install 'cross' from master, but it kept failing. So now
+ # we build from a known-good version until 'cross' becomes more stable
+ # or we find an alternative. Notably, between v0.2.1 and current
+ # master (2022-06-14), the number of Cross's dependencies has doubled.
+ cargo install --bins --git https://github.com/rust-embedded/cross --tag v0.2.1
echo "CARGO=cross" >> $GITHUB_ENV
echo "TARGET=--target ${{ matrix.target }}" >> $GITHUB_ENV
- name: Show command used for Cargo
@@ -101,10 +101,9 @@ jobs:
with:
fetch-depth: 1
- name: Install Rust
- uses: actions-rs/toolchain@v1
+ uses: dtolnay/rust-toolchain@master
with:
toolchain: stable
- profile: minimal
components: rustfmt
- name: Check formatting
run: |
diff --git a/Android.bp b/Android.bp
index 9cfd02c..6ffe93f 100644
--- a/Android.bp
+++ b/Android.bp
@@ -40,11 +40,10 @@ license {
rust_test {
name: "aho-corasick_test_src_lib",
- // has rustc warnings
host_supported: true,
crate_name: "aho_corasick",
cargo_env_compat: true,
- cargo_pkg_version: "0.7.18",
+ cargo_pkg_version: "0.7.20",
srcs: ["src/lib.rs"],
test_suites: ["general-tests"],
auto_gen_config: true,
@@ -63,11 +62,10 @@ rust_test {
rust_library {
name: "libaho_corasick",
- // has rustc warnings
host_supported: true,
crate_name: "aho_corasick",
cargo_env_compat: true,
- cargo_pkg_version: "0.7.18",
+ cargo_pkg_version: "0.7.20",
srcs: ["src/lib.rs"],
edition: "2018",
features: [
diff --git a/Cargo.toml b/Cargo.toml
index 62b5f7e..ee8e94e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,27 +3,33 @@
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
-# to registry (e.g., crates.io) dependencies
+# to registry (e.g., crates.io) dependencies.
#
-# If you believe there's an error in this file please file an
-# issue against the rust-lang/cargo repository. If you're
-# editing this file be aware that the upstream Cargo.toml
-# will likely look very different (and much more reasonable)
+# If you are reading this file be aware that the original Cargo.toml
+# will likely look very different (and much more reasonable).
+# See Cargo.toml.orig for the original contents.
[package]
edition = "2018"
name = "aho-corasick"
-version = "0.7.18"
+version = "0.7.20"
authors = ["Andrew Gallant <jamslam@gmail.com>"]
-exclude = ["/aho-corasick-debug", "/ci/*", "/.travis.yml", "/appveyor.yml"]
+exclude = ["/aho-corasick-debug"]
autotests = false
description = "Fast multiple substring searching."
homepage = "https://github.com/BurntSushi/aho-corasick"
readme = "README.md"
-keywords = ["string", "search", "text", "aho", "multi"]
+keywords = [
+ "string",
+ "search",
+ "text",
+ "aho",
+ "multi",
+]
categories = ["text-processing"]
-license = "Unlicense/MIT"
+license = "Unlicense OR MIT"
repository = "https://github.com/BurntSushi/aho-corasick"
+
[profile.bench]
debug = true
@@ -32,6 +38,7 @@ debug = true
[lib]
name = "aho_corasick"
+
[dependencies.memchr]
version = "2.4.0"
default-features = false
diff --git a/Cargo.toml.orig b/Cargo.toml.orig
index 06a91f1..a43e872 100644
--- a/Cargo.toml.orig
+++ b/Cargo.toml.orig
@@ -1,18 +1,16 @@
[package]
name = "aho-corasick"
-version = "0.7.18" #:version
+version = "0.7.20" #:version
authors = ["Andrew Gallant <jamslam@gmail.com>"]
description = "Fast multiple substring searching."
homepage = "https://github.com/BurntSushi/aho-corasick"
repository = "https://github.com/BurntSushi/aho-corasick"
readme = "README.md"
keywords = ["string", "search", "text", "aho", "multi"]
-license = "Unlicense/MIT"
+license = "Unlicense OR MIT"
categories = ["text-processing"]
autotests = false
-exclude = [
- "/aho-corasick-debug", "/ci/*", "/.travis.yml", "/appveyor.yml",
-]
+exclude = ["/aho-corasick-debug"]
edition = "2018"
[workspace]
diff --git a/DESIGN.md b/DESIGN.md
index 367e203..0e15ad0 100644
--- a/DESIGN.md
+++ b/DESIGN.md
@@ -250,8 +250,8 @@ A DFA's search code, by contrast, looks like this:
// An Aho-Corasick DFA *never* has a missing state that requires
// failure transitions to be followed. One byte of input advances the
// automaton by one state. Always.
- state_id = trie.next_state(state_id, b);
- if fsm.is_match(state_id) {
+ state_id = dfa.next_state(state_id, b);
+ if dfa.is_match(state_id) {
return true;
}
}
@@ -278,7 +278,7 @@ there are a small number of patterns.
# More DFA tricks
As described in the previous section, one of the downsides of using a DFA
-is that is uses more memory and can take longer to build. One small way of
+is that it uses more memory and can take longer to build. One small way of
mitigating these concerns is to map the alphabet used by the automaton into
a smaller space. Typically, the alphabet of a DFA has 256 elements in it:
one element for each possible value that fits into a byte. However, in many
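The hunk above describes one of those tricks: mapping the 256-value byte alphabet into a smaller set of equivalence classes. A rough, self-contained sketch of that idea follows; it is not the crate's `ByteClasses` implementation, just an illustration of why a class-indexed DFA needs far fewer transition-table columns per state.

```rust
// Sketch: bytes that never appear in any pattern can all share one
// equivalence class; each byte that does appear gets its own class.
fn byte_classes(patterns: &[&[u8]]) -> ([u8; 256], u16) {
    let mut used = [false; 256];
    for pat in patterns {
        for &b in pat.iter() {
            used[b as usize] = true;
        }
    }
    let mut classes = [0u8; 256]; // class 0 = "never appears in a pattern"
    let mut count: u16 = 1;
    for b in 0..256 {
        if used[b] {
            classes[b] = count as u8; // assumes < 256 distinct bytes used
            count += 1;
        }
    }
    (classes, count)
}

fn main() {
    let pats: &[&[u8]] = &[b"abc", b"bcd"];
    let (classes, count) = byte_classes(pats);
    // Only a, b, c, d influence transitions: 4 classes + 1 catch-all.
    assert_eq!(count, 5);
    // A DFA indexed by class needs 5 columns per state instead of 256.
    assert_eq!(classes[b'a' as usize], 1);
    assert_eq!(classes[b'z' as usize], 0);
}
```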
@@ -425,7 +425,7 @@ method.
Since Aho-Corasick is an automaton, it is possible to do partial searches on
partial parts of the haystack, and then resume that search on subsequent pieces
of the haystack. This is useful when the haystack you're trying to search is
-not stored contiguous in memory, or if one does not want to read the entire
+not stored contiguously in memory, or if one does not want to read the entire
haystack into memory at once.
Currently, only standard semantics are supported for stream searching. This is
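The stream searching described above is exposed through the crate's public API. A small usage sketch against the 0.7 API, where `stream_find_iter` accepts any `io::Read` and yields `io::Result<Match>`:

```rust
use aho_corasick::AhoCorasick;

fn main() -> std::io::Result<()> {
    let ac = AhoCorasick::new(&["apple", "append"]);
    // Anything implementing `io::Read` works; a byte slice stands in
    // for a file or socket here.
    let rdr: &[u8] = b"one apple, two appends";
    for result in ac.stream_find_iter(rdr) {
        let mat = result?; // I/O errors surface through the iterator
        println!("pattern {} at [{}, {})", mat.pattern(), mat.start(), mat.end());
    }
    Ok(())
}
```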
diff --git a/METADATA b/METADATA
index 3c735ca..cd2bd6e 100644
--- a/METADATA
+++ b/METADATA
@@ -1,3 +1,7 @@
+# This project was upgraded with external_updater.
+# Usage: tools/external_updater/updater.sh update rust/crates/aho-corasick
+# For more info, check https://cs.android.com/android/platform/superproject/+/master:tools/external_updater/README.md
+
name: "aho-corasick"
description: "Fast multiple substring searching."
third_party {
@@ -7,13 +11,13 @@ third_party {
}
url {
type: ARCHIVE
- value: "https://static.crates.io/crates/aho-corasick/aho-corasick-0.7.18.crate"
+ value: "https://static.crates.io/crates/aho-corasick/aho-corasick-0.7.20.crate"
}
- version: "0.7.18"
+ version: "0.7.20"
license_type: NOTICE
last_upgrade_date {
- year: 2021
- month: 5
- day: 19
+ year: 2022
+ month: 12
+ day: 5
}
}
diff --git a/README.md b/README.md
index cd43051..f033e01 100644
--- a/README.md
+++ b/README.md
@@ -9,9 +9,9 @@ Features include case insensitive matching, overlapping matches, fast searching
via SIMD and optional full DFA construction and search & replace in streams.
[![Build status](https://github.com/BurntSushi/aho-corasick/workflows/ci/badge.svg)](https://github.com/BurntSushi/aho-corasick/actions)
-[![](http://meritbadge.herokuapp.com/aho-corasick)](https://crates.io/crates/aho-corasick)
+[![crates.io](https://img.shields.io/crates/v/aho-corasick.svg)](https://crates.io/crates/aho-corasick)
-Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org).
+Dual-licensed under MIT or the [UNLICENSE](https://unlicense.org/).
### Documentation
@@ -168,6 +168,12 @@ In general, this crate will be conservative with respect to the minimum
supported version of Rust.
+### FFI bindings
+
+* [G-Research/ahocorasick_rs](https://github.com/G-Research/ahocorasick_rs/)
+is a Python wrapper for this library.
+
+
### Future work
Here are some plans for the future:
diff --git a/src/ahocorasick.rs b/src/ahocorasick.rs
index 2b1aa5c..cfd74fd 100644
--- a/src/ahocorasick.rs
+++ b/src/ahocorasick.rs
@@ -1219,7 +1219,6 @@ pub struct FindOverlappingIter<'a, 'b, S: StateID> {
prestate: PrefilterState,
haystack: &'b [u8],
pos: usize,
- last_match_end: usize,
state_id: S,
match_index: usize,
}
@@ -1239,7 +1238,6 @@ impl<'a, 'b, S: StateID> FindOverlappingIter<'a, 'b, S> {
prestate,
haystack,
pos: 0,
- last_match_end: 0,
state_id: ac.imp.start_state(),
match_index: 0,
}
@@ -1357,7 +1355,7 @@ struct StreamChunkIter<'a, R, S: StateID> {
#[derive(Debug)]
enum StreamChunk<'r> {
/// A chunk that does not contain any matches.
- NonMatch { bytes: &'r [u8], start: usize },
+ NonMatch { bytes: &'r [u8] },
/// A chunk that precisely contains a match.
Match { bytes: &'r [u8], mat: Match },
}
@@ -1390,7 +1388,7 @@ impl<'a, R: io::Read, S: StateID> StreamChunkIter<'a, R, S> {
}
}
- fn next<'r>(&'r mut self) -> Option<io::Result<StreamChunk<'r>>> {
+ fn next(&mut self) -> Option<io::Result<StreamChunk>> {
loop {
if let Some(mut mat) = self.pending_match.take() {
let bytes = &self.buf.buffer()[mat.start()..mat.end()];
@@ -1401,9 +1399,8 @@ impl<'a, R: io::Read, S: StateID> StreamChunkIter<'a, R, S> {
if self.search_pos >= self.buf.len() {
if let Some(end) = self.unreported() {
let bytes = &self.buf.buffer()[self.report_pos..end];
- let start = self.absolute_pos + self.report_pos;
self.report_pos = end;
- return Some(Ok(StreamChunk::NonMatch { bytes, start }));
+ return Some(Ok(StreamChunk::NonMatch { bytes }));
}
if self.buf.len() >= self.buf.min_buffer_len() {
// This is the point at which we roll our buffer, which we
@@ -1426,10 +1423,9 @@ impl<'a, R: io::Read, S: StateID> StreamChunkIter<'a, R, S> {
// unreported bytes remaining, return them now.
if self.report_pos < self.buf.len() {
let bytes = &self.buf.buffer()[self.report_pos..];
- let start = self.absolute_pos + self.report_pos;
self.report_pos = self.buf.len();
- let chunk = StreamChunk::NonMatch { bytes, start };
+ let chunk = StreamChunk::NonMatch { bytes };
return Some(Ok(chunk));
} else {
// We've reported everything, but there might still
@@ -1469,10 +1465,9 @@ impl<'a, R: io::Read, S: StateID> StreamChunkIter<'a, R, S> {
if self.report_pos < mat.start() {
let bytes =
&self.buf.buffer()[self.report_pos..mat.start()];
- let start = self.absolute_pos + self.report_pos;
self.report_pos = mat.start();
- let chunk = StreamChunk::NonMatch { bytes, start };
+ let chunk = StreamChunk::NonMatch { bytes };
return Some(Ok(chunk));
}
}
@@ -1800,9 +1795,12 @@ impl AhoCorasickBuilder {
/// Enabling this option does not change the search algorithm, but it may
/// increase the size of the automaton.
///
- /// **NOTE:** In the future, support for full Unicode case insensitivity
- /// may be added, but ASCII case insensitivity is comparatively much
- /// simpler to add.
+ /// **NOTE:** It is unlikely that support for Unicode case folding will
+ /// be added in the future. The ASCII case works via a simple hack to the
+ /// underlying automaton, but full Unicode handling requires a fair bit of
+ /// sophistication. If you do need Unicode handling, you might consider
+ /// using the [`regex` crate](https://docs.rs/regex) or the lower level
+ /// [`regex-automata` crate](https://docs.rs/regex-automata).
///
/// # Examples
///
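For reference, the option documented by the hunk above is enabled through `AhoCorasickBuilder`; a minimal sketch against the 0.7 API:

```rust
use aho_corasick::AhoCorasickBuilder;

fn main() {
    let ac = AhoCorasickBuilder::new()
        .ascii_case_insensitive(true)
        .build(&["samwise"]);
    // The lowercase pattern matches mixed-case haystack text.
    assert!(ac.is_match("three SaMwIsE hobbits"));
}
```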
diff --git a/src/automaton.rs b/src/automaton.rs
index b971bf3..d88743a 100644
--- a/src/automaton.rs
+++ b/src/automaton.rs
@@ -68,11 +68,11 @@ use crate::Match;
///
/// Every automaton has exactly one fail state, one dead state and exactly one
/// start state. Generally, these correspond to the first, second and third
-/// states, respectively. The failure state is always treated as a sentinel.
-/// That is, no correct Aho-Corasick automaton will ever transition into the
-/// fail state. The dead state, however, can be transitioned into, but only
-/// when leftmost-first or leftmost-longest match semantics are enabled and
-/// only when at least one match has been observed.
+/// states, respectively. The dead state is always treated as a sentinel. That
+/// is, no correct Aho-Corasick automaton will ever transition into the fail
+/// state. The dead state, however, can be transitioned into, but only when
+/// leftmost-first or leftmost-longest match semantics are enabled and only
+/// when at least one match has been observed.
///
/// Every automaton also has one or more match states, such that
/// `Automaton::is_match_state(id)` returns `true` if and only if `id`
@@ -340,7 +340,7 @@ pub trait Automaton {
// dead states are used to stop a search.)
debug_assert!(
last_match.is_some() || self.anchored(),
- "failure state should only be seen after match"
+ "dead state should only be seen after match"
);
return last_match;
}
@@ -455,7 +455,7 @@ pub trait Automaton {
// case, dead states are used to stop a search.)
debug_assert!(
last_match.is_some() || self.anchored(),
- "failure state should only be seen after match"
+ "dead state should only be seen after match"
);
return last_match;
}
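To make the corrected fail/dead distinction concrete: the crate reserves the lowest state IDs for the sentinels (in 0.7, `fail_id()` and `dead_id()` resolve to 0 and 1), and the search loop treats the dead state as "stop now; a match was already recorded." The sketch below is hypothetical, not the crate's code:

```rust
// Illustrative only: the concrete ID mirrors the crate's convention
// (fail = 0, dead = 1); the table/closure shapes are made up.
const DEAD_ID: usize = 1;

fn search(
    next_state: impl Fn(usize, u8) -> usize,
    is_match: impl Fn(usize) -> bool,
    start: usize,
    haystack: &[u8],
) -> Option<usize> {
    let mut state = start;
    let mut last_match = None;
    for (i, &b) in haystack.iter().enumerate() {
        state = next_state(state, b);
        if state == DEAD_ID {
            // The dead state is only ever entered after a match has been
            // recorded (the debug_assert in the hunk above checks this),
            // so stopping here is safe.
            return last_match;
        }
        if is_match(state) {
            last_match = Some(i + 1); // end offset of the most recent match
        }
    }
    last_match
}
```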
diff --git a/src/lib.rs b/src/lib.rs
index 9a3d084..4465a56 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -275,6 +275,12 @@ impl Match {
self.end
}
+ /// The length, in bytes, of the match.
+ #[inline]
+ pub fn len(&self) -> usize {
+ self.len
+ }
+
/// Returns true if and only if this match is empty. That is, when
/// `start() == end()`.
///
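A short sketch of the new accessor in use (0.7 API assumed):

```rust
use aho_corasick::AhoCorasick;

fn main() {
    let ac = AhoCorasick::new(&["abc"]);
    let mat = ac.find("xxabcxx").expect("pattern should be found");
    assert_eq!((mat.start(), mat.end()), (2, 5));
    // The new accessor agrees with end() - start().
    assert_eq!(mat.len(), 3);
    assert!(!mat.is_empty());
}
```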
diff --git a/src/nfa.rs b/src/nfa.rs
index e29bb27..05c5cfb 100644
--- a/src/nfa.rs
+++ b/src/nfa.rs
@@ -65,7 +65,7 @@ pub struct NFA<S> {
/// A set of equivalence classes in terms of bytes. We compute this while
/// building the NFA, but don't use it in the NFA's states. Instead, we
/// use this for building the DFA. We store it on the NFA since it's easy
- /// to compute while visiting the the patterns.
+ /// to compute while visiting the patterns.
byte_classes: ByteClasses,
/// A set of states. Each state defines its own transitions, a fail
/// transition and a set of indices corresponding to matches.
@@ -313,17 +313,6 @@ impl<S: StateID> State<S> {
!self.matches.is_empty()
}
- fn get_longest_match_len(&self) -> Option<usize> {
- // Why is this true? Because the first match in any matching state
- // will always correspond to the match added to it during trie
- // construction (since when we copy matches due to failure transitions,
- // we always append them). Therefore, it follows that the first match
- // must always be longest since any subsequent match must be from a
- // failure transition, and a failure transition by construction points
- // to a proper suffix. A proper suffix is, by definition, smaller.
- self.matches.get(0).map(|&(_, len)| len)
- }
-
fn next_state(&self, input: u8) -> S {
self.trans.next_state(input)
}
@@ -649,11 +638,7 @@ impl<'a, S: StateID> Compiler<'a, S> {
self.add_start_state_loop();
self.add_dead_state_loop();
if !self.builder.anchored {
- if self.match_kind().is_leftmost() {
- self.fill_failure_transitions_leftmost();
- } else {
- self.fill_failure_transitions_standard();
- }
+ self.fill_failure_transitions();
}
self.close_start_state_loop();
self.nfa.byte_classes = self.byte_classes.build();
@@ -739,7 +724,8 @@ impl<'a, S: StateID> Compiler<'a, S> {
}
/// This routine creates failure transitions according to the standard
- /// textbook formulation of the Aho-Corasick algorithm.
+ /// textbook formulation of the Aho-Corasick algorithm, with a couple small
+ /// tweaks to support "leftmost" semantics.
///
/// Building failure transitions is the most interesting part of building
/// the Aho-Corasick automaton, because they are what allow searches to
@@ -807,11 +793,15 @@ impl<'a, S: StateID> Compiler<'a, S> {
/// 'abcd', 'b', 'bcd' and 'cd':
///
/// ```ignore
- /// - a - S1 - b - S2 - c - S3 - d - S4*
- /// /
- /// S0 - b - S5* - c - S6 - d - S7*
- /// \
- /// - c - S8 - d - S9*
+ /// - a - S1 - b - S2* - c - S3 - d - S4*
+ /// / / /
+ /// / ------- -------
+ /// / / /
+ /// S0 --- b - S5* - c - S6 - d - S7*
+ /// \ /
+ /// \ --------
+ /// \ /
+ /// - c - S8 - d - S9*
/// ```
///
/// The failure transitions for this trie are defined from S2 to S5,
@@ -839,20 +829,50 @@ impl<'a, S: StateID> Compiler<'a, S> {
/// We don't actually use recursion to implement this, but instead, use a
/// breadth first search of the automaton. Our base case is the start
/// state, whose failure transition is just a transition to itself.
- fn fill_failure_transitions_standard(&mut self) {
+ ///
+ /// When building a leftmost automaton, we proceed as above, but only
+ /// include a subset of failure transitions. Namely, we omit any failure
+ /// transitions that appear after a match state in the trie. This is
+ /// because failure transitions always point back to a proper suffix of
+ /// what has been seen so far. Thus, following a failure transition after
+ /// a match implies looking for a match that starts after the one that has
+ /// already been seen, which is of course therefore not the leftmost match.
+ ///
+ /// N.B. I came up with this algorithm on my own; after scouring all of
+ /// the other AC implementations I know of (Perl, Snort, many on GitHub),
+ /// I couldn't find any that implement leftmost semantics like this.
+ /// Perl of course needs leftmost-first semantics, but they implement it
+ /// with a seeming hack at *search* time instead of encoding it into the
+ /// automaton. There are also a couple Java libraries that support leftmost
+ /// longest semantics, but they do it by building a queue of matches at
+ /// search time, which is even worse than what Perl is doing. ---AG
+ fn fill_failure_transitions(&mut self) {
+ let kind = self.match_kind();
// Initialize the queue for breadth first search with all transitions
// out of the start state. We handle the start state specially because
// we only want to follow non-self transitions. If we followed self
// transitions, then this would never terminate.
let mut queue = VecDeque::new();
let mut seen = self.queued_set();
- for b in AllBytesIter::new() {
- let next = self.nfa.start().next_state(b);
- if next != self.nfa.start_id {
- if !seen.contains(next) {
- queue.push_back(next);
- seen.insert(next);
- }
+ let mut it = self.nfa.iter_transitions_mut(self.nfa.start_id);
+ while let Some((_, next)) = it.next() {
+ // Skip anything we've seen before and any self-transitions on the
+ // start state.
+ if next == it.nfa().start_id || seen.contains(next) {
+ continue;
+ }
+ queue.push_back(next);
+ seen.insert(next);
+ // Under leftmost semantics, if a state immediately following
+ // the start state is a match state, then we never want to
+ // follow its failure transition since the failure transition
+ // necessarily leads back to the start state, which we never
+ // want to do for leftmost matching after a match has been
+ // found.
+ //
+ // We apply the same logic to non-start states below as well.
+ if kind.is_leftmost() && it.nfa().state(next).is_match() {
+ it.nfa().state_mut(next).fail = dead_id();
}
}
while let Some(id) = queue.pop_front() {
@@ -870,6 +890,31 @@ impl<'a, S: StateID> Compiler<'a, S> {
queue.push_back(next);
seen.insert(next);
+ // As above for start states, under leftmost semantics, once
+ // we see a match all subsequent states should have no failure
+ // transitions because failure transitions always imply looking
+ // for a match that is a suffix of what has been seen so far
+ // (where "seen so far" corresponds to the string formed by
+ // following the transitions from the start state to the
+ // current state). Under leftmost semantics, we specifically do
+ // not want to allow this to happen because we always want to
+ // report the match found at the leftmost position.
+ //
+ // The difference between leftmost-first and leftmost-longest
+ // occurs previously while we build the trie. For
+ // leftmost-first, we simply omit any entries that would
+ // otherwise require passing through a match state.
+ //
+ // Note that for correctness, the failure transition has to be
+ // set to the dead state for ALL states following a match, not
+ // just the match state itself. However, by setting the failure
+ // transition to the dead state on all match states, the dead
+ // state will automatically propagate to all subsequent states
+ // via the failure state computation below.
+ if kind.is_leftmost() && it.nfa().state(next).is_match() {
+ it.nfa().state_mut(next).fail = dead_id();
+ continue;
+ }
let mut fail = it.nfa().state(id).fail;
while it.nfa().state(fail).next_state(b) == fail_id() {
fail = it.nfa().state(fail).fail;
@@ -887,217 +932,9 @@ impl<'a, S: StateID> Compiler<'a, S> {
// in addition to its own. For the non-overlapping case, such
// states only report the first match, which is never empty since
// it isn't a start state.
- it.nfa().copy_empty_matches(id);
- }
- }
-
- /// This routine is just like fill_failure_transitions_standard, except
- /// it adds failure transitions in a way that preserves leftmost match
- /// semantics (for both leftmost-first and leftmost-longest).
- ///
- /// The algorithms are so similar that it would be possible to write it
- /// generically. But doing so without overhead would require a bit of
- /// ceremony, so we just copy it and add in the extra leftmost logic.
- /// Moreover, the standard algorithm above is so simple that it feels like
- /// a crime to disturb it.
- ///
- /// In effect, this proceeds just like the standard approach, but we
- /// specifically add only a subset of all failure transitions. Namely, we
- /// only add failure transitions that either do not occur after a match
- /// or failure transitions that do occur after a match but preserve the
- /// match. The comments in the implementation below should help.
- ///
- /// N.B. The only differences in the automaton between leftmost-first and
- /// leftmost-longest are in trie construction. Otherwise, both have exactly
- /// the same set of failure transitions. leftmost-longest adds everything
- /// to the trie, where as leftmost-first skips any patterns for which there
- /// exists a prefix of it that was added earlier.
- ///
- /// N.B. I came up with this algorithm on my own, and after scouring all of
- /// the other AC implementations I know of (Perl, Snort, many on GitHub).
- /// I couldn't find any that implement leftmost semantics like this.
- /// Perl of course needs leftmost-first semantics, but they implement it
- /// with a seeming hack at *search* time instead of encoding it into the
- /// automaton. There are also a couple Java libraries that support leftmost
- /// longest semantics, but they do it by building a queue of matches at
- /// search time, which is even worse than what Perl is doing. ---AG
- fn fill_failure_transitions_leftmost(&mut self) {
- /// Represents an item in our queue of states to process.
- ///
- /// Fundamentally, this queue serves the same purpose as the queue
- /// for filling failure transitions using the standard formulation.
- /// In the leftmost case, though, we need to track a bit more
- /// information. See comments below.
- #[derive(Clone, Copy, Debug)]
- struct QueuedState<S> {
- /// The id of the state to visit.
- id: S,
- /// The depth at which the first match was observed in the path
- /// to this state. Note that this corresponds to the depth at
- /// which the beginning of the match was detected. If no match
- /// has been seen, then this is None.
- match_at_depth: Option<usize>,
- }
-
- impl<S: StateID> QueuedState<S> {
- /// Create a queued state corresponding to the given NFA's start
- /// state.
- fn start(nfa: &NFA<S>) -> QueuedState<S> {
- let match_at_depth =
- if nfa.start().is_match() { Some(0) } else { None };
- QueuedState { id: nfa.start_id, match_at_depth }
- }
-
- /// Return the next state to queue up. The given id must be a state
- /// corresponding to a single transition from this queued state.
- fn next_queued_state(
- &self,
- nfa: &NFA<S>,
- id: S,
- ) -> QueuedState<S> {
- let match_at_depth = self.next_match_at_depth(nfa, id);
- QueuedState { id, match_at_depth }
- }
-
- /// Return the earliest depth at which a match has occurred for
- /// the given state. The given state must correspond to a single
- /// transition from this queued state.
- fn next_match_at_depth(
- &self,
- nfa: &NFA<S>,
- next: S,
- ) -> Option<usize> {
- // This is a little tricky. If the previous state has already
- // seen a match or if `next` isn't a match state, then nothing
- // needs to change since a later state cannot find an earlier
- // match.
- match self.match_at_depth {
- Some(x) => return Some(x),
- None if nfa.state(next).is_match() => {}
- None => return None,
- }
- let depth = nfa.state(next).depth
- - nfa.state(next).get_longest_match_len().unwrap()
- + 1;
- Some(depth)
- }
- }
-
- // Initialize the queue for breadth first search with all transitions
- // out of the start state. We handle the start state specially because
- // we only want to follow non-self transitions. If we followed self
- // transitions, then this would never terminate.
- let mut queue: VecDeque<QueuedState<S>> = VecDeque::new();
- let mut seen = self.queued_set();
- let start = QueuedState::start(&self.nfa);
- for b in AllBytesIter::new() {
- let next_id = self.nfa.start().next_state(b);
- if next_id != start.id {
- let next = start.next_queued_state(&self.nfa, next_id);
- if !seen.contains(next.id) {
- queue.push_back(next);
- seen.insert(next.id);
- }
- // If a state immediately following the start state is a match
- // state, then we never want to follow its failure transition
- // since the failure transition necessarily leads back to the
- // start state, which we never want to do for leftmost matching
- // after a match has been found.
- //
- // N.B. This is a special case of the more general handling
- // found below.
- if self.nfa.state(next_id).is_match() {
- self.nfa.state_mut(next_id).fail = dead_id();
- }
- }
- }
- while let Some(item) = queue.pop_front() {
- let mut any_trans = false;
- let mut it = self.nfa.iter_transitions_mut(item.id);
- while let Some((b, next_id)) = it.next() {
- any_trans = true;
-
- // Queue up the next state.
- let next = item.next_queued_state(it.nfa(), next_id);
- if seen.contains(next.id) {
- // The only way to visit a duplicate state in a transition
- // list is when ASCII case insensitivity is enabled. In
- // this case, we want to skip it since it's redundant work.
- // But it would also end up duplicating matches, which
- // results in reporting duplicate matches in some cases.
- // See the 'acasei010' regression test.
- continue;
- }
- queue.push_back(next);
- seen.insert(next.id);
-
- // Find the failure state for next. Same as standard.
- let mut fail = it.nfa().state(item.id).fail;
- while it.nfa().state(fail).next_state(b) == fail_id() {
- fail = it.nfa().state(fail).fail;
- }
- fail = it.nfa().state(fail).next_state(b);
-
- // This is the key difference from the standard formulation.
- // Namely, if we've seen a match, then we only want a failure
- // transition if the failure transition preserves the match
- // we've seen. In general, this is not true of all failure
- // transitions since they can point back to any suffix of what
- // we've seen so far. Instead, we only want to point back to
- // suffixes that contain any match we've seen.
- //
- // We achieve this by comparing the depth of the failure
- // transition with the number of states between this state
- // and the beginning of the earliest match detected. If the
- // depth of the failure state is smaller than this difference,
- // then it cannot contain the match. If it's bigger or equal
- // to the difference, then it necessarily includes the match
- // we've seen since all failure transitions correspond to a
- // suffix.
- //
- // If we've determined that we don't want the failure
- // transition, then we set this state's failure transition to
- // the dead state. In other words, when a search hits this
- // state, it will not continue and correctly stop. (N.B. A
- // dead state is different than a fail state. A dead state
- // MUST be preceded by a match and acts as a sentinel to search
- // routines to terminate.)
- //
- // Understanding this is tricky, and it took me several days
- // to think through this and get it right. If you want to grok
- // it, then I'd recommend: 1) switch the implementation to
- // always use the standard algorithm for filling in failure
- // transitions, 2) run the test suite and 3) examine the test
- // failures. Write out the automatons for them and try to work
- // backwards by figuring out which failure transitions should
- // be removed. You should arrive at the same rule used below.
- if let Some(match_depth) = next.match_at_depth {
- let fail_depth = it.nfa().state(fail).depth;
- let next_depth = it.nfa().state(next.id).depth;
- if next_depth - match_depth + 1 > fail_depth {
- it.nfa().state_mut(next.id).fail = dead_id();
- continue;
- }
- assert_ne!(
- start.id,
- it.nfa().state(next.id).fail,
- "states that are match states or follow match \
- states should never have a failure transition \
- back to the start state in leftmost searching",
- );
- }
- it.nfa().state_mut(next.id).fail = fail;
- it.nfa().copy_matches(fail, next.id);
- }
- // If there are no transitions for this state and if it's a match
- // state, then we must set its failure transition to the dead
- // state since we never want it to restart the search.
- if !any_trans && it.nfa().state(item.id).is_match() {
- it.nfa().state_mut(item.id).fail = dead_id();
+ if !kind.is_leftmost() {
+ it.nfa().copy_empty_matches(id);
}
- // We don't need to copy empty matches from the start state here
- // because that's only necessary for overlapping matches and
- // leftmost match kinds don't support overlapping matches.
}
}
@@ -1176,7 +1013,7 @@ impl<'a, S: StateID> Compiler<'a, S> {
fn calculate_size(&mut self) {
let mut size = 0;
for state in &self.nfa.states {
- size += state.heap_bytes();
+ size += size_of::<State<S>>() + state.heap_bytes();
}
self.nfa.heap_bytes = size;
}
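For readers following the refactor above: the standard breadth-first construction of failure transitions that `fill_failure_transitions` generalizes can be sketched compactly. This is a from-scratch illustration of the textbook algorithm, not the crate's implementation (which adds the leftmost tweaks, dense/sparse state representations, and match copying):

```rust
use std::collections::VecDeque;

/// One trie state: dense transitions, a failure link and a match marker.
struct State {
    next: [usize; 256], // 0 means "no transition" (state 0 is the start)
    fail: usize,
    is_match: bool,
}

fn build(patterns: &[&[u8]]) -> Vec<State> {
    let new_state = || State { next: [0; 256], fail: 0, is_match: false };
    let mut states = vec![new_state()];
    // Phase 1: insert every pattern into the trie.
    for pat in patterns {
        let mut s = 0;
        for &b in pat.iter() {
            let b = b as usize;
            if states[s].next[b] == 0 {
                states.push(new_state());
                let id = states.len() - 1;
                states[s].next[b] = id;
            }
            s = states[s].next[b];
        }
        states[s].is_match = true;
    }
    // Phase 2: BFS assigns failure links level by level, so a state's
    // failure link is always computed before its children need it.
    let mut queue = VecDeque::new();
    for b in 0..256usize {
        let next = states[0].next[b];
        if next != 0 {
            states[next].fail = 0; // depth-1 states fail back to the start
            queue.push_back(next);
        }
    }
    while let Some(s) = queue.pop_front() {
        for b in 0..256usize {
            let next = states[s].next[b];
            if next == 0 {
                continue;
            }
            // Follow failure links until some proper-suffix state has a
            // transition on b (or we bottom out at the start state).
            let mut f = states[s].fail;
            while f != 0 && states[f].next[b] == 0 {
                f = states[f].fail;
            }
            states[next].fail = states[f].next[b];
            // Merge match info: if the failure target is a match state,
            // this state's path also ends with a complete pattern.
            if states[states[next].fail].is_match {
                states[next].is_match = true;
            }
            queue.push_back(next);
        }
    }
    states
}
```

A search then walks `next` and, on a missing transition, follows `fail` links until one exists; under leftmost semantics the diff above additionally redirects the failure link of every match state to the dead state so the search stops instead of restarting.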
diff --git a/src/packed/api.rs b/src/packed/api.rs
index c15ae3f..51703d0 100644
--- a/src/packed/api.rs
+++ b/src/packed/api.rs
@@ -267,13 +267,7 @@ impl Builder {
}
Some(ForceAlgorithm::RabinKarp) => (SearchKind::RabinKarp, 0),
};
- Some(Searcher {
- config: self.config.clone(),
- patterns,
- rabinkarp,
- search_kind,
- minimum_len,
- })
+ Some(Searcher { patterns, rabinkarp, search_kind, minimum_len })
}
fn build_teddy(&self, patterns: &Patterns) -> Option<Teddy> {
@@ -377,7 +371,6 @@ impl Default for Builder {
/// ```
#[derive(Clone, Debug)]
pub struct Searcher {
- config: Config,
patterns: Patterns,
rabinkarp: RabinKarp,
search_kind: SearchKind,
diff --git a/src/packed/mod.rs b/src/packed/mod.rs
index 25a7966..97c40ff 100644
--- a/src/packed/mod.rs
+++ b/src/packed/mod.rs
@@ -4,7 +4,7 @@ number of patterns.
This sub-module provides vectorized routines for quickly finding matches of a
small number of patterns. In general, users of this crate shouldn't need to
-interface with this module directory, as the primary
+interface with this module directly, as the primary
[`AhoCorasick`](../struct.AhoCorasick.html)
searcher will use these routines automatically as a prefilter when applicable.
However, in some cases, callers may want to bypass the Aho-Corasick machinery
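For context, bypassing the main searcher looks roughly like this; it closely follows the packed module's documented example in 0.7 (`Searcher::new` returns `None` when no packed searcher can handle the patterns, e.g. when there are too many):

```rust
use aho_corasick::packed::Searcher;

fn main() {
    let searcher = Searcher::new(["foobar", "teddy"].iter().cloned())
        .expect("two short patterns are packable");
    // Pattern IDs are reported in the order patterns were added.
    let found: Vec<usize> =
        searcher.find_iter("teddy foobar").map(|m| m.pattern()).collect();
    assert_eq!(found, vec![1, 0]);
}
```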
diff --git a/src/packed/rabinkarp.rs b/src/packed/rabinkarp.rs
index fa6b1e3..c081f70 100644
--- a/src/packed/rabinkarp.rs
+++ b/src/packed/rabinkarp.rs
@@ -32,7 +32,7 @@ const NUM_BUCKETS: usize = 64;
/// https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm
///
/// But ESMAJ provides something a bit more concrete:
-/// http://www-igm.univ-mlv.fr/~lecroq/string/node5.html
+/// https://www-igm.univ-mlv.fr/~lecroq/string/node5.html
#[derive(Clone, Debug)]
pub struct RabinKarp {
/// The order of patterns in each bucket is significant. Namely, they are
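The link above covers the classic single-pattern Rabin-Karp. A minimal sketch of the rolling hash it refers to, for illustration only (the packed implementation instead hashes short pattern prefixes into the buckets mentioned here):

```rust
/// Find the first occurrence of `pattern` in `haystack` via Rabin-Karp.
fn rabin_karp(pattern: &[u8], haystack: &[u8]) -> Option<usize> {
    const BASE: u64 = 257;
    let n = pattern.len();
    if n == 0 || haystack.len() < n {
        return None;
    }
    // hash(s) = s[0]*BASE^(n-1) + ... + s[n-1], wrapping (i.e. mod 2^64).
    let mut pow = 1u64; // BASE^(n-1)
    for _ in 1..n {
        pow = pow.wrapping_mul(BASE);
    }
    let hash = |s: &[u8]| {
        s.iter().fold(0u64, |h, &b| h.wrapping_mul(BASE).wrapping_add(b as u64))
    };
    let target = hash(pattern);
    let mut h = hash(&haystack[..n]);
    for i in 0..=haystack.len() - n {
        // Compare hashes first; verify bytes to rule out collisions.
        if h == target && &haystack[i..i + n] == pattern {
            return Some(i);
        }
        if i + n < haystack.len() {
            // Roll the window: drop haystack[i], append haystack[i + n].
            h = h.wrapping_sub(pow.wrapping_mul(haystack[i] as u64))
                .wrapping_mul(BASE)
                .wrapping_add(haystack[i + n] as u64);
        }
    }
    None
}

fn main() {
    assert_eq!(rabin_karp(b"karp", b"rabin-karp"), Some(6));
    assert_eq!(rabin_karp(b"xyz", b"rabin-karp"), None);
}
```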
diff --git a/src/packed/teddy/README.md b/src/packed/teddy/README.md
index 0c42383..51b999b 100644
--- a/src/packed/teddy/README.md
+++ b/src/packed/teddy/README.md
@@ -1,4 +1,4 @@
-Teddy is a simd accelerated multiple substring matching algorithm. The name
+Teddy is a SIMD accelerated multiple substring matching algorithm. The name
and the core ideas in the algorithm were learned from the [Hyperscan][1_u]
project. The implementation in this repository was mostly motivated for use in
accelerating regex searches by searching for small sets of required literals
@@ -341,46 +341,46 @@ runs as described, except with 16 buckets instead of 8.
# References
-- **[1]** [Hyperscan on GitHub](https://github.com/01org/hyperscan),
- [webpage](https://01.org/hyperscan)
+- **[1]** [Hyperscan on GitHub](https://github.com/intel/hyperscan),
+ [webpage](https://www.hyperscan.io/)
- **[2a]** Ben-Kiki, O., Bille, P., Breslauer, D., Gasieniec, L., Grossi, R.,
& Weimann, O. (2011).
_Optimal packed string matching_.
In LIPIcs-Leibniz International Proceedings in Informatics (Vol. 13).
Schloss Dagstuhl-Leibniz-Zentrum fuer Informatik.
DOI: 10.4230/LIPIcs.FSTTCS.2011.423.
- [PDF](http://drops.dagstuhl.de/opus/volltexte/2011/3355/pdf/37.pdf).
+ [PDF](https://drops.dagstuhl.de/opus/volltexte/2011/3355/pdf/37.pdf).
- **[2b]** Ben-Kiki, O., Bille, P., Breslauer, D., Ga̧sieniec, L., Grossi, R.,
& Weimann, O. (2014).
_Towards optimal packed string matching_.
Theoretical Computer Science, 525, 111-129.
DOI: 10.1016/j.tcs.2013.06.013.
- [PDF](http://www.cs.haifa.ac.il/~oren/Publications/bpsm.pdf).
+ [PDF](https://www.cs.haifa.ac.il/~oren/Publications/bpsm.pdf).
- **[3]** Bille, P. (2011).
_Fast searching in packed strings_.
Journal of Discrete Algorithms, 9(1), 49-56.
DOI: 10.1016/j.jda.2010.09.003.
- [PDF](http://www.sciencedirect.com/science/article/pii/S1570866710000353).
+ [PDF](https://www.sciencedirect.com/science/article/pii/S1570866710000353).
- **[4a]** Faro, S., & Külekci, M. O. (2012, October).
_Fast multiple string matching using streaming SIMD extensions technology_.
In String Processing and Information Retrieval (pp. 217-228).
Springer Berlin Heidelberg.
DOI: 10.1007/978-3-642-34109-0_23.
- [PDF](http://www.dmi.unict.it/~faro/papers/conference/faro32.pdf).
+ [PDF](https://www.dmi.unict.it/faro/papers/conference/faro32.pdf).
- **[4b]** Faro, S., & Külekci, M. O. (2013, September).
_Towards a Very Fast Multiple String Matching Algorithm for Short Patterns_.
In Stringology (pp. 78-91).
- [PDF](http://www.dmi.unict.it/~faro/papers/conference/faro36.pdf).
+ [PDF](https://www.dmi.unict.it/faro/papers/conference/faro36.pdf).
- **[4c]** Faro, S., & Külekci, M. O. (2013, January).
_Fast packed string matching for short patterns_.
In Proceedings of the Meeting on Algorithm Engineering & Experiments
(pp. 113-121).
Society for Industrial and Applied Mathematics.
- [PDF](http://arxiv.org/pdf/1209.6449.pdf).
+ [PDF](https://arxiv.org/pdf/1209.6449.pdf).
- **[4d]** Faro, S., & Külekci, M. O. (2014).
_Fast and flexible packed string matching_.
Journal of Discrete Algorithms, 28, 61-72.
DOI: 10.1016/j.jda.2014.07.003.
-[1_u]: https://github.com/01org/hyperscan
+[1_u]: https://github.com/intel/hyperscan
[5_u]: https://software.intel.com/sites/landingpage/IntrinsicsGuide
diff --git a/src/tests.rs b/src/tests.rs
index 25c0d5f..20cd3d1 100644
--- a/src/tests.rs
+++ b/src/tests.rs
@@ -337,6 +337,7 @@ const LEFTMOST_FIRST: &'static [SearchTest] = &[
&[(0, 0, 1), (2, 7, 9),]
),
t!(leftfirst330, &["a", "abab"], "abab", &[(0, 0, 1), (0, 2, 3)]),
+ t!(leftfirst400, &["amwix", "samwise", "sam"], "Zsamwix", &[(2, 1, 4)]),
];
/// Like LEFTMOST_FIRST, but for anchored searches.
@@ -360,6 +361,7 @@ const ANCHORED_LEFTMOST_FIRST: &'static [SearchTest] = &[
&[(0, 0, 1)]
),
t!(aleftfirst330, &["a", "abab"], "abab", &[(0, 0, 1)]),
+ t!(aleftfirst400, &["wise", "samwise", "sam"], "samwix", &[(2, 0, 3)]),
];
/// Tests for non-overlapping leftmost-longest match semantics. These tests