aboutsummaryrefslogtreecommitdiff
path: root/re2/testing/filtered_re2_test.cc
diff options
context:
space:
mode:
Diffstat (limited to 're2/testing/filtered_re2_test.cc')
-rw-r--r--re2/testing/filtered_re2_test.cc275
1 files changed, 275 insertions, 0 deletions
diff --git a/re2/testing/filtered_re2_test.cc b/re2/testing/filtered_re2_test.cc
new file mode 100644
index 0000000..e3a0dd1
--- /dev/null
+++ b/re2/testing/filtered_re2_test.cc
@@ -0,0 +1,275 @@
+// Copyright 2009 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "util/test.h"
+#include "re2/filtered_re2.h"
+#include "re2/re2.h"
+
+DECLARE_int32(filtered_re2_min_atom_len); // From prefilter_tree.cc
+
+namespace re2 {
+
+struct FilterTestVars {
+ vector<string> atoms;
+ vector<int> atom_indices;
+ vector<int> matches;
+ RE2::Options opts;
+ FilteredRE2 f;
+};
+
+TEST(FilteredRE2Test, EmptyTest) {
+ FilterTestVars v;
+ v.f.AllMatches("foo", v.atom_indices, &v.matches);
+ EXPECT_EQ(0, v.matches.size());
+}
+
+TEST(FilteredRE2Test, SmallOrTest) {
+ FLAGS_filtered_re2_min_atom_len = 4;
+
+ FilterTestVars v;
+ int id;
+ v.f.Add("(foo|bar)", v.opts, &id);
+
+ v.f.Compile(&v.atoms);
+ EXPECT_EQ(0, v.atoms.size());
+
+ v.f.AllMatches("lemurs bar", v.atom_indices, &v.matches);
+ EXPECT_EQ(1, v.matches.size());
+ EXPECT_EQ(id, v.matches[0]);
+}
+
+TEST(FilteredRE2Test, SmallLatinTest) {
+ FLAGS_filtered_re2_min_atom_len = 3;
+ FilterTestVars v;
+ int id;
+
+ v.opts.set_utf8(false);
+ v.f.Add("\xde\xadQ\xbe\xef", v.opts, &id);
+ v.f.Compile(&v.atoms);
+ EXPECT_EQ(1, v.atoms.size());
+ EXPECT_EQ(v.atoms[0], "\xde\xadq\xbe\xef");
+
+ v.atom_indices.push_back(0);
+ v.f.AllMatches("foo\xde\xadQ\xbe\xeflemur", v.atom_indices, &v.matches);
+ EXPECT_EQ(1, v.matches.size());
+ EXPECT_EQ(id, v.matches[0]);
+}
+
+struct AtomTest {
+ const char* testname;
+ // If any test needs more than this many regexps or atoms, increase
+ // the size of the corresponding array.
+ const char* regexps[20];
+ const char* atoms[20];
+};
+
+AtomTest atom_tests[] = {
+ {
+ // This test checks to make sure empty patterns are allowed.
+ "CheckEmptyPattern",
+ {""},
+ {}
+ }, {
+ // This test checks that all atoms of length greater than min length
+ // are found, and no atoms that are of smaller length are found.
+ "AllAtomsGtMinLengthFound", {
+ "(abc123|def456|ghi789).*mnop[x-z]+",
+ "abc..yyy..zz",
+ "mnmnpp[a-z]+PPP"
+ }, {
+ "abc123",
+ "def456",
+ "ghi789",
+ "mnop",
+ "abc",
+ "yyy",
+ "mnmnpp",
+ "ppp"
+ }
+ }, {
+ // Test to make sure that any atoms that have another atom as a
+ // substring in an OR are removed; that is, only the shortest
+ // substring is kept.
+ "SubstrAtomRemovesSuperStrInOr", {
+ "(abc123|abc|ghi789|abc1234).*[x-z]+",
+ "abcd..yyy..yyyzzz",
+ "mnmnpp[a-z]+PPP"
+ }, {
+ "abc",
+ "ghi789",
+ "abcd",
+ "yyy",
+ "yyyzzz",
+ "mnmnpp",
+ "ppp"
+ }
+ }, {
+ // Test character class expansion.
+ "CharClassExpansion", {
+ "m[a-c][d-f]n.*[x-z]+",
+ "[x-y]bcde[ab]"
+ }, {
+ "madn", "maen", "mafn",
+ "mbdn", "mben", "mbfn",
+ "mcdn", "mcen", "mcfn",
+ "xbcdea", "xbcdeb",
+ "ybcdea", "ybcdeb"
+ }
+ }, {
+ // Test upper/lower of non-ASCII.
+ "UnicodeLower", {
+ "(?i)ΔδΠϖπΣςσ",
+ "ΛΜΝΟΠ",
+ "ψρστυ",
+ }, {
+ "δδπππσσσ",
+ "λμνοπ",
+ "ψρστυ",
+ },
+ },
+};
+
+void AddRegexpsAndCompile(const char* regexps[],
+ int n,
+ struct FilterTestVars* v) {
+ for (int i = 0; i < n; i++) {
+ int id;
+ v->f.Add(regexps[i], v->opts, &id);
+ }
+ v->f.Compile(&v->atoms);
+}
+
+bool CheckExpectedAtoms(const char* atoms[],
+ int n,
+ const char* testname,
+ struct FilterTestVars* v) {
+ vector<string> expected;
+ for (int i = 0; i < n; i++)
+ expected.push_back(atoms[i]);
+
+ bool pass = expected.size() == v->atoms.size();
+
+ sort(v->atoms.begin(), v->atoms.end());
+ sort(expected.begin(), expected.end());
+ for (int i = 0; pass && i < n; i++)
+ pass = pass && expected[i] == v->atoms[i];
+
+ if (!pass) {
+ LOG(WARNING) << "Failed " << testname;
+ LOG(WARNING) << "Expected #atoms = " << expected.size();
+ for (int i = 0; i < expected.size(); i++)
+ LOG(WARNING) << expected[i];
+ LOG(WARNING) << "Found #atoms = " << v->atoms.size();
+ for (int i = 0; i < v->atoms.size(); i++)
+ LOG(WARNING) << v->atoms[i];
+ }
+
+ return pass;
+}
+
+TEST(FilteredRE2Test, AtomTests) {
+ FLAGS_filtered_re2_min_atom_len = 3;
+
+ int nfail = 0;
+ for (int i = 0; i < arraysize(atom_tests); i++) {
+ FilterTestVars v;
+ AtomTest* t = &atom_tests[i];
+ int natom, nregexp;
+ for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++)
+ if (t->regexps[nregexp] == NULL)
+ break;
+ for (natom = 0; natom < arraysize(t->atoms); natom++)
+ if (t->atoms[natom] == NULL)
+ break;
+ AddRegexpsAndCompile(t->regexps, nregexp, &v);
+ if (!CheckExpectedAtoms(t->atoms, natom, t->testname, &v))
+ nfail++;
+ }
+ EXPECT_EQ(0, nfail);
+}
+
+void FindAtomIndices(const vector<string> atoms,
+ const vector<string> matched_atoms,
+ vector<int>* atom_indices) {
+ atom_indices->clear();
+ for (int i = 0; i < matched_atoms.size(); i++) {
+ int j = 0;
+ for (; j < atoms.size(); j++) {
+ if (matched_atoms[i] == atoms[j]) {
+ atom_indices->push_back(j);
+ break;
+ }
+ EXPECT_LT(j, atoms.size());
+ }
+ }
+}
+
+TEST(FilteredRE2Test, MatchEmptyPattern) {
+ FLAGS_filtered_re2_min_atom_len = 3;
+
+ FilterTestVars v;
+ AtomTest* t = &atom_tests[0];
+ // We are using the regexps used in one of the atom tests
+ // for this test. Adding the EXPECT here to make sure
+ // the index we use for the test is for the correct test.
+ EXPECT_EQ("CheckEmptyPattern", string(t->testname));
+ int nregexp;
+ for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++)
+ if (t->regexps[nregexp] == NULL)
+ break;
+ AddRegexpsAndCompile(t->regexps, nregexp, &v);
+ string text = "0123";
+ vector<int> atom_ids;
+ vector<int> matching_regexps;
+ EXPECT_EQ(0, v.f.FirstMatch(text, atom_ids));
+}
+
+TEST(FilteredRE2Test, MatchTests) {
+ FLAGS_filtered_re2_min_atom_len = 3;
+
+ FilterTestVars v;
+ AtomTest* t = &atom_tests[2];
+ // We are using the regexps used in one of the atom tests
+ // for this test.
+ EXPECT_EQ("SubstrAtomRemovesSuperStrInOr", string(t->testname));
+ int nregexp;
+ for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++)
+ if (t->regexps[nregexp] == NULL)
+ break;
+ AddRegexpsAndCompile(t->regexps, nregexp, &v);
+
+ string text = "abc121212xyz";
+ // atoms = abc
+ vector<int> atom_ids;
+ vector<string> atoms;
+ atoms.push_back("abc");
+ FindAtomIndices(v.atoms, atoms, &atom_ids);
+ vector<int> matching_regexps;
+ v.f.AllMatches(text, atom_ids, &matching_regexps);
+ EXPECT_EQ(1, matching_regexps.size());
+
+ text = "abc12312yyyzzz";
+ atoms.clear();
+ atoms.push_back("abc");
+ atoms.push_back("yyy");
+ atoms.push_back("yyyzzz");
+ FindAtomIndices(v.atoms, atoms, &atom_ids);
+ v.f.AllMatches(text, atom_ids, &matching_regexps);
+ EXPECT_EQ(1, matching_regexps.size());
+
+ text = "abcd12yyy32yyyzzz";
+ atoms.clear();
+ atoms.push_back("abc");
+ atoms.push_back("abcd");
+ atoms.push_back("yyy");
+ atoms.push_back("yyyzzz");
+ FindAtomIndices(v.atoms, atoms, &atom_ids);
+ LOG(INFO) << "S: " << atom_ids.size();
+ for (int i = 0; i < atom_ids.size(); i++)
+ LOG(INFO) << "i: " << i << " : " << atom_ids[i];
+ v.f.AllMatches(text, atom_ids, &matching_regexps);
+ EXPECT_EQ(2, matching_regexps.size());
+}
+
+} // namespace re2