aboutsummaryrefslogtreecommitdiff
path: root/re2/testing/compile_test.cc
diff options
context:
space:
mode:
Diffstat (limited to 're2/testing/compile_test.cc')
-rw-r--r--re2/testing/compile_test.cc396
1 files changed, 311 insertions, 85 deletions
diff --git a/re2/testing/compile_test.cc b/re2/testing/compile_test.cc
index 8d92105..d89d80f 100644
--- a/re2/testing/compile_test.cc
+++ b/re2/testing/compile_test.cc
@@ -5,13 +5,12 @@
// Test prog.cc, compile.cc
#include <string>
-#include <vector>
+
#include "util/test.h"
+#include "util/logging.h"
#include "re2/regexp.h"
#include "re2/prog.h"
-DEFINE_string(show, "", "regular expression to compile and dump");
-
namespace re2 {
// Simple input/output tests checking that
@@ -27,78 +26,89 @@ struct Test {
static Test tests[] = {
{ "a",
- "1. byte [61-61] -> 2\n"
- "2. match! 0\n" },
+ "3. byte [61-61] -> 4\n"
+ "4. match! 0\n" },
{ "ab",
- "1. byte [61-61] -> 2\n"
- "2. byte [62-62] -> 3\n"
- "3. match! 0\n" },
+ "3. byte [61-61] -> 4\n"
+ "4. byte [62-62] -> 5\n"
+ "5. match! 0\n" },
{ "a|c",
- "3. alt -> 1 | 2\n"
- "1. byte [61-61] -> 4\n"
- "2. byte [63-63] -> 4\n"
- "4. match! 0\n" },
+ "3+ byte [61-61] -> 5\n"
+ "4. byte [63-63] -> 5\n"
+ "5. match! 0\n" },
{ "a|b",
- "1. byte [61-62] -> 2\n"
- "2. match! 0\n" },
+ "3. byte [61-62] -> 4\n"
+ "4. match! 0\n" },
{ "[ab]",
- "1. byte [61-62] -> 2\n"
- "2. match! 0\n" },
+ "3. byte [61-62] -> 4\n"
+ "4. match! 0\n" },
{ "a+",
- "1. byte [61-61] -> 2\n"
- "2. alt -> 1 | 3\n"
- "3. match! 0\n" },
+ "3. byte [61-61] -> 4\n"
+ "4+ nop -> 3\n"
+ "5. match! 0\n" },
{ "a+?",
- "1. byte [61-61] -> 2\n"
- "2. alt -> 3 | 1\n"
- "3. match! 0\n" },
+ "3. byte [61-61] -> 4\n"
+ "4+ match! 0\n"
+ "5. nop -> 3\n" },
{ "a*",
- "2. alt -> 1 | 3\n"
- "1. byte [61-61] -> 2\n"
- "3. match! 0\n" },
+ "3+ byte [61-61] -> 3\n"
+ "4. match! 0\n" },
{ "a*?",
- "2. alt -> 3 | 1\n"
- "3. match! 0\n"
- "1. byte [61-61] -> 2\n" },
+ "3+ match! 0\n"
+ "4. byte [61-61] -> 3\n" },
{ "a?",
- "2. alt -> 1 | 3\n"
- "1. byte [61-61] -> 3\n"
- "3. match! 0\n" },
+ "3+ byte [61-61] -> 5\n"
+ "4. nop -> 5\n"
+ "5. match! 0\n" },
{ "a??",
- "2. alt -> 3 | 1\n"
- "3. match! 0\n"
- "1. byte [61-61] -> 3\n" },
+ "3+ nop -> 5\n"
+ "4. byte [61-61] -> 5\n"
+ "5. match! 0\n" },
{ "a{4}",
- "1. byte [61-61] -> 2\n"
- "2. byte [61-61] -> 3\n"
"3. byte [61-61] -> 4\n"
"4. byte [61-61] -> 5\n"
- "5. match! 0\n" },
+ "5. byte [61-61] -> 6\n"
+ "6. byte [61-61] -> 7\n"
+ "7. match! 0\n" },
{ "(a)",
- "2. capture 2 -> 1\n"
- "1. byte [61-61] -> 3\n"
- "3. capture 3 -> 4\n"
- "4. match! 0\n" },
+ "3. capture 2 -> 4\n"
+ "4. byte [61-61] -> 5\n"
+ "5. capture 3 -> 6\n"
+ "6. match! 0\n" },
{ "(?:a)",
- "1. byte [61-61] -> 2\n"
- "2. match! 0\n" },
+ "3. byte [61-61] -> 4\n"
+ "4. match! 0\n" },
{ "",
- "2. match! 0\n" },
+ "3. match! 0\n" },
{ ".",
- "3. alt -> 1 | 2\n"
- "1. byte [00-09] -> 4\n"
- "2. byte [0b-ff] -> 4\n"
- "4. match! 0\n" },
+ "3+ byte [00-09] -> 5\n"
+ "4. byte [0b-ff] -> 5\n"
+ "5. match! 0\n" },
{ "[^ab]",
- "5. alt -> 3 | 4\n"
- "3. alt -> 1 | 2\n"
- "4. byte [63-ff] -> 6\n"
- "1. byte [00-09] -> 6\n"
- "2. byte [0b-60] -> 6\n"
+ "3+ byte [00-09] -> 6\n"
+ "4+ byte [0b-60] -> 6\n"
+ "5. byte [63-ff] -> 6\n"
"6. match! 0\n" },
{ "[Aa]",
- "1. byte/i [61-61] -> 2\n"
- "2. match! 0\n" },
+ "3. byte/i [61-61] -> 4\n"
+ "4. match! 0\n" },
+ { "\\C+",
+ "3. byte [00-ff] -> 4\n"
+ "4+ altmatch -> 5 | 6\n"
+ "5+ nop -> 3\n"
+ "6. match! 0\n" },
+ { "\\C*",
+ "3+ altmatch -> 4 | 5\n"
+ "4+ byte [00-ff] -> 3\n"
+ "5. match! 0\n" },
+ { "\\C?",
+ "3+ byte [00-ff] -> 5\n"
+ "4. nop -> 5\n"
+ "5. match! 0\n" },
+ // Issue 20992936
+ { "[[-`]",
+ "3. byte [5b-60] -> 4\n"
+ "4. match! 0\n" },
};
TEST(TestRegexpCompileToProg, Simple) {
@@ -118,7 +128,7 @@ TEST(TestRegexpCompileToProg, Simple) {
failed++;
continue;
}
- CHECK(re->CompileToProg(1) == NULL);
+ ASSERT_TRUE(re->CompileToProg(1) == NULL);
string s = prog->Dump();
if (s != t.code) {
LOG(ERROR) << "Incorrect compiled code for: " << t.regexp;
@@ -132,40 +142,256 @@ TEST(TestRegexpCompileToProg, Simple) {
EXPECT_EQ(failed, 0);
}
-// The distinct byte ranges involved in the UTF-8 dot ([^\n]).
-// Once, erroneously split between 0x3f and 0x40 because it is
-// a 6-bit boundary.
-static struct UTF8ByteRange {
- int lo;
- int hi;
-} utf8ranges[] = {
- { 0x00, 0x09 },
- { 0x0A, 0x0A },
- { 0x10, 0x7F },
- { 0x80, 0x8F },
- { 0x90, 0x9F },
- { 0xA0, 0xBF },
- { 0xC0, 0xC1 },
- { 0xC2, 0xDF },
- { 0xE0, 0xE0 },
- { 0xE1, 0xEF },
- { 0xF0, 0xF0 },
- { 0xF1, 0xF3 },
- { 0xF4, 0xF4 },
- { 0xF5, 0xFF },
-};
-
-TEST(TestCompile, ByteRanges) {
- Regexp* re = Regexp::Parse(".", Regexp::PerlX, NULL);
+static void DumpByteMap(StringPiece pattern, Regexp::ParseFlags flags,
+ string* bytemap) {
+ Regexp* re = Regexp::Parse(pattern, flags, NULL);
EXPECT_TRUE(re != NULL);
+
Prog* prog = re->CompileToProg(0);
EXPECT_TRUE(prog != NULL);
- EXPECT_EQ(prog->bytemap_range(), arraysize(utf8ranges));
- for (int i = 0; i < arraysize(utf8ranges); i++)
- for (int j = utf8ranges[i].lo; j <= utf8ranges[i].hi; j++)
- EXPECT_EQ(prog->bytemap()[j], i) << " byte " << j;
+ *bytemap = prog->DumpByteMap();
delete prog;
+
re->Decref();
}
+TEST(TestCompile, Latin1Ranges) {
+ // The distinct byte ranges involved in the Latin-1 dot ([^\n]).
+
+ string bytemap;
+
+ DumpByteMap(".", Regexp::PerlX|Regexp::Latin1, &bytemap);
+ EXPECT_EQ("[00-09] -> 0\n"
+ "[0a-0a] -> 1\n"
+ "[0b-ff] -> 0\n",
+ bytemap);
+}
+
+TEST(TestCompile, OtherByteMapTests) {
+ string bytemap;
+
+ // Test that "absent" ranges are mapped to the same byte class.
+ DumpByteMap("[0-9A-Fa-f]+", Regexp::PerlX|Regexp::Latin1, &bytemap);
+ EXPECT_EQ("[00-2f] -> 0\n"
+ "[30-39] -> 1\n"
+ "[3a-40] -> 0\n"
+ "[41-46] -> 1\n"
+ "[47-60] -> 0\n"
+ "[61-66] -> 1\n"
+ "[67-ff] -> 0\n",
+ bytemap);
+
+ // Test the byte classes for \b.
+ DumpByteMap("\\b", Regexp::LikePerl|Regexp::Latin1, &bytemap);
+ EXPECT_EQ("[00-2f] -> 0\n"
+ "[30-39] -> 1\n"
+ "[3a-40] -> 0\n"
+ "[41-5a] -> 1\n"
+ "[5b-5e] -> 0\n"
+ "[5f-5f] -> 1\n"
+ "[60-60] -> 0\n"
+ "[61-7a] -> 1\n"
+ "[7b-ff] -> 0\n",
+ bytemap);
+
+ // Bug in the ASCII case-folding optimization created too many byte classes.
+ DumpByteMap("[^_]", Regexp::LikePerl|Regexp::Latin1, &bytemap);
+ EXPECT_EQ("[00-5e] -> 0\n"
+ "[5f-5f] -> 1\n"
+ "[60-ff] -> 0\n",
+ bytemap);
+}
+
+TEST(TestCompile, UTF8Ranges) {
+ // The distinct byte ranges involved in the UTF-8 dot ([^\n]).
+ // Once, erroneously split between 0x3f and 0x40 because it is
+ // a 6-bit boundary.
+
+ string bytemap;
+
+ DumpByteMap(".", Regexp::PerlX, &bytemap);
+ EXPECT_EQ("[00-09] -> 0\n"
+ "[0a-0a] -> 1\n"
+ "[0b-7f] -> 0\n"
+ "[80-8f] -> 2\n"
+ "[90-9f] -> 3\n"
+ "[a0-bf] -> 4\n"
+ "[c0-c1] -> 1\n"
+ "[c2-df] -> 5\n"
+ "[e0-e0] -> 6\n"
+ "[e1-ef] -> 7\n"
+ "[f0-f0] -> 8\n"
+ "[f1-f3] -> 9\n"
+ "[f4-f4] -> 10\n"
+ "[f5-ff] -> 1\n",
+ bytemap);
+}
+
+TEST(TestCompile, InsufficientMemory) {
+ Regexp* re = Regexp::Parse(
+ "^(?P<name1>[^\\s]+)\\s+(?P<name2>[^\\s]+)\\s+(?P<name3>.+)$",
+ Regexp::LikePerl, NULL);
+ EXPECT_TRUE(re != NULL);
+ Prog* prog = re->CompileToProg(920);
+ // If the memory budget has been exhausted, compilation should fail
+ // and return NULL instead of trying to do anything with NoMatch().
+ EXPECT_TRUE(prog == NULL);
+ re->Decref();
+}
+
+static void Dump(StringPiece pattern, Regexp::ParseFlags flags,
+ string* forward, string* reverse) {
+ Regexp* re = Regexp::Parse(pattern, flags, NULL);
+ EXPECT_TRUE(re != NULL);
+
+ if (forward != NULL) {
+ Prog* prog = re->CompileToProg(0);
+ EXPECT_TRUE(prog != NULL);
+ *forward = prog->Dump();
+ delete prog;
+ }
+
+ if (reverse != NULL) {
+ Prog* prog = re->CompileToReverseProg(0);
+ EXPECT_TRUE(prog != NULL);
+ *reverse = prog->Dump();
+ delete prog;
+ }
+
+ re->Decref();
+}
+
+TEST(TestCompile, Bug26705922) {
+ // Bug in the compiler caused inefficient bytecode to be generated for Unicode
+ // groups: common suffixes were cached, but common prefixes were not factored.
+
+ string forward, reverse;
+
+ Dump("[\\x{10000}\\x{10010}]", Regexp::LikePerl, &forward, &reverse);
+ EXPECT_EQ("3. byte [f0-f0] -> 4\n"
+ "4. byte [90-90] -> 5\n"
+ "5. byte [80-80] -> 6\n"
+ "6+ byte [80-80] -> 8\n"
+ "7. byte [90-90] -> 8\n"
+ "8. match! 0\n",
+ forward);
+ EXPECT_EQ("3+ byte [80-80] -> 5\n"
+ "4. byte [90-90] -> 5\n"
+ "5. byte [80-80] -> 6\n"
+ "6. byte [90-90] -> 7\n"
+ "7. byte [f0-f0] -> 8\n"
+ "8. match! 0\n",
+ reverse);
+
+ Dump("[\\x{8000}-\\x{10FFF}]", Regexp::LikePerl, &forward, &reverse);
+ EXPECT_EQ("3+ byte [e8-ef] -> 5\n"
+ "4. byte [f0-f0] -> 8\n"
+ "5. byte [80-bf] -> 6\n"
+ "6. byte [80-bf] -> 7\n"
+ "7. match! 0\n"
+ "8. byte [90-90] -> 5\n",
+ forward);
+ EXPECT_EQ("3. byte [80-bf] -> 4\n"
+ "4. byte [80-bf] -> 5\n"
+ "5+ byte [e8-ef] -> 7\n"
+ "6. byte [90-90] -> 8\n"
+ "7. match! 0\n"
+ "8. byte [f0-f0] -> 7\n",
+ reverse);
+
+ Dump("[\\x{80}-\\x{10FFFF}]", Regexp::LikePerl, NULL, &reverse);
+ EXPECT_EQ("3. byte [80-bf] -> 4\n"
+ "4+ byte [c2-df] -> 7\n"
+ "5+ byte [a0-bf] -> 8\n"
+ "6. byte [80-bf] -> 9\n"
+ "7. match! 0\n"
+ "8. byte [e0-e0] -> 7\n"
+ "9+ byte [e1-ef] -> 7\n"
+ "10+ byte [90-bf] -> 13\n"
+ "11+ byte [80-bf] -> 14\n"
+ "12. byte [80-8f] -> 15\n"
+ "13. byte [f0-f0] -> 7\n"
+ "14. byte [f1-f3] -> 7\n"
+ "15. byte [f4-f4] -> 7\n",
+ reverse);
+}
+
+TEST(TestCompile, Bug35237384) {
+ // Bug in the compiler caused inefficient bytecode to be generated for
+ // nested nullable subexpressions.
+
+ string forward;
+
+ Dump("a**{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
+ EXPECT_EQ("3+ byte [61-61] -> 3\n"
+ "4. nop -> 5\n"
+ "5+ byte [61-61] -> 5\n"
+ "6. nop -> 7\n"
+ "7+ byte [61-61] -> 7\n"
+ "8. match! 0\n",
+ forward);
+
+ Dump("(a*|b*)*{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
+ EXPECT_EQ("3+ nop -> 6\n"
+ "4+ nop -> 8\n"
+ "5. nop -> 21\n"
+ "6+ byte [61-61] -> 6\n"
+ "7. nop -> 3\n"
+ "8+ byte [62-62] -> 8\n"
+ "9. nop -> 3\n"
+ "10+ byte [61-61] -> 10\n"
+ "11. nop -> 21\n"
+ "12+ byte [62-62] -> 12\n"
+ "13. nop -> 21\n"
+ "14+ byte [61-61] -> 14\n"
+ "15. nop -> 18\n"
+ "16+ byte [62-62] -> 16\n"
+ "17. nop -> 18\n"
+ "18+ nop -> 14\n"
+ "19+ nop -> 16\n"
+ "20. match! 0\n"
+ "21+ nop -> 10\n"
+ "22+ nop -> 12\n"
+ "23. nop -> 18\n",
+ forward);
+
+ Dump("((|S.+)+|(|S.+)+|){2}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
+ EXPECT_EQ("3+ nop -> 36\n"
+ "4+ nop -> 31\n"
+ "5. nop -> 33\n"
+ "6+ byte [00-09] -> 8\n"
+ "7. byte [0b-ff] -> 8\n"
+ "8+ nop -> 6\n"
+ "9+ nop -> 29\n"
+ "10. nop -> 28\n"
+ "11+ byte [00-09] -> 13\n"
+ "12. byte [0b-ff] -> 13\n"
+ "13+ nop -> 11\n"
+ "14+ nop -> 26\n"
+ "15. nop -> 28\n"
+ "16+ byte [00-09] -> 18\n"
+ "17. byte [0b-ff] -> 18\n"
+ "18+ nop -> 16\n"
+ "19+ nop -> 36\n"
+ "20. nop -> 33\n"
+ "21+ byte [00-09] -> 23\n"
+ "22. byte [0b-ff] -> 23\n"
+ "23+ nop -> 21\n"
+ "24+ nop -> 31\n"
+ "25. nop -> 33\n"
+ "26+ nop -> 28\n"
+ "27. byte [53-53] -> 11\n"
+ "28. match! 0\n"
+ "29+ nop -> 28\n"
+ "30. byte [53-53] -> 6\n"
+ "31+ nop -> 33\n"
+ "32. byte [53-53] -> 21\n"
+ "33+ nop -> 29\n"
+ "34+ nop -> 26\n"
+ "35. nop -> 28\n"
+ "36+ nop -> 33\n"
+ "37. byte [53-53] -> 16\n",
+ forward);
+}
+
} // namespace re2