diff options
Diffstat (limited to 're2/testing/compile_test.cc')
-rw-r--r-- | re2/testing/compile_test.cc | 396 |
1 files changed, 311 insertions, 85 deletions
diff --git a/re2/testing/compile_test.cc b/re2/testing/compile_test.cc index 8d92105..d89d80f 100644 --- a/re2/testing/compile_test.cc +++ b/re2/testing/compile_test.cc @@ -5,13 +5,12 @@ // Test prog.cc, compile.cc #include <string> -#include <vector> + #include "util/test.h" +#include "util/logging.h" #include "re2/regexp.h" #include "re2/prog.h" -DEFINE_string(show, "", "regular expression to compile and dump"); - namespace re2 { // Simple input/output tests checking that @@ -27,78 +26,89 @@ struct Test { static Test tests[] = { { "a", - "1. byte [61-61] -> 2\n" - "2. match! 0\n" }, + "3. byte [61-61] -> 4\n" + "4. match! 0\n" }, { "ab", - "1. byte [61-61] -> 2\n" - "2. byte [62-62] -> 3\n" - "3. match! 0\n" }, + "3. byte [61-61] -> 4\n" + "4. byte [62-62] -> 5\n" + "5. match! 0\n" }, { "a|c", - "3. alt -> 1 | 2\n" - "1. byte [61-61] -> 4\n" - "2. byte [63-63] -> 4\n" - "4. match! 0\n" }, + "3+ byte [61-61] -> 5\n" + "4. byte [63-63] -> 5\n" + "5. match! 0\n" }, { "a|b", - "1. byte [61-62] -> 2\n" - "2. match! 0\n" }, + "3. byte [61-62] -> 4\n" + "4. match! 0\n" }, { "[ab]", - "1. byte [61-62] -> 2\n" - "2. match! 0\n" }, + "3. byte [61-62] -> 4\n" + "4. match! 0\n" }, { "a+", - "1. byte [61-61] -> 2\n" - "2. alt -> 1 | 3\n" - "3. match! 0\n" }, + "3. byte [61-61] -> 4\n" + "4+ nop -> 3\n" + "5. match! 0\n" }, { "a+?", - "1. byte [61-61] -> 2\n" - "2. alt -> 3 | 1\n" - "3. match! 0\n" }, + "3. byte [61-61] -> 4\n" + "4+ match! 0\n" + "5. nop -> 3\n" }, { "a*", - "2. alt -> 1 | 3\n" - "1. byte [61-61] -> 2\n" - "3. match! 0\n" }, + "3+ byte [61-61] -> 3\n" + "4. match! 0\n" }, { "a*?", - "2. alt -> 3 | 1\n" - "3. match! 0\n" - "1. byte [61-61] -> 2\n" }, + "3+ match! 0\n" + "4. byte [61-61] -> 3\n" }, { "a?", - "2. alt -> 1 | 3\n" - "1. byte [61-61] -> 3\n" - "3. match! 0\n" }, + "3+ byte [61-61] -> 5\n" + "4. nop -> 5\n" + "5. match! 0\n" }, { "a??", - "2. alt -> 3 | 1\n" - "3. match! 0\n" - "1. byte [61-61] -> 3\n" }, + "3+ nop -> 5\n" + "4. byte [61-61] -> 5\n" + "5. match! 0\n" }, { "a{4}", - "1. byte [61-61] -> 2\n" - "2. byte [61-61] -> 3\n" "3. byte [61-61] -> 4\n" "4. byte [61-61] -> 5\n" - "5. match! 0\n" }, + "5. byte [61-61] -> 6\n" + "6. byte [61-61] -> 7\n" + "7. match! 0\n" }, { "(a)", - "2. capture 2 -> 1\n" - "1. byte [61-61] -> 3\n" - "3. capture 3 -> 4\n" - "4. match! 0\n" }, + "3. capture 2 -> 4\n" + "4. byte [61-61] -> 5\n" + "5. capture 3 -> 6\n" + "6. match! 0\n" }, { "(?:a)", - "1. byte [61-61] -> 2\n" - "2. match! 0\n" }, + "3. byte [61-61] -> 4\n" + "4. match! 0\n" }, { "", - "2. match! 0\n" }, + "3. match! 0\n" }, { ".", - "3. alt -> 1 | 2\n" - "1. byte [00-09] -> 4\n" - "2. byte [0b-ff] -> 4\n" - "4. match! 0\n" }, + "3+ byte [00-09] -> 5\n" + "4. byte [0b-ff] -> 5\n" + "5. match! 0\n" }, { "[^ab]", - "5. alt -> 3 | 4\n" - "3. alt -> 1 | 2\n" - "4. byte [63-ff] -> 6\n" - "1. byte [00-09] -> 6\n" - "2. byte [0b-60] -> 6\n" + "3+ byte [00-09] -> 6\n" + "4+ byte [0b-60] -> 6\n" + "5. byte [63-ff] -> 6\n" "6. match! 0\n" }, { "[Aa]", - "1. byte/i [61-61] -> 2\n" - "2. match! 0\n" }, + "3. byte/i [61-61] -> 4\n" + "4. match! 0\n" }, + { "\\C+", + "3. byte [00-ff] -> 4\n" + "4+ altmatch -> 5 | 6\n" + "5+ nop -> 3\n" + "6. match! 0\n" }, + { "\\C*", + "3+ altmatch -> 4 | 5\n" + "4+ byte [00-ff] -> 3\n" + "5. match! 0\n" }, + { "\\C?", + "3+ byte [00-ff] -> 5\n" + "4. nop -> 5\n" + "5. match! 0\n" }, + // Issue 20992936 + { "[[-`]", + "3. byte [5b-60] -> 4\n" + "4. match! 0\n" }, }; TEST(TestRegexpCompileToProg, Simple) { @@ -118,7 +128,7 @@ TEST(TestRegexpCompileToProg, Simple) { failed++; continue; } - CHECK(re->CompileToProg(1) == NULL); + ASSERT_TRUE(re->CompileToProg(1) == NULL); string s = prog->Dump(); if (s != t.code) { LOG(ERROR) << "Incorrect compiled code for: " << t.regexp; @@ -132,40 +142,256 @@ TEST(TestRegexpCompileToProg, Simple) { EXPECT_EQ(failed, 0); } -// The distinct byte ranges involved in the UTF-8 dot ([^\n]). -// Once, erroneously split between 0x3f and 0x40 because it is -// a 6-bit boundary. -static struct UTF8ByteRange { - int lo; - int hi; -} utf8ranges[] = { - { 0x00, 0x09 }, - { 0x0A, 0x0A }, - { 0x10, 0x7F }, - { 0x80, 0x8F }, - { 0x90, 0x9F }, - { 0xA0, 0xBF }, - { 0xC0, 0xC1 }, - { 0xC2, 0xDF }, - { 0xE0, 0xE0 }, - { 0xE1, 0xEF }, - { 0xF0, 0xF0 }, - { 0xF1, 0xF3 }, - { 0xF4, 0xF4 }, - { 0xF5, 0xFF }, -}; - -TEST(TestCompile, ByteRanges) { - Regexp* re = Regexp::Parse(".", Regexp::PerlX, NULL); +static void DumpByteMap(StringPiece pattern, Regexp::ParseFlags flags, + string* bytemap) { + Regexp* re = Regexp::Parse(pattern, flags, NULL); EXPECT_TRUE(re != NULL); + Prog* prog = re->CompileToProg(0); EXPECT_TRUE(prog != NULL); - EXPECT_EQ(prog->bytemap_range(), arraysize(utf8ranges)); - for (int i = 0; i < arraysize(utf8ranges); i++) - for (int j = utf8ranges[i].lo; j <= utf8ranges[i].hi; j++) - EXPECT_EQ(prog->bytemap()[j], i) << " byte " << j; + *bytemap = prog->DumpByteMap(); delete prog; + re->Decref(); } +TEST(TestCompile, Latin1Ranges) { + // The distinct byte ranges involved in the Latin-1 dot ([^\n]). + + string bytemap; + + DumpByteMap(".", Regexp::PerlX|Regexp::Latin1, &bytemap); + EXPECT_EQ("[00-09] -> 0\n" + "[0a-0a] -> 1\n" + "[0b-ff] -> 0\n", + bytemap); +} + +TEST(TestCompile, OtherByteMapTests) { + string bytemap; + + // Test that "absent" ranges are mapped to the same byte class. + DumpByteMap("[0-9A-Fa-f]+", Regexp::PerlX|Regexp::Latin1, &bytemap); + EXPECT_EQ("[00-2f] -> 0\n" + "[30-39] -> 1\n" + "[3a-40] -> 0\n" + "[41-46] -> 1\n" + "[47-60] -> 0\n" + "[61-66] -> 1\n" + "[67-ff] -> 0\n", + bytemap); + + // Test the byte classes for \b. + DumpByteMap("\\b", Regexp::LikePerl|Regexp::Latin1, &bytemap); + EXPECT_EQ("[00-2f] -> 0\n" + "[30-39] -> 1\n" + "[3a-40] -> 0\n" + "[41-5a] -> 1\n" + "[5b-5e] -> 0\n" + "[5f-5f] -> 1\n" + "[60-60] -> 0\n" + "[61-7a] -> 1\n" + "[7b-ff] -> 0\n", + bytemap); + + // Bug in the ASCII case-folding optimization created too many byte classes. + DumpByteMap("[^_]", Regexp::LikePerl|Regexp::Latin1, &bytemap); + EXPECT_EQ("[00-5e] -> 0\n" + "[5f-5f] -> 1\n" + "[60-ff] -> 0\n", + bytemap); +} + +TEST(TestCompile, UTF8Ranges) { + // The distinct byte ranges involved in the UTF-8 dot ([^\n]). + // Once, erroneously split between 0x3f and 0x40 because it is + // a 6-bit boundary. + + string bytemap; + + DumpByteMap(".", Regexp::PerlX, &bytemap); + EXPECT_EQ("[00-09] -> 0\n" + "[0a-0a] -> 1\n" + "[0b-7f] -> 0\n" + "[80-8f] -> 2\n" + "[90-9f] -> 3\n" + "[a0-bf] -> 4\n" + "[c0-c1] -> 1\n" + "[c2-df] -> 5\n" + "[e0-e0] -> 6\n" + "[e1-ef] -> 7\n" + "[f0-f0] -> 8\n" + "[f1-f3] -> 9\n" + "[f4-f4] -> 10\n" + "[f5-ff] -> 1\n", + bytemap); +} + +TEST(TestCompile, InsufficientMemory) { + Regexp* re = Regexp::Parse( + "^(?P<name1>[^\\s]+)\\s+(?P<name2>[^\\s]+)\\s+(?P<name3>.+)$", + Regexp::LikePerl, NULL); + EXPECT_TRUE(re != NULL); + Prog* prog = re->CompileToProg(920); + // If the memory budget has been exhausted, compilation should fail + // and return NULL instead of trying to do anything with NoMatch(). + EXPECT_TRUE(prog == NULL); + re->Decref(); +} + +static void Dump(StringPiece pattern, Regexp::ParseFlags flags, + string* forward, string* reverse) { + Regexp* re = Regexp::Parse(pattern, flags, NULL); + EXPECT_TRUE(re != NULL); + + if (forward != NULL) { + Prog* prog = re->CompileToProg(0); + EXPECT_TRUE(prog != NULL); + *forward = prog->Dump(); + delete prog; + } + + if (reverse != NULL) { + Prog* prog = re->CompileToReverseProg(0); + EXPECT_TRUE(prog != NULL); + *reverse = prog->Dump(); + delete prog; + } + + re->Decref(); +} + +TEST(TestCompile, Bug26705922) { + // Bug in the compiler caused inefficient bytecode to be generated for Unicode + // groups: common suffixes were cached, but common prefixes were not factored. + + string forward, reverse; + + Dump("[\\x{10000}\\x{10010}]", Regexp::LikePerl, &forward, &reverse); + EXPECT_EQ("3. byte [f0-f0] -> 4\n" + "4. byte [90-90] -> 5\n" + "5. byte [80-80] -> 6\n" + "6+ byte [80-80] -> 8\n" + "7. byte [90-90] -> 8\n" + "8. match! 0\n", + forward); + EXPECT_EQ("3+ byte [80-80] -> 5\n" + "4. byte [90-90] -> 5\n" + "5. byte [80-80] -> 6\n" + "6. byte [90-90] -> 7\n" + "7. byte [f0-f0] -> 8\n" + "8. match! 0\n", + reverse); + + Dump("[\\x{8000}-\\x{10FFF}]", Regexp::LikePerl, &forward, &reverse); + EXPECT_EQ("3+ byte [e8-ef] -> 5\n" + "4. byte [f0-f0] -> 8\n" + "5. byte [80-bf] -> 6\n" + "6. byte [80-bf] -> 7\n" + "7. match! 0\n" + "8. byte [90-90] -> 5\n", + forward); + EXPECT_EQ("3. byte [80-bf] -> 4\n" + "4. byte [80-bf] -> 5\n" + "5+ byte [e8-ef] -> 7\n" + "6. byte [90-90] -> 8\n" + "7. match! 0\n" + "8. byte [f0-f0] -> 7\n", + reverse); + + Dump("[\\x{80}-\\x{10FFFF}]", Regexp::LikePerl, NULL, &reverse); + EXPECT_EQ("3. byte [80-bf] -> 4\n" + "4+ byte [c2-df] -> 7\n" + "5+ byte [a0-bf] -> 8\n" + "6. byte [80-bf] -> 9\n" + "7. match! 0\n" + "8. byte [e0-e0] -> 7\n" + "9+ byte [e1-ef] -> 7\n" + "10+ byte [90-bf] -> 13\n" + "11+ byte [80-bf] -> 14\n" + "12. byte [80-8f] -> 15\n" + "13. byte [f0-f0] -> 7\n" + "14. byte [f1-f3] -> 7\n" + "15. byte [f4-f4] -> 7\n", + reverse); +} + +TEST(TestCompile, Bug35237384) { + // Bug in the compiler caused inefficient bytecode to be generated for + // nested nullable subexpressions. + + string forward; + + Dump("a**{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL); + EXPECT_EQ("3+ byte [61-61] -> 3\n" + "4. nop -> 5\n" + "5+ byte [61-61] -> 5\n" + "6. nop -> 7\n" + "7+ byte [61-61] -> 7\n" + "8. match! 0\n", + forward); + + Dump("(a*|b*)*{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL); + EXPECT_EQ("3+ nop -> 6\n" + "4+ nop -> 8\n" + "5. nop -> 21\n" + "6+ byte [61-61] -> 6\n" + "7. nop -> 3\n" + "8+ byte [62-62] -> 8\n" + "9. nop -> 3\n" + "10+ byte [61-61] -> 10\n" + "11. nop -> 21\n" + "12+ byte [62-62] -> 12\n" + "13. nop -> 21\n" + "14+ byte [61-61] -> 14\n" + "15. nop -> 18\n" + "16+ byte [62-62] -> 16\n" + "17. nop -> 18\n" + "18+ nop -> 14\n" + "19+ nop -> 16\n" + "20. match! 0\n" + "21+ nop -> 10\n" + "22+ nop -> 12\n" + "23. nop -> 18\n", + forward); + + Dump("((|S.+)+|(|S.+)+|){2}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL); + EXPECT_EQ("3+ nop -> 36\n" + "4+ nop -> 31\n" + "5. nop -> 33\n" + "6+ byte [00-09] -> 8\n" + "7. byte [0b-ff] -> 8\n" + "8+ nop -> 6\n" + "9+ nop -> 29\n" + "10. nop -> 28\n" + "11+ byte [00-09] -> 13\n" + "12. byte [0b-ff] -> 13\n" + "13+ nop -> 11\n" + "14+ nop -> 26\n" + "15. nop -> 28\n" + "16+ byte [00-09] -> 18\n" + "17. byte [0b-ff] -> 18\n" + "18+ nop -> 16\n" + "19+ nop -> 36\n" + "20. nop -> 33\n" + "21+ byte [00-09] -> 23\n" + "22. byte [0b-ff] -> 23\n" + "23+ nop -> 21\n" + "24+ nop -> 31\n" + "25. nop -> 33\n" + "26+ nop -> 28\n" + "27. byte [53-53] -> 11\n" + "28. match! 0\n" + "29+ nop -> 28\n" + "30. byte [53-53] -> 6\n" + "31+ nop -> 33\n" + "32. byte [53-53] -> 21\n" + "33+ nop -> 29\n" + "34+ nop -> 26\n" + "35. nop -> 28\n" + "36+ nop -> 33\n" + "37. byte [53-53] -> 16\n", + forward); +} + } // namespace re2 |