aboutsummaryrefslogtreecommitdiff
path: root/re2/testing/exhaustive3_test.cc
blob: cf09e182e0fcbe846c4c0ccc03f02c06235652fd (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
// Copyright 2008 The RE2 Authors.  All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Exhaustive testing of regular expression matching.

#include <stddef.h>
#include <memory>
#include <string>
#include <vector>

#include "util/test.h"
#include "util/utf.h"
#include "re2/testing/exhaustive_tester.h"

namespace re2 {

// Exhaustive run over standalone character-class atoms (together with the
// literals a and b and the wildcard .), with no surrounding wrapper.
TEST(CharacterClasses, Exhaustive) {
  // Space-separated atom list; Split() breaks it apart on " ".
  static const char kAtomList[] =
    "[a] [b] [ab] [^bc] [b-d] [^b-d] []a] [-a] [a-] [^-a] [a-b-c] a b .";
  std::vector<string> class_atoms = Split(" ", kAtomList);

  ExhaustiveTest(2, 1, class_atoms, RegexpGenerator::EgrepOps(),
                 5, Explode("ab"), "", "");
}

// Same atoms as the unwrapped character-class test, but each generated
// regexp is placed inside the "a%sb" wrapper (for example, a[a]b).
TEST(CharacterClasses, ExhaustiveAB) {
  // Space-separated atom list; Split() breaks it apart on " ".
  static const char kAtomList[] =
    "[a] [b] [ab] [^bc] [b-d] [^b-d] []a] [-a] [a-] [^-a] [a-b-c] a b .";
  std::vector<string> class_atoms = Split(" ", kAtomList);

  ExhaustiveTest(2, 1, class_atoms, RegexpGenerator::EgrepOps(),
                 5, Explode("ab"), "a%sb", "");
}

// Returns the UTF-8 encoding of Rune r as a string.
//
// The result is built from the byte count reported by runetochar rather
// than by scanning for a terminating NUL, so the encoding of rune 0 (a
// single 0x00 byte) is returned as a one-byte string instead of being
// truncated to the empty string.  For every other rune the output is the
// same as a NUL-terminated copy, since UTF-8 never embeds a zero byte in
// the encoding of a non-zero code point.
static string UTF8(Rune r) {
  char buf[UTFmax];  // UTFmax is the longest encoding of a single rune.
  const int len = runetochar(buf, &r);
  return string(buf, static_cast<size_t>(len));
}

// Returns a vector of "interesting" UTF8 characters.
// Unicode is now too big to just return all of them,
// so UTF8Characters return a set likely to be good test cases.
static const std::vector<string>& InterestingUTF8() {
  static bool init;
  static std::vector<string> v;

  if (init)
    return v;

  init = true;
  // All the Latin1 equivalents are interesting.
  for (int i = 1; i < 256; i++)
    v.push_back(UTF8(i));

  // After that, the codes near bit boundaries are
  // interesting, because they span byte sequence lengths.
  for (int j = 0; j < 8; j++)
    v.push_back(UTF8(256 + j));
  for (int i = 512; i < Runemax; i <<= 1)
    for (int j = -8; j < 8; j++)
      v.push_back(UTF8(i + j));

  // The codes near Runemax, including Runemax itself, are interesting.
  for (int j = -8; j <= 0; j++)
    v.push_back(UTF8(Runemax + j));

  return v;
}

// Runs each single atom (anchors, escapes, Perl and POSIX classes) on its
// own, with an empty operator list, using the InterestingUTF8() strings
// as the string alphabet.
TEST(InterestingUTF8, SingleOps) {
  // Space-separated atom list; Split() breaks it apart on " ".
  static const char kAtomList[] =
    ". ^ $ \\a \\f \\n \\r \\t \\v \\d \\D \\s \\S \\w \\W \\b \\B "
    "[[:alnum:]] [[:alpha:]] [[:blank:]] [[:cntrl:]] [[:digit:]] "
    "[[:graph:]] [[:lower:]] [[:print:]] [[:punct:]] [[:space:]] "
    "[[:upper:]] [[:xdigit:]] [\\s\\S] [\\d\\D] [^\\w\\W] [^\\d\\D]";
  std::vector<string> class_atoms = Split(" ", kAtomList);
  std::vector<string> no_ops;  // intentionally left empty

  ExhaustiveTest(1, 0, class_atoms, no_ops,
                 1, InterestingUTF8(), "", "");
}

// Same atoms as the single-atom UTF-8 test, but with everything wrapped
// in a...b: each interesting character is surrounded by "a" and "b", and
// the regexps are generated with the "a%sb" wrapper.
TEST(InterestingUTF8, AB) {
  // Space-separated atom list; Split() breaks it apart on " ".
  static const char kAtomList[] =
    ". ^ $ \\a \\f \\n \\r \\t \\v \\d \\D \\s \\S \\w \\W \\b \\B "
    "[[:alnum:]] [[:alpha:]] [[:blank:]] [[:cntrl:]] [[:digit:]] "
    "[[:graph:]] [[:lower:]] [[:print:]] [[:punct:]] [[:space:]] "
    "[[:upper:]] [[:xdigit:]] [\\s\\S] [\\d\\D] [^\\w\\W] [^\\d\\D]";
  std::vector<string> class_atoms = Split(" ", kAtomList);
  std::vector<string> no_ops;  // intentionally left empty

  // Work on a private copy so the cached list itself is left untouched.
  std::vector<string> wrapped = InterestingUTF8();
  for (std::vector<string>::iterator it = wrapped.begin();
       it != wrapped.end(); ++it) {
    *it = "a" + *it + "b";
  }

  ExhaustiveTest(1, 0, class_atoms, no_ops,
                 1, wrapped, "a%sb", "");
}

}  // namespace re2