aboutsummaryrefslogtreecommitdiff
path: root/re2
diff options
context:
space:
mode:
authorRuss Cox <rsc@swtch.com>2011-06-21 15:17:24 -0400
committerRuss Cox <rsc@swtch.com>2011-06-21 15:17:24 -0400
commit295ae016889386fec74048b186b95ac61b7f5802 (patch)
treeb088d52e4a342af46db6cbd673d228a7c5d4e017 /re2
parente6ca1cb28c789fc47f501cffe309a04c9da07cfe (diff)
downloadregex-re2-295ae016889386fec74048b186b95ac61b7f5802.tar.gz
export changes made at Google
R=rsc CC=re2-dev http://codereview.appspot.com/4625055
Diffstat (limited to 're2')
-rwxr-xr-xre2/make_unicode_casefold.py63
-rw-r--r--re2/parse.cc27
-rw-r--r--re2/prefilter.cc30
-rw-r--r--re2/re2.cc98
-rw-r--r--re2/re2.h15
-rw-r--r--re2/testing/filtered_re2_test.cc14
-rw-r--r--re2/testing/re2_test.cc142
-rwxr-xr-xre2/unicode.py2
-rw-r--r--re2/unicode_casefold.cc169
-rw-r--r--re2/unicode_casefold.h12
10 files changed, 455 insertions, 117 deletions
diff --git a/re2/make_unicode_casefold.py b/re2/make_unicode_casefold.py
index 2e50845..3375d2e 100755
--- a/re2/make_unicode_casefold.py
+++ b/re2/make_unicode_casefold.py
@@ -9,7 +9,7 @@
"""Generate C++ table for Unicode case folding."""
-import unicode
+import unicode, sys
_header = """
// GENERATED BY make_unicode_casefold.py; DO NOT EDIT.
@@ -56,6 +56,7 @@ def _AddDelta(a, delta):
return a+1
else:
return a-1
+ print >>sys.stderr, "Bad Delta: ", delta
raise "Bad Delta"
def _MakeRanges(pairs):
@@ -63,15 +64,40 @@ def _MakeRanges(pairs):
into [(65, 90, +32)]."""
ranges = []
last = -100
+
+ def evenodd(last, a, b, r):
+ if a != last+1 or b != _AddDelta(a, r[2]):
+ return False
+ r[1] = a
+ return True
+
+ def evenoddpair(last, a, b, r):
+ if a != last+2:
+ return False
+ delta = r[2]
+ d = delta
+ if type(delta) is not str:
+ return False
+ if delta.endswith('Skip'):
+ d = delta[:-4]
+ else:
+ delta = d + 'Skip'
+ if b != _AddDelta(a, d):
+ return False
+ r[1] = a
+ r[2] = delta
+ return True
+
for a, b in pairs:
- if a == last+1 and b == _AddDelta(a, ranges[-1][2]):
- ranges[-1][1] = a
+ if ranges and evenodd(last, a, b, ranges[-1]):
+ pass
+ elif ranges and evenoddpair(last, a, b, ranges[-1]):
+ pass
else:
ranges.append([a, a, _Delta(a, b)])
last = a
return ranges
-
# The maximum size of a case-folding group.
# Case folding is implemented in parse.cc by a recursive process
# with a recursion depth equal to the size of the largest
@@ -82,7 +108,7 @@ def _MakeRanges(pairs):
MaxCasefoldGroup = 4
def main():
- casegroups = unicode.CaseGroups()
+ lowergroups, casegroups = unicode.CaseGroups()
foldpairs = []
seen = {}
for c in casegroups:
@@ -93,16 +119,27 @@ def main():
raise unicode.Error("bad casegroups %d -> %d" % (c[i-1], c[i]))
seen[c[i-1]] = True
foldpairs.append([c[i-1], c[i]])
- foldpairs.sort()
- foldranges = _MakeRanges(foldpairs)
+
+ lowerpairs = []
+ for lower, group in lowergroups.iteritems():
+ for g in group:
+ if g != lower:
+ lowerpairs.append([g, lower])
+
+ def printpairs(name, foldpairs):
+ foldpairs.sort()
+ foldranges = _MakeRanges(foldpairs)
+ print "// %d groups, %d pairs, %d ranges" % (len(casegroups), len(foldpairs), len(foldranges))
+ print "CaseFold unicode_%s[] = {" % (name,)
+ for lo, hi, delta in foldranges:
+ print "\t{ %d, %d, %s }," % (lo, hi, delta)
+ print "};"
+ print "int num_unicode_%s = %d;" % (name, len(foldranges),)
+ print ""
print _header
- print "// %d groups, %d pairs, %d ranges" % (len(casegroups), len(foldpairs), len(foldranges))
- print "CaseFold unicode_casefold[] = {"
- for lo, hi, delta in foldranges:
- print "\t{ %d, %d, %s }," % (lo, hi, delta)
- print "};"
- print "int num_unicode_casefold = %d;" % (len(foldranges),)
+ printpairs("casefold", foldpairs)
+ printpairs("tolower", lowerpairs)
print _trailer
if __name__ == '__main__':
diff --git a/re2/parse.cc b/re2/parse.cc
index 605a320..9ae96eb 100644
--- a/re2/parse.cc
+++ b/re2/parse.cc
@@ -240,12 +240,8 @@ bool Regexp::ParseState::PushRegexp(Regexp* re) {
// Searches the case folding tables and returns the CaseFold* that contains r.
// If there isn't one, returns the CaseFold* with smallest f->lo bigger than r.
// If there isn't one, returns NULL.
-CaseFold* LookupCaseFold(Rune r) {
- CaseFold *f;
- int n, m;
-
- f = unicode_casefold;
- n = num_unicode_casefold;
+CaseFold* LookupCaseFold(CaseFold *f, int n, Rune r) {
+ int m;
// Binary search for entry containing r.
while (n > 0) {
@@ -272,16 +268,24 @@ CaseFold* LookupCaseFold(Rune r) {
}
// Returns the result of applying the fold f to the rune r.
-static Rune ApplyFold(CaseFold *f, Rune r) {
+Rune ApplyFold(CaseFold *f, Rune r) {
switch (f->delta) {
default:
return r + f->delta;
+ case EvenOddSkip: // even <-> odd but only applies to every other
+ if ((r - f->lo) % 2)
+ return r;
+ // fall through
case EvenOdd: // even <-> odd
if (r%2 == 0)
return r + 1;
return r - 1;
+ case OddEvenSkip: // odd <-> even but only applies to every other
+ if ((r - f->lo) % 2)
+ return r;
+ // fall through
case OddEven: // odd <-> even
if (r%2 == 1)
return r + 1;
@@ -300,7 +304,7 @@ static Rune ApplyFold(CaseFold *f, Rune r) {
//
// CycleFoldRune('?') = '?'
Rune CycleFoldRune(Rune r) {
- CaseFold* f = LookupCaseFold(r);
+ CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, r);
if (f == NULL || r < f->lo)
return r;
return ApplyFold(f, r);
@@ -323,7 +327,7 @@ static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) {
return;
while (lo <= hi) {
- CaseFold* f = LookupCaseFold(lo);
+ CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, lo);
if (f == NULL) // lo has no fold, nor does anything above lo
break;
if (lo < f->lo) { // lo has no fold; next rune with a fold is f->lo
@@ -1275,8 +1279,7 @@ static bool ParseEscape(StringPiece* s, Rune* rp,
case '0':
// consume up to three octal digits; already have one.
code = c - '0';
- c = (*s)[0];
- if (s->size() > 0 && '0' <= c && c <= '7') {
+ if (s->size() > 0 && '0' <= (c = (*s)[0]) && c <= '7') {
code = code * 8 + c - '0';
s->remove_prefix(1); // digit
if (s->size() > 0) {
@@ -1360,7 +1363,7 @@ static bool ParseEscape(StringPiece* s, Rune* rp,
// in Perl, \b means word-boundary but [\b]
// means backspace. We don't support that:
// if you want a backspace embed a literal
- // backspace character or use \008.
+ // backspace character or use \x08.
//
// case 'b':
// *rp = '\b';
diff --git a/re2/prefilter.cc b/re2/prefilter.cc
index 93aea42..1367315 100644
--- a/re2/prefilter.cc
+++ b/re2/prefilter.cc
@@ -5,6 +5,7 @@
#include "util/util.h"
#include "re2/prefilter.h"
#include "re2/re2.h"
+#include "re2/unicode_casefold.h"
#include "re2/walker-inl.h"
namespace re2 {
@@ -167,17 +168,22 @@ Prefilter* Prefilter::OrStrings(set<string>* ss) {
return or_prefilter;
}
+static Rune ToLowerRune(Rune r) {
+ if (r < Runeself) {
+ if ('A' <= r && r <= 'Z')
+ r += 'a' - 'A';
+ return r;
+ }
+
+ CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
+ if (f == NULL)
+ return r;
+ return ApplyFold(f, r);
+}
+
Prefilter* Prefilter::FromString(const string& str) {
Prefilter* m = new Prefilter(Prefilter::ATOM);
- string s = str;
- for (int i = 0; i < s.size(); i++) {
- int c = s[i];
- if ('A' <= c && c <= 'Z') {
- c += 'a' - 'A';
- s[i] = c;
- }
- }
- m->atom_ = s;
+ m->atom_ = str;
return m;
}
@@ -270,7 +276,7 @@ string Prefilter::Info::ToString() {
}
return s;
}
-
+
if (match_)
return match_->DebugString();
@@ -387,7 +393,7 @@ static string RuneToString(Rune r) {
// Constructs Info for literal rune.
Prefilter::Info* Prefilter::Info::Literal(Rune r) {
Info* info = new Info();
- info->exact_.insert(RuneToString(r));
+ info->exact_.insert(RuneToString(ToLowerRune(r)));
info->is_exact_ = true;
return info;
}
@@ -440,7 +446,7 @@ Prefilter::Info* Prefilter::Info::CClass(CharClass *cc) {
Prefilter::Info *a = new Prefilter::Info();
for (CCIter i = cc->begin(); i != cc->end(); ++i)
for (Rune r = i->lo; r <= i->hi; r++)
- a->exact_.insert(RuneToString(r));
+ a->exact_.insert(RuneToString(ToLowerRune(r)));
a->is_exact_ = true;
diff --git a/re2/re2.cc b/re2/re2.cc
index 5b0a11a..e19692f 100644
--- a/re2/re2.cc
+++ b/re2/re2.cc
@@ -509,12 +509,12 @@ bool RE2::Match(const StringPiece& text,
LOG(ERROR) << "Invalid RE2: " << *error_;
return false;
}
-
+
if (startpos < 0 || startpos > endpos || endpos > text.size()) {
LOG(ERROR) << "RE2: invalid startpos, endpos pair.";
return false;
}
-
+
StringPiece subtext = text;
subtext.remove_prefix(startpos);
subtext.remove_suffix(text.size() - endpos);
@@ -941,32 +941,53 @@ bool RE2::Arg::parse_uchar(const char* str, int n, void* dest) {
static const int kMaxNumberLength = 32;
// REQUIRES "buf" must have length at least kMaxNumberLength+1
-// REQUIRES "n > 0"
-// Copies "str" into "buf" and null-terminates if necessary.
-// Returns one of:
-// a. "str" if no termination is needed
-// b. "buf" if the string was copied and null-terminated
-// c. "" if the input was invalid and has no hope of being parsed
-static const char* TerminateNumber(char* buf, const char* str, int n) {
- if ((n > 0) && isspace(*str)) {
+// Copies "str" into "buf" and null-terminates.
+// Overwrites *np with the new length.
+static const char* TerminateNumber(char* buf, const char* str, int* np) {
+ int n = *np;
+ if (n <= 0) return "";
+ if (n > 0 && isspace(*str)) {
// We are less forgiving than the strtoxxx() routines and do not
// allow leading spaces.
return "";
}
- // See if the character right after the input text may potentially
- // look like a digit.
- if (isdigit(str[n]) ||
- ((str[n] >= 'a') && (str[n] <= 'f')) ||
- ((str[n] >= 'A') && (str[n] <= 'F'))) {
- if (n > kMaxNumberLength) return ""; // Input too big to be a valid number
- memcpy(buf, str, n);
- buf[n] = '\0';
- return buf;
- } else {
- // We can parse right out of the supplied string, so return it.
- return str;
+ // Although buf has a fixed maximum size, we can still handle
+ // arbitrarily large integers correctly by omitting leading zeros.
+ // (Numbers that are still too long will be out of range.)
+ // Before deciding whether str is too long,
+ // remove leading zeros with s/000+/00/.
+ // Leaving the leading two zeros in place means that
+ // we don't change 0000x123 (invalid) into 0x123 (valid).
+ // Skip over leading - before replacing.
+ bool neg = false;
+ if (n >= 1 && str[0] == '-') {
+ neg = true;
+ n--;
+ str++;
+ }
+
+ if (n >= 3 && str[0] == '0' && str[1] == '0') {
+ while (n >= 3 && str[2] == '0') {
+ n--;
+ str++;
+ }
+ }
+
+ if (neg) { // make room in buf for -
+ n++;
+ str--;
+ }
+
+ if (n > kMaxNumberLength) return "";
+
+ memmove(buf, str, n);
+ if (neg) {
+ buf[0] = '-';
}
+ buf[n] = '\0';
+ *np = n;
+ return buf;
}
bool RE2::Arg::parse_long_radix(const char* str,
@@ -975,7 +996,7 @@ bool RE2::Arg::parse_long_radix(const char* str,
int radix) {
if (n == 0) return false;
char buf[kMaxNumberLength+1];
- str = TerminateNumber(buf, str, n);
+ str = TerminateNumber(buf, str, &n);
char* end;
errno = 0;
long r = strtol(str, &end, radix);
@@ -992,7 +1013,7 @@ bool RE2::Arg::parse_ulong_radix(const char* str,
int radix) {
if (n == 0) return false;
char buf[kMaxNumberLength+1];
- str = TerminateNumber(buf, str, n);
+ str = TerminateNumber(buf, str, &n);
if (str[0] == '-') {
// strtoul() will silently accept negative numbers and parse
// them. This module is more strict and treats them as errors.
@@ -1063,7 +1084,7 @@ bool RE2::Arg::parse_longlong_radix(const char* str,
int radix) {
if (n == 0) return false;
char buf[kMaxNumberLength+1];
- str = TerminateNumber(buf, str, n);
+ str = TerminateNumber(buf, str, &n);
char* end;
errno = 0;
int64 r = strtoll(str, &end, radix);
@@ -1080,7 +1101,7 @@ bool RE2::Arg::parse_ulonglong_radix(const char* str,
int radix) {
if (n == 0) return false;
char buf[kMaxNumberLength+1];
- str = TerminateNumber(buf, str, n);
+ str = TerminateNumber(buf, str, &n);
if (str[0] == '-') {
// strtoull() will silently accept negative numbers and parse
// them. This module is more strict and treats them as errors.
@@ -1096,7 +1117,7 @@ bool RE2::Arg::parse_ulonglong_radix(const char* str,
return true;
}
-bool RE2::Arg::parse_double(const char* str, int n, void* dest) {
+static bool parse_double_float(const char* str, int n, bool isfloat, void *dest) {
if (n == 0) return false;
static const int kMaxLength = 200;
char buf[kMaxLength];
@@ -1105,20 +1126,29 @@ bool RE2::Arg::parse_double(const char* str, int n, void* dest) {
buf[n] = '\0';
errno = 0;
char* end;
- double r = strtod(buf, &end);
+ double r;
+ if (isfloat) {
+ r = strtof(buf, &end);
+ } else {
+ r = strtod(buf, &end);
+ }
if (end != buf + n) return false; // Leftover junk
if (errno) return false;
if (dest == NULL) return true;
- *(reinterpret_cast<double*>(dest)) = r;
+ if (isfloat) {
+ *(reinterpret_cast<float*>(dest)) = r;
+ } else {
+ *(reinterpret_cast<double*>(dest)) = r;
+ }
return true;
}
+bool RE2::Arg::parse_double(const char* str, int n, void* dest) {
+ return parse_double_float(str, n, false, dest);
+}
+
bool RE2::Arg::parse_float(const char* str, int n, void* dest) {
- double r;
- if (!parse_double(str, n, &r)) return false;
- if (dest == NULL) return true;
- *(reinterpret_cast<float*>(dest)) = static_cast<float>(r);
- return true;
+ return parse_double_float(str, n, true, dest);
}
diff --git a/re2/re2.h b/re2/re2.h
index b1f71e5..9dbc99c 100644
--- a/re2/re2.h
+++ b/re2/re2.h
@@ -17,6 +17,9 @@
// some of the more complicated things thrown away. In particular,
// backreferences and generalized assertions are not available, nor is \Z.
//
+// See http://code.google.com/p/re2/wiki/Syntax for the syntax
+// supported by RE2, and a comparison with PCRE and PERL regexps.
+//
// For those not familiar with Perl's regular expressions,
// here are some examples of the most commonly used extensions:
//
@@ -478,12 +481,13 @@ class RE2 {
// never_nl (false) never match \n, even if it is in regexp
// case_sensitive (true) match is case-sensitive (regexp can override
// with (?i) unless in posix_syntax mode)
- // perl_classes (false) allow Perl's \d \s \w \D \S \W when in
- // posix_syntax mode
- // word_boundary (false) allow \b \B (word boundary and not) when in
- // posix_syntax mode
+ //
+ // The following options are only consulted when posix_syntax == true.
+ // (When posix_syntax == false these features are always enabled and
+ // cannot be turned off.)
+ // perl_classes (false) allow Perl's \d \s \w \D \S \W
+ // word_boundary (false) allow Perl's \b \B (word boundary and not)
// one_line (false) ^ and $ only match beginning and end of text
- // when in posix_syntax mode
//
// The max_mem option controls how much memory can be used
// to hold the compiled form of the regexp (the Prog) and
@@ -738,6 +742,7 @@ class RE2::Arg {
MAKE_PARSER(char, parse_char);
+ MAKE_PARSER(signed char, parse_char);
MAKE_PARSER(unsigned char, parse_uchar);
MAKE_PARSER(short, parse_short);
MAKE_PARSER(unsigned short, parse_ushort);
diff --git a/re2/testing/filtered_re2_test.cc b/re2/testing/filtered_re2_test.cc
index 2eee57d..7755d30 100644
--- a/re2/testing/filtered_re2_test.cc
+++ b/re2/testing/filtered_re2_test.cc
@@ -99,13 +99,23 @@ AtomTest atom_tests[] = {
"xbcdea", "xbcdeb",
"ybcdea", "ybcdeb"
}
- }
+ }, {
+ // Test upper/lower of non-ASCII.
+ "UnicodeLower", {
+ "(?i)ΔδΠϖπΣςσ",
+ "ΛΜΝΟΠ",
+ "ψρστυ",
+ }, {
+ "δδπππσσσ",
+ "λμνοπ",
+ "ψρστυ",
+ },
+ },
};
void AddRegexpsAndCompile(const char* regexps[],
int n,
struct FilterTestVars* v) {
- v->opts.set_utf8(false);
for (int i = 0; i < n; i++) {
int id;
v->f.Add(regexps[i], v->opts, &id);
diff --git a/re2/testing/re2_test.cc b/re2/testing/re2_test.cc
index b050680..d9a2567 100644
--- a/re2/testing/re2_test.cc
+++ b/re2/testing/re2_test.cc
@@ -6,10 +6,13 @@
// TODO: Test extractions for PartialMatch/Consume
#include <sys/types.h>
+#include <sys/mman.h>
#include <sys/stat.h>
+#include <errno.h>
#include <vector>
#include "util/test.h"
#include "re2/re2.h"
+#include "re2/regexp.h"
DECLARE_bool(logtostderr);
@@ -657,8 +660,30 @@ TEST(RE2, FullMatchTypedNullArg) {
CHECK(!RE2::FullMatch("hello", "(.*)", (float*)NULL));
}
+// Check that numeric parsing code does not read past the end of
+// the number being parsed.
+TEST(RE2, NULTerminated) {
+ char *v;
+ int x;
+
+#ifndef MAP_ANONYMOUS
+#define MAP_ANONYMOUS MAP_ANON
+#endif
+ v = static_cast<char*>(mmap(NULL, 16*1024, PROT_READ|PROT_WRITE,
+ MAP_ANONYMOUS|MAP_PRIVATE, -1, 0));
+ CHECK(v != reinterpret_cast<char*>(-1));
+ LOG(INFO) << "Memory at " << (void*)v;
+ CHECK_EQ(munmap(v + 8*1024, 8*1024), 0) << " error " << errno;
+ v[8*1024 - 1] = '1';
+
+ x = 0;
+ CHECK(RE2::FullMatch(StringPiece(v + 8*1024 - 1, 1), "(.*)", &x));
+ CHECK_EQ(x, 1);
+}
+
TEST(RE2, FullMatchTypeTests) {
// Type tests
+ string zeros(100, '0');
{
char c;
CHECK(RE2::FullMatch("Hello", "(H)ello", &c));
@@ -687,74 +712,120 @@ TEST(RE2, FullMatchTypeTests) {
}
{
int32 v;
- static const int32 max_value = 0x7fffffff;
- static const int32 min_value = -max_value - 1;
- CHECK(RE2::FullMatch("100", "(-?\\d+)",&v)); CHECK_EQ(v, 100);
- CHECK(RE2::FullMatch("-100", "(-?\\d+)",&v)); CHECK_EQ(v, -100);
- CHECK(RE2::FullMatch("2147483647", "(-?\\d+)",&v)); CHECK_EQ(v, max_value);
- CHECK(RE2::FullMatch("-2147483648", "(-?\\d+)",&v)); CHECK_EQ(v, min_value);
- CHECK(!RE2::FullMatch("-2147483649", "(-?\\d+)",&v));
- CHECK(!RE2::FullMatch("2147483648", "(-?\\d+)",&v));
+ static const int32 max = 0x7fffffff;
+ static const int32 min = -max - 1;
+ CHECK(RE2::FullMatch("100", "(-?\\d+)", &v)); CHECK_EQ(v, 100);
+ CHECK(RE2::FullMatch("-100", "(-?\\d+)", &v)); CHECK_EQ(v, -100);
+ CHECK(RE2::FullMatch("2147483647", "(-?\\d+)", &v)); CHECK_EQ(v, max);
+ CHECK(RE2::FullMatch("-2147483648", "(-?\\d+)", &v)); CHECK_EQ(v, min);
+ CHECK(!RE2::FullMatch("-2147483649", "(-?\\d+)", &v));
+ CHECK(!RE2::FullMatch("2147483648", "(-?\\d+)", &v));
+
+ CHECK(RE2::FullMatch(zeros + "2147483647", "(-?\\d+)", &v));
+ CHECK_EQ(v, max);
+ CHECK(RE2::FullMatch("-" + zeros + "2147483648", "(-?\\d+)", &v));
+ CHECK_EQ(v, min);
+
+ CHECK(!RE2::FullMatch("-" + zeros + "2147483649", "(-?\\d+)", &v));
+ CHECK(RE2::FullMatch("0x7fffffff", "(.*)", RE2::CRadix(&v)));
+ CHECK_EQ(v, max);
+ CHECK(!RE2::FullMatch("000x7fffffff", "(.*)", RE2::CRadix(&v)));
}
{
uint32 v;
- static const uint32 max_value = 0xfffffffful;
+ static const uint32 max = 0xfffffffful;
CHECK(RE2::FullMatch("100", "(\\d+)", &v)); CHECK_EQ(v, 100);
- CHECK(RE2::FullMatch("4294967295", "(\\d+)", &v)); CHECK_EQ(v, max_value);
+ CHECK(RE2::FullMatch("4294967295", "(\\d+)", &v)); CHECK_EQ(v, max);
CHECK(!RE2::FullMatch("4294967296", "(\\d+)", &v));
+ CHECK(!RE2::FullMatch("-1", "(\\d+)", &v));
+
+ CHECK(RE2::FullMatch(zeros + "4294967295", "(\\d+)", &v)); CHECK_EQ(v, max);
}
{
int64 v;
- static const int64 max_value = 0x7fffffffffffffffull;
- static const int64 min_value = -max_value - 1;
+ static const int64 max = 0x7fffffffffffffffull;
+ static const int64 min = -max - 1;
char buf[32];
- CHECK(RE2::FullMatch("100", "(-?\\d+)",&v)); CHECK_EQ(v, 100);
- CHECK(RE2::FullMatch("-100", "(-?\\d+)",&v)); CHECK_EQ(v, -100);
+ CHECK(RE2::FullMatch("100", "(-?\\d+)", &v)); CHECK_EQ(v, 100);
+ CHECK(RE2::FullMatch("-100", "(-?\\d+)", &v)); CHECK_EQ(v, -100);
- snprintf(buf, sizeof(buf), "%lld", (long long)max_value);
- CHECK(RE2::FullMatch(buf, "(-?\\d+)",&v)); CHECK_EQ(v, max_value);
+ snprintf(buf, sizeof(buf), "%lld", max);
+ CHECK(RE2::FullMatch(buf, "(-?\\d+)", &v)); CHECK_EQ(v, max);
- snprintf(buf, sizeof(buf), "%lld", (long long)min_value);
- CHECK(RE2::FullMatch(buf, "(-?\\d+)",&v)); CHECK_EQ(v, min_value);
+ snprintf(buf, sizeof(buf), "%lld", min);
+ CHECK(RE2::FullMatch(buf, "(-?\\d+)", &v)); CHECK_EQ(v, min);
- snprintf(buf, sizeof(buf), "%lld", (long long)max_value);
+ snprintf(buf, sizeof(buf), "%lld", max);
assert(buf[strlen(buf)-1] != '9');
buf[strlen(buf)-1]++;
- CHECK(!RE2::FullMatch(buf, "(-?\\d+)", &v));
+ CHECK(!RE2::FullMatch(buf, "(-?\\d+)", &v));
- snprintf(buf, sizeof(buf), "%lld", (long long)min_value);
+ snprintf(buf, sizeof(buf), "%lld", min);
assert(buf[strlen(buf)-1] != '9');
buf[strlen(buf)-1]++;
- CHECK(!RE2::FullMatch(buf, "(-?\\d+)", &v));
+ CHECK(!RE2::FullMatch(buf, "(-?\\d+)", &v));
}
{
uint64 v;
int64 v2;
- static const uint64 max_value = 0xffffffffffffffffull;
+ static const uint64 max = 0xffffffffffffffffull;
char buf[32];
- CHECK(RE2::FullMatch("100", "(-?\\d+)",&v)); CHECK_EQ(v, 100);
- CHECK(RE2::FullMatch("-100", "(-?\\d+)",&v2)); CHECK_EQ(v2, -100);
+ CHECK(RE2::FullMatch("100", "(-?\\d+)", &v)); CHECK_EQ(v, 100);
+ CHECK(RE2::FullMatch("-100", "(-?\\d+)", &v2)); CHECK_EQ(v2, -100);
- snprintf(buf, sizeof(buf), "%llu", (unsigned long long)max_value);
- CHECK(RE2::FullMatch(buf, "(-?\\d+)",&v)); CHECK_EQ(v, max_value);
+ snprintf(buf, sizeof(buf), "%llu", max);
+ CHECK(RE2::FullMatch(buf, "(-?\\d+)", &v)); CHECK_EQ(v, max);
assert(buf[strlen(buf)-1] != '9');
buf[strlen(buf)-1]++;
- CHECK(!RE2::FullMatch(buf, "(-?\\d+)", &v));
+ CHECK(!RE2::FullMatch(buf, "(-?\\d+)", &v));
}
+}
+
+TEST(RE2, FloatingPointFullMatchTypes) {
+ string zeros(100, '0');
{
float v;
- CHECK(RE2::FullMatch("100", "(.*)", &v));
- CHECK(RE2::FullMatch("-100.", "(.*)", &v));
- CHECK(RE2::FullMatch("1e23", "(.*)", &v));
+ CHECK(RE2::FullMatch("100", "(.*)", &v)); CHECK_EQ(v, 100);
+ CHECK(RE2::FullMatch("-100.", "(.*)", &v)); CHECK_EQ(v, -100);
+ CHECK(RE2::FullMatch("1e23", "(.*)", &v)); CHECK_EQ(v, float(1e23));
+
+ CHECK(RE2::FullMatch(zeros + "1e23", "(.*)", &v));
+ CHECK_EQ(v, float(1e23));
+
+ // 6700000000081920.1 is an edge case.
+ // 6700000000081920 is exactly halfway between
+ // two float32s, so the .1 should make it round up.
+ // However, the .1 is outside the precision possible with
+ // a float64: the nearest float64 is 6700000000081920.
+ // So if the code uses strtod and then converts to float32,
+ // round-to-even will make it round down instead of up.
+ // To pass the test, the parser must call strtof directly.
+ // This test case is carefully chosen to use only a 17-digit
+ // number, since C does not guarantee to get the correctly
+ // rounded answer for strtod and strtof unless the input is
+ // short.
+ CHECK(RE2::FullMatch("0.1", "(.*)", &v));
+ CHECK_EQ(v, 0.1f) << StringPrintf("%.8g != %.8g", v, 0.1f);
+ CHECK(RE2::FullMatch("6700000000081920.1", "(.*)", &v));
+ CHECK_EQ(v, 6700000000081920.1f)
+ << StringPrintf("%.8g != %.8g", v, 6700000000081920.1f);
}
{
double v;
- CHECK(RE2::FullMatch("100", "(.*)", &v));
- CHECK(RE2::FullMatch("-100.", "(.*)", &v));
- CHECK(RE2::FullMatch("1e23", "(.*)", &v));
+ CHECK(RE2::FullMatch("100", "(.*)", &v)); CHECK_EQ(v, 100);
+ CHECK(RE2::FullMatch("-100.", "(.*)", &v)); CHECK_EQ(v, -100);
+ CHECK(RE2::FullMatch("1e23", "(.*)", &v)); CHECK_EQ(v, 1e23);
+ CHECK(RE2::FullMatch(zeros + "1e23", "(.*)", &v));
+ CHECK_EQ(v, double(1e23));
+
+ CHECK(RE2::FullMatch("0.1", "(.*)", &v));
+ CHECK_EQ(v, 0.1) << StringPrintf("%.17g != %.17g", v, 0.1);
+ CHECK(RE2::FullMatch("1.00000005960464485", "(.*)", &v));
+ CHECK_EQ(v, 1.0000000596046448)
+ << StringPrintf("%.17g != %.17g", v, 1.0000000596046448);
}
}
@@ -1184,8 +1255,7 @@ TEST(RE2, BitstateCaptureBug) {
opt.set_max_mem(20000);
RE2 re("(_________$)", opt);
StringPiece s = "xxxxxxxxxxxxxxxxxxxxxxxxxx_________x";
- EXPECT_FALSE(re.Match(
- s, 0, s.size(), RE2::UNANCHORED, NULL, 0));
+ EXPECT_FALSE(re.Match(s, 0, s.size(), RE2::UNANCHORED, NULL, 0));
}
// C++ version of bug 609710.
diff --git a/re2/unicode.py b/re2/unicode.py
index a7fb778..8d78312 100755
--- a/re2/unicode.py
+++ b/re2/unicode.py
@@ -247,7 +247,7 @@ def CaseGroups(unicode_dir=_UNICODE_DIR):
for g in groups:
g.sort()
groups.sort()
- return groups
+ return togroup, groups
def Scripts(unicode_dir=_UNICODE_DIR):
diff --git a/re2/unicode_casefold.cc b/re2/unicode_casefold.cc
index cb35f7a..6d4e878 100644
--- a/re2/unicode_casefold.cc
+++ b/re2/unicode_casefold.cc
@@ -294,6 +294,175 @@ CaseFold unicode_casefold[] = {
};
int num_unicode_casefold = 282;
+// 1029 groups, 1050 pairs, 163 ranges
+CaseFold unicode_tolower[] = {
+ { 65, 90, 32 },
+ { 181, 181, 775 },
+ { 192, 214, 32 },
+ { 216, 222, 32 },
+ { 256, 302, EvenOddSkip },
+ { 306, 310, EvenOddSkip },
+ { 313, 327, OddEvenSkip },
+ { 330, 374, EvenOddSkip },
+ { 376, 376, -121 },
+ { 377, 381, OddEvenSkip },
+ { 383, 383, -268 },
+ { 385, 385, 210 },
+ { 386, 388, EvenOddSkip },
+ { 390, 390, 206 },
+ { 391, 391, OddEven },
+ { 393, 394, 205 },
+ { 395, 395, OddEven },
+ { 398, 398, 79 },
+ { 399, 399, 202 },
+ { 400, 400, 203 },
+ { 401, 401, OddEven },
+ { 403, 403, 205 },
+ { 404, 404, 207 },
+ { 406, 406, 211 },
+ { 407, 407, 209 },
+ { 408, 408, EvenOdd },
+ { 412, 412, 211 },
+ { 413, 413, 213 },
+ { 415, 415, 214 },
+ { 416, 420, EvenOddSkip },
+ { 422, 422, 218 },
+ { 423, 423, OddEven },
+ { 425, 425, 218 },
+ { 428, 428, EvenOdd },
+ { 430, 430, 218 },
+ { 431, 431, OddEven },
+ { 433, 434, 217 },
+ { 435, 437, OddEvenSkip },
+ { 439, 439, 219 },
+ { 440, 440, EvenOdd },
+ { 444, 444, EvenOdd },
+ { 452, 452, 2 },
+ { 453, 453, OddEven },
+ { 455, 455, 2 },
+ { 456, 456, EvenOdd },
+ { 458, 458, 2 },
+ { 459, 475, OddEvenSkip },
+ { 478, 494, EvenOddSkip },
+ { 497, 497, 2 },
+ { 498, 500, EvenOddSkip },
+ { 502, 502, -97 },
+ { 503, 503, -56 },
+ { 504, 542, EvenOddSkip },
+ { 544, 544, -130 },
+ { 546, 562, EvenOddSkip },
+ { 570, 570, 10795 },
+ { 571, 571, OddEven },
+ { 573, 573, -163 },
+ { 574, 574, 10792 },
+ { 577, 577, OddEven },
+ { 579, 579, -195 },
+ { 580, 580, 69 },
+ { 581, 581, 71 },
+ { 582, 590, EvenOddSkip },
+ { 837, 837, 116 },
+ { 880, 882, EvenOddSkip },
+ { 886, 886, EvenOdd },
+ { 902, 902, 38 },
+ { 904, 906, 37 },
+ { 908, 908, 64 },
+ { 910, 911, 63 },
+ { 913, 929, 32 },
+ { 931, 939, 32 },
+ { 962, 962, EvenOdd },
+ { 975, 975, 8 },
+ { 976, 976, -30 },
+ { 977, 977, -25 },
+ { 981, 981, -15 },
+ { 982, 982, -22 },
+ { 984, 1006, EvenOddSkip },
+ { 1008, 1008, -54 },
+ { 1009, 1009, -48 },
+ { 1012, 1012, -60 },
+ { 1013, 1013, -64 },
+ { 1015, 1015, OddEven },
+ { 1017, 1017, -7 },
+ { 1018, 1018, EvenOdd },
+ { 1021, 1023, -130 },
+ { 1024, 1039, 80 },
+ { 1040, 1071, 32 },
+ { 1120, 1152, EvenOddSkip },
+ { 1162, 1214, EvenOddSkip },
+ { 1216, 1216, 15 },
+ { 1217, 1229, OddEvenSkip },
+ { 1232, 1318, EvenOddSkip },
+ { 1329, 1366, 48 },
+ { 4256, 4293, 7264 },
+ { 7680, 7828, EvenOddSkip },
+ { 7835, 7835, -58 },
+ { 7838, 7838, -7615 },
+ { 7840, 7934, EvenOddSkip },
+ { 7944, 7951, -8 },
+ { 7960, 7965, -8 },
+ { 7976, 7983, -8 },
+ { 7992, 7999, -8 },
+ { 8008, 8013, -8 },
+ { 8025, 8025, -8 },
+ { 8027, 8027, -8 },
+ { 8029, 8029, -8 },
+ { 8031, 8031, -8 },
+ { 8040, 8047, -8 },
+ { 8072, 8079, -8 },
+ { 8088, 8095, -8 },
+ { 8104, 8111, -8 },
+ { 8120, 8121, -8 },
+ { 8122, 8123, -74 },
+ { 8124, 8124, -9 },
+ { 8126, 8126, -7173 },
+ { 8136, 8139, -86 },
+ { 8140, 8140, -9 },
+ { 8152, 8153, -8 },
+ { 8154, 8155, -100 },
+ { 8168, 8169, -8 },
+ { 8170, 8171, -112 },
+ { 8172, 8172, -7 },
+ { 8184, 8185, -128 },
+ { 8186, 8187, -126 },
+ { 8188, 8188, -9 },
+ { 8486, 8486, -7517 },
+ { 8490, 8490, -8383 },
+ { 8491, 8491, -8262 },
+ { 8498, 8498, 28 },
+ { 8544, 8559, 16 },
+ { 8579, 8579, OddEven },
+ { 9398, 9423, 26 },
+ { 11264, 11310, 48 },
+ { 11360, 11360, EvenOdd },
+ { 11362, 11362, -10743 },
+ { 11363, 11363, -3814 },
+ { 11364, 11364, -10727 },
+ { 11367, 11371, OddEvenSkip },
+ { 11373, 11373, -10780 },
+ { 11374, 11374, -10749 },
+ { 11375, 11375, -10783 },
+ { 11376, 11376, -10782 },
+ { 11378, 11378, EvenOdd },
+ { 11381, 11381, OddEven },
+ { 11390, 11391, -10815 },
+ { 11392, 11490, EvenOddSkip },
+ { 11499, 11501, OddEvenSkip },
+ { 42560, 42604, EvenOddSkip },
+ { 42624, 42646, EvenOddSkip },
+ { 42786, 42798, EvenOddSkip },
+ { 42802, 42862, EvenOddSkip },
+ { 42873, 42875, OddEvenSkip },
+ { 42877, 42877, -35332 },
+ { 42878, 42886, EvenOddSkip },
+ { 42891, 42891, OddEven },
+ { 42893, 42893, -42280 },
+ { 42896, 42896, EvenOdd },
+ { 42912, 42920, EvenOddSkip },
+ { 65313, 65338, 32 },
+ { 66560, 66599, 40 },
+};
+int num_unicode_tolower = 163;
+
+
} // namespace re2
diff --git a/re2/unicode_casefold.h b/re2/unicode_casefold.h
index 12744b7..160b07e 100644
--- a/re2/unicode_casefold.h
+++ b/re2/unicode_casefold.h
@@ -45,7 +45,9 @@ namespace re2 {
enum {
EvenOdd = 1,
- OddEven = -1
+ OddEven = -1,
+ EvenOddSkip = 1<<30,
+ OddEvenSkip,
};
struct CaseFold {
@@ -57,10 +59,16 @@ struct CaseFold {
extern CaseFold unicode_casefold[];
extern int num_unicode_casefold;
+extern CaseFold unicode_tolower[];
+extern int num_unicode_tolower;
+
// Returns the CaseFold* in the tables that contains rune.
// If rune is not in the tables, returns the first CaseFold* after rune.
// If rune is larger than any value in the tables, returns NULL.
-extern CaseFold* LookupCaseFold(uint32 rune);
+extern CaseFold* LookupCaseFold(CaseFold*, int, Rune rune);
+
+// Returns the result of applying the fold f to the rune r.
+extern Rune ApplyFold(CaseFold *f, Rune r);
} // namespace re2