diff options
author | Russ Cox <rsc@swtch.com> | 2011-06-21 15:17:24 -0400 |
---|---|---|
committer | Russ Cox <rsc@swtch.com> | 2011-06-21 15:17:24 -0400 |
commit | 295ae016889386fec74048b186b95ac61b7f5802 (patch) | |
tree | b088d52e4a342af46db6cbd673d228a7c5d4e017 /re2 | |
parent | e6ca1cb28c789fc47f501cffe309a04c9da07cfe (diff) | |
download | regex-re2-295ae016889386fec74048b186b95ac61b7f5802.tar.gz |
export changes made at Google
R=rsc
CC=re2-dev
http://codereview.appspot.com/4625055
Diffstat (limited to 're2')
-rwxr-xr-x | re2/make_unicode_casefold.py | 63 | ||||
-rw-r--r-- | re2/parse.cc | 27 | ||||
-rw-r--r-- | re2/prefilter.cc | 30 | ||||
-rw-r--r-- | re2/re2.cc | 98 | ||||
-rw-r--r-- | re2/re2.h | 15 | ||||
-rw-r--r-- | re2/testing/filtered_re2_test.cc | 14 | ||||
-rw-r--r-- | re2/testing/re2_test.cc | 142 | ||||
-rwxr-xr-x | re2/unicode.py | 2 | ||||
-rw-r--r-- | re2/unicode_casefold.cc | 169 | ||||
-rw-r--r-- | re2/unicode_casefold.h | 12 |
10 files changed, 455 insertions, 117 deletions
diff --git a/re2/make_unicode_casefold.py b/re2/make_unicode_casefold.py index 2e50845..3375d2e 100755 --- a/re2/make_unicode_casefold.py +++ b/re2/make_unicode_casefold.py @@ -9,7 +9,7 @@ """Generate C++ table for Unicode case folding.""" -import unicode +import unicode, sys _header = """ // GENERATED BY make_unicode_casefold.py; DO NOT EDIT. @@ -56,6 +56,7 @@ def _AddDelta(a, delta): return a+1 else: return a-1 + print >>sys.stderr, "Bad Delta: ", delta raise "Bad Delta" def _MakeRanges(pairs): @@ -63,15 +64,40 @@ def _MakeRanges(pairs): into [(65, 90, +32)].""" ranges = [] last = -100 + + def evenodd(last, a, b, r): + if a != last+1 or b != _AddDelta(a, r[2]): + return False + r[1] = a + return True + + def evenoddpair(last, a, b, r): + if a != last+2: + return False + delta = r[2] + d = delta + if type(delta) is not str: + return False + if delta.endswith('Skip'): + d = delta[:-4] + else: + delta = d + 'Skip' + if b != _AddDelta(a, d): + return False + r[1] = a + r[2] = delta + return True + for a, b in pairs: - if a == last+1 and b == _AddDelta(a, ranges[-1][2]): - ranges[-1][1] = a + if ranges and evenodd(last, a, b, ranges[-1]): + pass + elif ranges and evenoddpair(last, a, b, ranges[-1]): + pass else: ranges.append([a, a, _Delta(a, b)]) last = a return ranges - # The maximum size of a case-folding group. # Case folding is implemented in parse.cc by a recursive process # with a recursion depth equal to the size of the largest @@ -82,7 +108,7 @@ def _MakeRanges(pairs): MaxCasefoldGroup = 4 def main(): - casegroups = unicode.CaseGroups() + lowergroups, casegroups = unicode.CaseGroups() foldpairs = [] seen = {} for c in casegroups: @@ -93,16 +119,27 @@ def main(): raise unicode.Error("bad casegroups %d -> %d" % (c[i-1], c[i])) seen[c[i-1]] = True foldpairs.append([c[i-1], c[i]]) - foldpairs.sort() - foldranges = _MakeRanges(foldpairs) + + lowerpairs = [] + for lower, group in lowergroups.iteritems(): + for g in group: + if g != lower: + lowerpairs.append([g, lower]) + + def printpairs(name, foldpairs): + foldpairs.sort() + foldranges = _MakeRanges(foldpairs) + print "// %d groups, %d pairs, %d ranges" % (len(casegroups), len(foldpairs), len(foldranges)) + print "CaseFold unicode_%s[] = {" % (name,) + for lo, hi, delta in foldranges: + print "\t{ %d, %d, %s }," % (lo, hi, delta) + print "};" + print "int num_unicode_%s = %d;" % (name, len(foldranges),) + print "" print _header - print "// %d groups, %d pairs, %d ranges" % (len(casegroups), len(foldpairs), len(foldranges)) - print "CaseFold unicode_casefold[] = {" - for lo, hi, delta in foldranges: - print "\t{ %d, %d, %s }," % (lo, hi, delta) - print "};" - print "int num_unicode_casefold = %d;" % (len(foldranges),) + printpairs("casefold", foldpairs) + printpairs("tolower", lowerpairs) print _trailer if __name__ == '__main__': diff --git a/re2/parse.cc b/re2/parse.cc index 605a320..9ae96eb 100644 --- a/re2/parse.cc +++ b/re2/parse.cc @@ -240,12 +240,8 @@ bool Regexp::ParseState::PushRegexp(Regexp* re) { // Searches the case folding tables and returns the CaseFold* that contains r. // If there isn't one, returns the CaseFold* with smallest f->lo bigger than r. // If there isn't one, returns NULL. -CaseFold* LookupCaseFold(Rune r) { - CaseFold *f; - int n, m; - - f = unicode_casefold; - n = num_unicode_casefold; +CaseFold* LookupCaseFold(CaseFold *f, int n, Rune r) { + int m; // Binary search for entry containing r. while (n > 0) { @@ -272,16 +268,24 @@ CaseFold* LookupCaseFold(Rune r) { } // Returns the result of applying the fold f to the rune r. -static Rune ApplyFold(CaseFold *f, Rune r) { +Rune ApplyFold(CaseFold *f, Rune r) { switch (f->delta) { default: return r + f->delta; + case EvenOddSkip: // even <-> odd but only applies to every other + if ((r - f->lo) % 2) + return r; + // fall through case EvenOdd: // even <-> odd if (r%2 == 0) return r + 1; return r - 1; + case OddEvenSkip: // odd <-> even but only applies to every other + if ((r - f->lo) % 2) + return r; + // fall through case OddEven: // odd <-> even if (r%2 == 1) return r + 1; @@ -300,7 +304,7 @@ static Rune ApplyFold(CaseFold *f, Rune r) { // // CycleFoldRune('?') = '?' Rune CycleFoldRune(Rune r) { - CaseFold* f = LookupCaseFold(r); + CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, r); if (f == NULL || r < f->lo) return r; return ApplyFold(f, r); @@ -323,7 +327,7 @@ static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) { return; while (lo <= hi) { - CaseFold* f = LookupCaseFold(lo); + CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, lo); if (f == NULL) // lo has no fold, nor does anything above lo break; if (lo < f->lo) { // lo has no fold; next rune with a fold is f->lo @@ -1275,8 +1279,7 @@ static bool ParseEscape(StringPiece* s, Rune* rp, case '0': // consume up to three octal digits; already have one. code = c - '0'; - c = (*s)[0]; - if (s->size() > 0 && '0' <= c && c <= '7') { + if (s->size() > 0 && '0' <= (c = (*s)[0]) && c <= '7') { code = code * 8 + c - '0'; s->remove_prefix(1); // digit if (s->size() > 0) { @@ -1360,7 +1363,7 @@ static bool ParseEscape(StringPiece* s, Rune* rp, // in Perl, \b means word-boundary but [\b] // means backspace. We don't support that: // if you want a backspace embed a literal - // backspace character or use \008. + // backspace character or use \x08. // // case 'b': // *rp = '\b'; diff --git a/re2/prefilter.cc b/re2/prefilter.cc index 93aea42..1367315 100644 --- a/re2/prefilter.cc +++ b/re2/prefilter.cc @@ -5,6 +5,7 @@ #include "util/util.h" #include "re2/prefilter.h" #include "re2/re2.h" +#include "re2/unicode_casefold.h" #include "re2/walker-inl.h" namespace re2 { @@ -167,17 +168,22 @@ Prefilter* Prefilter::OrStrings(set<string>* ss) { return or_prefilter; } +static Rune ToLowerRune(Rune r) { + if (r < Runeself) { + if ('A' <= r && r <= 'Z') + r += 'a' - 'A'; + return r; + } + + CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r); + if (f == NULL) + return r; + return ApplyFold(f, r); +} + Prefilter* Prefilter::FromString(const string& str) { Prefilter* m = new Prefilter(Prefilter::ATOM); - string s = str; - for (int i = 0; i < s.size(); i++) { - int c = s[i]; - if ('A' <= c && c <= 'Z') { - c += 'a' - 'A'; - s[i] = c; - } - } - m->atom_ = s; + m->atom_ = str; return m; } @@ -270,7 +276,7 @@ string Prefilter::Info::ToString() { } return s; } - + if (match_) return match_->DebugString(); @@ -387,7 +393,7 @@ static string RuneToString(Rune r) { // Constructs Info for literal rune. Prefilter::Info* Prefilter::Info::Literal(Rune r) { Info* info = new Info(); - info->exact_.insert(RuneToString(r)); + info->exact_.insert(RuneToString(ToLowerRune(r))); info->is_exact_ = true; return info; } @@ -440,7 +446,7 @@ Prefilter::Info* Prefilter::Info::CClass(CharClass *cc) { Prefilter::Info *a = new Prefilter::Info(); for (CCIter i = cc->begin(); i != cc->end(); ++i) for (Rune r = i->lo; r <= i->hi; r++) - a->exact_.insert(RuneToString(r)); + a->exact_.insert(RuneToString(ToLowerRune(r))); a->is_exact_ = true; @@ -509,12 +509,12 @@ bool RE2::Match(const StringPiece& text, LOG(ERROR) << "Invalid RE2: " << *error_; return false; } - + if (startpos < 0 || startpos > endpos || endpos > text.size()) { LOG(ERROR) << "RE2: invalid startpos, endpos pair."; return false; } - + StringPiece subtext = text; subtext.remove_prefix(startpos); subtext.remove_suffix(text.size() - endpos); @@ -941,32 +941,53 @@ bool RE2::Arg::parse_uchar(const char* str, int n, void* dest) { static const int kMaxNumberLength = 32; // REQUIRES "buf" must have length at least kMaxNumberLength+1 -// REQUIRES "n > 0" -// Copies "str" into "buf" and null-terminates if necessary. -// Returns one of: -// a. "str" if no termination is needed -// b. "buf" if the string was copied and null-terminated -// c. "" if the input was invalid and has no hope of being parsed -static const char* TerminateNumber(char* buf, const char* str, int n) { - if ((n > 0) && isspace(*str)) { +// Copies "str" into "buf" and null-terminates. +// Overwrites *np with the new length. +static const char* TerminateNumber(char* buf, const char* str, int* np) { + int n = *np; + if (n <= 0) return ""; + if (n > 0 && isspace(*str)) { // We are less forgiving than the strtoxxx() routines and do not // allow leading spaces. return ""; } - // See if the character right after the input text may potentially - // look like a digit. - if (isdigit(str[n]) || - ((str[n] >= 'a') && (str[n] <= 'f')) || - ((str[n] >= 'A') && (str[n] <= 'F'))) { - if (n > kMaxNumberLength) return ""; // Input too big to be a valid number - memcpy(buf, str, n); - buf[n] = '\0'; - return buf; - } else { - // We can parse right out of the supplied string, so return it. - return str; + // Although buf has a fixed maximum size, we can still handle + // arbitrarily large integers correctly by omitting leading zeros. + // (Numbers that are still too long will be out of range.) + // Before deciding whether str is too long, + // remove leading zeros with s/000+/00/. + // Leaving the leading two zeros in place means that + // we don't change 0000x123 (invalid) into 0x123 (valid). + // Skip over leading - before replacing. + bool neg = false; + if (n >= 1 && str[0] == '-') { + neg = true; + n--; + str++; + } + + if (n >= 3 && str[0] == '0' && str[1] == '0') { + while (n >= 3 && str[2] == '0') { + n--; + str++; + } + } + + if (neg) { // make room in buf for - + n++; + str--; + } + + if (n > kMaxNumberLength) return ""; + + memmove(buf, str, n); + if (neg) { + buf[0] = '-'; } + buf[n] = '\0'; + *np = n; + return buf; } bool RE2::Arg::parse_long_radix(const char* str, @@ -975,7 +996,7 @@ bool RE2::Arg::parse_long_radix(const char* str, int radix) { if (n == 0) return false; char buf[kMaxNumberLength+1]; - str = TerminateNumber(buf, str, n); + str = TerminateNumber(buf, str, &n); char* end; errno = 0; long r = strtol(str, &end, radix); @@ -992,7 +1013,7 @@ bool RE2::Arg::parse_ulong_radix(const char* str, int radix) { if (n == 0) return false; char buf[kMaxNumberLength+1]; - str = TerminateNumber(buf, str, n); + str = TerminateNumber(buf, str, &n); if (str[0] == '-') { // strtoul() will silently accept negative numbers and parse // them. This module is more strict and treats them as errors. @@ -1063,7 +1084,7 @@ bool RE2::Arg::parse_longlong_radix(const char* str, int radix) { if (n == 0) return false; char buf[kMaxNumberLength+1]; - str = TerminateNumber(buf, str, n); + str = TerminateNumber(buf, str, &n); char* end; errno = 0; int64 r = strtoll(str, &end, radix); @@ -1080,7 +1101,7 @@ bool RE2::Arg::parse_ulonglong_radix(const char* str, int radix) { if (n == 0) return false; char buf[kMaxNumberLength+1]; - str = TerminateNumber(buf, str, n); + str = TerminateNumber(buf, str, &n); if (str[0] == '-') { // strtoull() will silently accept negative numbers and parse // them. This module is more strict and treats them as errors. @@ -1096,7 +1117,7 @@ bool RE2::Arg::parse_ulonglong_radix(const char* str, return true; } -bool RE2::Arg::parse_double(const char* str, int n, void* dest) { +static bool parse_double_float(const char* str, int n, bool isfloat, void *dest) { if (n == 0) return false; static const int kMaxLength = 200; char buf[kMaxLength]; @@ -1105,20 +1126,29 @@ bool RE2::Arg::parse_double(const char* str, int n, void* dest) { buf[n] = '\0'; errno = 0; char* end; - double r = strtod(buf, &end); + double r; + if (isfloat) { + r = strtof(buf, &end); + } else { + r = strtod(buf, &end); + } if (end != buf + n) return false; // Leftover junk if (errno) return false; if (dest == NULL) return true; - *(reinterpret_cast<double*>(dest)) = r; + if (isfloat) { + *(reinterpret_cast<float*>(dest)) = r; + } else { + *(reinterpret_cast<double*>(dest)) = r; + } return true; } +bool RE2::Arg::parse_double(const char* str, int n, void* dest) { + return parse_double_float(str, n, false, dest); +} + bool RE2::Arg::parse_float(const char* str, int n, void* dest) { - double r; - if (!parse_double(str, n, &r)) return false; - if (dest == NULL) return true; - *(reinterpret_cast<float*>(dest)) = static_cast<float>(r); - return true; + return parse_double_float(str, n, true, dest); } @@ -17,6 +17,9 @@ // some of the more complicated things thrown away. In particular, // backreferences and generalized assertions are not available, nor is \Z. // +// See http://code.google.com/p/re2/wiki/Syntax for the syntax +// supported by RE2, and a comparison with PCRE and PERL regexps. +// // For those not familiar with Perl's regular expressions, // here are some examples of the most commonly used extensions: // @@ -478,12 +481,13 @@ class RE2 { // never_nl (false) never match \n, even if it is in regexp // case_sensitive (true) match is case-sensitive (regexp can override // with (?i) unless in posix_syntax mode) - // perl_classes (false) allow Perl's \d \s \w \D \S \W when in - // posix_syntax mode - // word_boundary (false) allow \b \B (word boundary and not) when in - // posix_syntax mode + // + // The following options are only consulted when posix_syntax == true. + // (When posix_syntax == false these features are always enabled and + // cannot be turned off.) + // perl_classes (false) allow Perl's \d \s \w \D \S \W + // word_boundary (false) allow Perl's \b \B (word boundary and not) // one_line (false) ^ and $ only match beginning and end of text - // when in posix_syntax mode // // The max_mem option controls how much memory can be used // to hold the compiled form of the regexp (the Prog) and @@ -738,6 +742,7 @@ class RE2::Arg { MAKE_PARSER(char, parse_char); + MAKE_PARSER(signed char, parse_char); MAKE_PARSER(unsigned char, parse_uchar); MAKE_PARSER(short, parse_short); MAKE_PARSER(unsigned short, parse_ushort); diff --git a/re2/testing/filtered_re2_test.cc b/re2/testing/filtered_re2_test.cc index 2eee57d..7755d30 100644 --- a/re2/testing/filtered_re2_test.cc +++ b/re2/testing/filtered_re2_test.cc @@ -99,13 +99,23 @@ AtomTest atom_tests[] = { "xbcdea", "xbcdeb", "ybcdea", "ybcdeb" } - } + }, { + // Test upper/lower of non-ASCII. + "UnicodeLower", { + "(?i)ΔδΠϖπΣςσ", + "ΛΜΝΟΠ", + "ψρστυ", + }, { + "δδπππσσσ", + "λμνοπ", + "ψρστυ", + }, + }, }; void AddRegexpsAndCompile(const char* regexps[], int n, struct FilterTestVars* v) { - v->opts.set_utf8(false); for (int i = 0; i < n; i++) { int id; v->f.Add(regexps[i], v->opts, &id); diff --git a/re2/testing/re2_test.cc b/re2/testing/re2_test.cc index b050680..d9a2567 100644 --- a/re2/testing/re2_test.cc +++ b/re2/testing/re2_test.cc @@ -6,10 +6,13 @@ // TODO: Test extractions for PartialMatch/Consume #include <sys/types.h> +#include <sys/mman.h> #include <sys/stat.h> +#include <errno.h> #include <vector> #include "util/test.h" #include "re2/re2.h" +#include "re2/regexp.h" DECLARE_bool(logtostderr); @@ -657,8 +660,30 @@ TEST(RE2, FullMatchTypedNullArg) { CHECK(!RE2::FullMatch("hello", "(.*)", (float*)NULL)); } +// Check that numeric parsing code does not read past the end of +// the number being parsed. +TEST(RE2, NULTerminated) { + char *v; + int x; + +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif + v = static_cast<char*>(mmap(NULL, 16*1024, PROT_READ|PROT_WRITE, + MAP_ANONYMOUS|MAP_PRIVATE, -1, 0)); + CHECK(v != reinterpret_cast<char*>(-1)); + LOG(INFO) << "Memory at " << (void*)v; + CHECK_EQ(munmap(v + 8*1024, 8*1024), 0) << " error " << errno; + v[8*1024 - 1] = '1'; + + x = 0; + CHECK(RE2::FullMatch(StringPiece(v + 8*1024 - 1, 1), "(.*)", &x)); + CHECK_EQ(x, 1); +} + TEST(RE2, FullMatchTypeTests) { // Type tests + string zeros(100, '0'); { char c; CHECK(RE2::FullMatch("Hello", "(H)ello", &c)); @@ -687,74 +712,120 @@ TEST(RE2, FullMatchTypeTests) { } { int32 v; - static const int32 max_value = 0x7fffffff; - static const int32 min_value = -max_value - 1; - CHECK(RE2::FullMatch("100", "(-?\\d+)",&v)); CHECK_EQ(v, 100); - CHECK(RE2::FullMatch("-100", "(-?\\d+)",&v)); CHECK_EQ(v, -100); - CHECK(RE2::FullMatch("2147483647", "(-?\\d+)",&v)); CHECK_EQ(v, max_value); - CHECK(RE2::FullMatch("-2147483648", "(-?\\d+)",&v)); CHECK_EQ(v, min_value); - CHECK(!RE2::FullMatch("-2147483649", "(-?\\d+)",&v)); - CHECK(!RE2::FullMatch("2147483648", "(-?\\d+)",&v)); + static const int32 max = 0x7fffffff; + static const int32 min = -max - 1; + CHECK(RE2::FullMatch("100", "(-?\\d+)", &v)); CHECK_EQ(v, 100); + CHECK(RE2::FullMatch("-100", "(-?\\d+)", &v)); CHECK_EQ(v, -100); + CHECK(RE2::FullMatch("2147483647", "(-?\\d+)", &v)); CHECK_EQ(v, max); + CHECK(RE2::FullMatch("-2147483648", "(-?\\d+)", &v)); CHECK_EQ(v, min); + CHECK(!RE2::FullMatch("-2147483649", "(-?\\d+)", &v)); + CHECK(!RE2::FullMatch("2147483648", "(-?\\d+)", &v)); + + CHECK(RE2::FullMatch(zeros + "2147483647", "(-?\\d+)", &v)); + CHECK_EQ(v, max); + CHECK(RE2::FullMatch("-" + zeros + "2147483648", "(-?\\d+)", &v)); + CHECK_EQ(v, min); + + CHECK(!RE2::FullMatch("-" + zeros + "2147483649", "(-?\\d+)", &v)); + CHECK(RE2::FullMatch("0x7fffffff", "(.*)", RE2::CRadix(&v))); + CHECK_EQ(v, max); + CHECK(!RE2::FullMatch("000x7fffffff", "(.*)", RE2::CRadix(&v))); } { uint32 v; - static const uint32 max_value = 0xfffffffful; + static const uint32 max = 0xfffffffful; CHECK(RE2::FullMatch("100", "(\\d+)", &v)); CHECK_EQ(v, 100); - CHECK(RE2::FullMatch("4294967295", "(\\d+)", &v)); CHECK_EQ(v, max_value); + CHECK(RE2::FullMatch("4294967295", "(\\d+)", &v)); CHECK_EQ(v, max); CHECK(!RE2::FullMatch("4294967296", "(\\d+)", &v)); + CHECK(!RE2::FullMatch("-1", "(\\d+)", &v)); + + CHECK(RE2::FullMatch(zeros + "4294967295", "(\\d+)", &v)); CHECK_EQ(v, max); } { int64 v; - static const int64 max_value = 0x7fffffffffffffffull; - static const int64 min_value = -max_value - 1; + static const int64 max = 0x7fffffffffffffffull; + static const int64 min = -max - 1; char buf[32]; - CHECK(RE2::FullMatch("100", "(-?\\d+)",&v)); CHECK_EQ(v, 100); - CHECK(RE2::FullMatch("-100", "(-?\\d+)",&v)); CHECK_EQ(v, -100); + CHECK(RE2::FullMatch("100", "(-?\\d+)", &v)); CHECK_EQ(v, 100); + CHECK(RE2::FullMatch("-100", "(-?\\d+)", &v)); CHECK_EQ(v, -100); - snprintf(buf, sizeof(buf), "%lld", (long long)max_value); - CHECK(RE2::FullMatch(buf, "(-?\\d+)",&v)); CHECK_EQ(v, max_value); + snprintf(buf, sizeof(buf), "%lld", max); + CHECK(RE2::FullMatch(buf, "(-?\\d+)", &v)); CHECK_EQ(v, max); - snprintf(buf, sizeof(buf), "%lld", (long long)min_value); - CHECK(RE2::FullMatch(buf, "(-?\\d+)",&v)); CHECK_EQ(v, min_value); + snprintf(buf, sizeof(buf), "%lld", min); + CHECK(RE2::FullMatch(buf, "(-?\\d+)", &v)); CHECK_EQ(v, min); - snprintf(buf, sizeof(buf), "%lld", (long long)max_value); + snprintf(buf, sizeof(buf), "%lld", max); assert(buf[strlen(buf)-1] != '9'); buf[strlen(buf)-1]++; - CHECK(!RE2::FullMatch(buf, "(-?\\d+)", &v)); + CHECK(!RE2::FullMatch(buf, "(-?\\d+)", &v)); - snprintf(buf, sizeof(buf), "%lld", (long long)min_value); + snprintf(buf, sizeof(buf), "%lld", min); assert(buf[strlen(buf)-1] != '9'); buf[strlen(buf)-1]++; - CHECK(!RE2::FullMatch(buf, "(-?\\d+)", &v)); + CHECK(!RE2::FullMatch(buf, "(-?\\d+)", &v)); } { uint64 v; int64 v2; - static const uint64 max_value = 0xffffffffffffffffull; + static const uint64 max = 0xffffffffffffffffull; char buf[32]; - CHECK(RE2::FullMatch("100", "(-?\\d+)",&v)); CHECK_EQ(v, 100); - CHECK(RE2::FullMatch("-100", "(-?\\d+)",&v2)); CHECK_EQ(v2, -100); + CHECK(RE2::FullMatch("100", "(-?\\d+)", &v)); CHECK_EQ(v, 100); + CHECK(RE2::FullMatch("-100", "(-?\\d+)", &v2)); CHECK_EQ(v2, -100); - snprintf(buf, sizeof(buf), "%llu", (unsigned long long)max_value); - CHECK(RE2::FullMatch(buf, "(-?\\d+)",&v)); CHECK_EQ(v, max_value); + snprintf(buf, sizeof(buf), "%llu", max); + CHECK(RE2::FullMatch(buf, "(-?\\d+)", &v)); CHECK_EQ(v, max); assert(buf[strlen(buf)-1] != '9'); buf[strlen(buf)-1]++; - CHECK(!RE2::FullMatch(buf, "(-?\\d+)", &v)); + CHECK(!RE2::FullMatch(buf, "(-?\\d+)", &v)); } +} + +TEST(RE2, FloatingPointFullMatchTypes) { + string zeros(100, '0'); { float v; - CHECK(RE2::FullMatch("100", "(.*)", &v)); - CHECK(RE2::FullMatch("-100.", "(.*)", &v)); - CHECK(RE2::FullMatch("1e23", "(.*)", &v)); + CHECK(RE2::FullMatch("100", "(.*)", &v)); CHECK_EQ(v, 100); + CHECK(RE2::FullMatch("-100.", "(.*)", &v)); CHECK_EQ(v, -100); + CHECK(RE2::FullMatch("1e23", "(.*)", &v)); CHECK_EQ(v, float(1e23)); + + CHECK(RE2::FullMatch(zeros + "1e23", "(.*)", &v)); + CHECK_EQ(v, float(1e23)); + + // 6700000000081920.1 is an edge case. + // 6700000000081920 is exactly halfway between + // two float32s, so the .1 should make it round up. + // However, the .1 is outside the precision possible with + // a float64: the nearest float64 is 6700000000081920. + // So if the code uses strtod and then converts to float32, + // round-to-even will make it round down instead of up. + // To pass the test, the parser must call strtof directly. + // This test case is carefully chosen to use only a 17-digit + // number, since C does not guarantee to get the correctly + // rounded answer for strtod and strtof unless the input is + // short. + CHECK(RE2::FullMatch("0.1", "(.*)", &v)); + CHECK_EQ(v, 0.1f) << StringPrintf("%.8g != %.8g", v, 0.1f); + CHECK(RE2::FullMatch("6700000000081920.1", "(.*)", &v)); + CHECK_EQ(v, 6700000000081920.1f) + << StringPrintf("%.8g != %.8g", v, 6700000000081920.1f); } { double v; - CHECK(RE2::FullMatch("100", "(.*)", &v)); - CHECK(RE2::FullMatch("-100.", "(.*)", &v)); - CHECK(RE2::FullMatch("1e23", "(.*)", &v)); + CHECK(RE2::FullMatch("100", "(.*)", &v)); CHECK_EQ(v, 100); + CHECK(RE2::FullMatch("-100.", "(.*)", &v)); CHECK_EQ(v, -100); + CHECK(RE2::FullMatch("1e23", "(.*)", &v)); CHECK_EQ(v, 1e23); + CHECK(RE2::FullMatch(zeros + "1e23", "(.*)", &v)); + CHECK_EQ(v, double(1e23)); + + CHECK(RE2::FullMatch("0.1", "(.*)", &v)); + CHECK_EQ(v, 0.1) << StringPrintf("%.17g != %.17g", v, 0.1); + CHECK(RE2::FullMatch("1.00000005960464485", "(.*)", &v)); + CHECK_EQ(v, 1.0000000596046448) + << StringPrintf("%.17g != %.17g", v, 1.0000000596046448); } } @@ -1184,8 +1255,7 @@ TEST(RE2, BitstateCaptureBug) { opt.set_max_mem(20000); RE2 re("(_________$)", opt); StringPiece s = "xxxxxxxxxxxxxxxxxxxxxxxxxx_________x"; - EXPECT_FALSE(re.Match( - s, 0, s.size(), RE2::UNANCHORED, NULL, 0)); + EXPECT_FALSE(re.Match(s, 0, s.size(), RE2::UNANCHORED, NULL, 0)); } // C++ version of bug 609710. diff --git a/re2/unicode.py b/re2/unicode.py index a7fb778..8d78312 100755 --- a/re2/unicode.py +++ b/re2/unicode.py @@ -247,7 +247,7 @@ def CaseGroups(unicode_dir=_UNICODE_DIR): for g in groups: g.sort() groups.sort() - return groups + return togroup, groups def Scripts(unicode_dir=_UNICODE_DIR): diff --git a/re2/unicode_casefold.cc b/re2/unicode_casefold.cc index cb35f7a..6d4e878 100644 --- a/re2/unicode_casefold.cc +++ b/re2/unicode_casefold.cc @@ -294,6 +294,175 @@ CaseFold unicode_casefold[] = { }; int num_unicode_casefold = 282; +// 1029 groups, 1050 pairs, 163 ranges +CaseFold unicode_tolower[] = { + { 65, 90, 32 }, + { 181, 181, 775 }, + { 192, 214, 32 }, + { 216, 222, 32 }, + { 256, 302, EvenOddSkip }, + { 306, 310, EvenOddSkip }, + { 313, 327, OddEvenSkip }, + { 330, 374, EvenOddSkip }, + { 376, 376, -121 }, + { 377, 381, OddEvenSkip }, + { 383, 383, -268 }, + { 385, 385, 210 }, + { 386, 388, EvenOddSkip }, + { 390, 390, 206 }, + { 391, 391, OddEven }, + { 393, 394, 205 }, + { 395, 395, OddEven }, + { 398, 398, 79 }, + { 399, 399, 202 }, + { 400, 400, 203 }, + { 401, 401, OddEven }, + { 403, 403, 205 }, + { 404, 404, 207 }, + { 406, 406, 211 }, + { 407, 407, 209 }, + { 408, 408, EvenOdd }, + { 412, 412, 211 }, + { 413, 413, 213 }, + { 415, 415, 214 }, + { 416, 420, EvenOddSkip }, + { 422, 422, 218 }, + { 423, 423, OddEven }, + { 425, 425, 218 }, + { 428, 428, EvenOdd }, + { 430, 430, 218 }, + { 431, 431, OddEven }, + { 433, 434, 217 }, + { 435, 437, OddEvenSkip }, + { 439, 439, 219 }, + { 440, 440, EvenOdd }, + { 444, 444, EvenOdd }, + { 452, 452, 2 }, + { 453, 453, OddEven }, + { 455, 455, 2 }, + { 456, 456, EvenOdd }, + { 458, 458, 2 }, + { 459, 475, OddEvenSkip }, + { 478, 494, EvenOddSkip }, + { 497, 497, 2 }, + { 498, 500, EvenOddSkip }, + { 502, 502, -97 }, + { 503, 503, -56 }, + { 504, 542, EvenOddSkip }, + { 544, 544, -130 }, + { 546, 562, EvenOddSkip }, + { 570, 570, 10795 }, + { 571, 571, OddEven }, + { 573, 573, -163 }, + { 574, 574, 10792 }, + { 577, 577, OddEven }, + { 579, 579, -195 }, + { 580, 580, 69 }, + { 581, 581, 71 }, + { 582, 590, EvenOddSkip }, + { 837, 837, 116 }, + { 880, 882, EvenOddSkip }, + { 886, 886, EvenOdd }, + { 902, 902, 38 }, + { 904, 906, 37 }, + { 908, 908, 64 }, + { 910, 911, 63 }, + { 913, 929, 32 }, + { 931, 939, 32 }, + { 962, 962, EvenOdd }, + { 975, 975, 8 }, + { 976, 976, -30 }, + { 977, 977, -25 }, + { 981, 981, -15 }, + { 982, 982, -22 }, + { 984, 1006, EvenOddSkip }, + { 1008, 1008, -54 }, + { 1009, 1009, -48 }, + { 1012, 1012, -60 }, + { 1013, 1013, -64 }, + { 1015, 1015, OddEven }, + { 1017, 1017, -7 }, + { 1018, 1018, EvenOdd }, + { 1021, 1023, -130 }, + { 1024, 1039, 80 }, + { 1040, 1071, 32 }, + { 1120, 1152, EvenOddSkip }, + { 1162, 1214, EvenOddSkip }, + { 1216, 1216, 15 }, + { 1217, 1229, OddEvenSkip }, + { 1232, 1318, EvenOddSkip }, + { 1329, 1366, 48 }, + { 4256, 4293, 7264 }, + { 7680, 7828, EvenOddSkip }, + { 7835, 7835, -58 }, + { 7838, 7838, -7615 }, + { 7840, 7934, EvenOddSkip }, + { 7944, 7951, -8 }, + { 7960, 7965, -8 }, + { 7976, 7983, -8 }, + { 7992, 7999, -8 }, + { 8008, 8013, -8 }, + { 8025, 8025, -8 }, + { 8027, 8027, -8 }, + { 8029, 8029, -8 }, + { 8031, 8031, -8 }, + { 8040, 8047, -8 }, + { 8072, 8079, -8 }, + { 8088, 8095, -8 }, + { 8104, 8111, -8 }, + { 8120, 8121, -8 }, + { 8122, 8123, -74 }, + { 8124, 8124, -9 }, + { 8126, 8126, -7173 }, + { 8136, 8139, -86 }, + { 8140, 8140, -9 }, + { 8152, 8153, -8 }, + { 8154, 8155, -100 }, + { 8168, 8169, -8 }, + { 8170, 8171, -112 }, + { 8172, 8172, -7 }, + { 8184, 8185, -128 }, + { 8186, 8187, -126 }, + { 8188, 8188, -9 }, + { 8486, 8486, -7517 }, + { 8490, 8490, -8383 }, + { 8491, 8491, -8262 }, + { 8498, 8498, 28 }, + { 8544, 8559, 16 }, + { 8579, 8579, OddEven }, + { 9398, 9423, 26 }, + { 11264, 11310, 48 }, + { 11360, 11360, EvenOdd }, + { 11362, 11362, -10743 }, + { 11363, 11363, -3814 }, + { 11364, 11364, -10727 }, + { 11367, 11371, OddEvenSkip }, + { 11373, 11373, -10780 }, + { 11374, 11374, -10749 }, + { 11375, 11375, -10783 }, + { 11376, 11376, -10782 }, + { 11378, 11378, EvenOdd }, + { 11381, 11381, OddEven }, + { 11390, 11391, -10815 }, + { 11392, 11490, EvenOddSkip }, + { 11499, 11501, OddEvenSkip }, + { 42560, 42604, EvenOddSkip }, + { 42624, 42646, EvenOddSkip }, + { 42786, 42798, EvenOddSkip }, + { 42802, 42862, EvenOddSkip }, + { 42873, 42875, OddEvenSkip }, + { 42877, 42877, -35332 }, + { 42878, 42886, EvenOddSkip }, + { 42891, 42891, OddEven }, + { 42893, 42893, -42280 }, + { 42896, 42896, EvenOdd }, + { 42912, 42920, EvenOddSkip }, + { 65313, 65338, 32 }, + { 66560, 66599, 40 }, +}; +int num_unicode_tolower = 163; + + } // namespace re2 diff --git a/re2/unicode_casefold.h b/re2/unicode_casefold.h index 12744b7..160b07e 100644 --- a/re2/unicode_casefold.h +++ b/re2/unicode_casefold.h @@ -45,7 +45,9 @@ namespace re2 { enum { EvenOdd = 1, - OddEven = -1 + OddEven = -1, + EvenOddSkip = 1<<30, + OddEvenSkip, }; struct CaseFold { @@ -57,10 +59,16 @@ struct CaseFold { extern CaseFold unicode_casefold[]; extern int num_unicode_casefold; +extern CaseFold unicode_tolower[]; +extern int num_unicode_tolower; + // Returns the CaseFold* in the tables that contains rune. // If rune is not in the tables, returns the first CaseFold* after rune. // If rune is larger than any value in the tables, returns NULL. -extern CaseFold* LookupCaseFold(uint32 rune); +extern CaseFold* LookupCaseFold(CaseFold*, int, Rune rune); + +// Returns the result of applying the fold f to the rune r. +extern Rune ApplyFold(CaseFold *f, Rune r); } // namespace re2 |