export changes made at Google

R=rsc CC=re2-dev http://codereview.appspot.com/4625055
author: Russ Cox <rsc@swtch.com> 2011-06-21 15:17:24 -0400
committer: Russ Cox <rsc@swtch.com> 2011-06-21 15:17:24 -0400
commit: 295ae016889386fec74048b186b95ac61b7f5802 (patch)
tree: b088d52e4a342af46db6cbd673d228a7c5d4e017 /re2
parent: e6ca1cb28c789fc47f501cffe309a04c9da07cfe (diff)
download: regex-re2-295ae016889386fec74048b186b95ac61b7f5802.tar.gz
10 files changed, 455 insertions, 117 deletions
diff --git a/re2/make_unicode_casefold.py b/re2/make_unicode_casefold.py
index 2e50845..3375d2e 100755
--- a/re2/make_unicode_casefold.py
+++ b/re2/make_unicode_casefold.py
@@ -9,7 +9,7 @@
 
 """Generate C++ table for Unicode case folding."""
 
-import unicode
+import unicode, sys
 
 _header = """
 // GENERATED BY make_unicode_casefold.py; DO NOT EDIT.
@@ -56,6 +56,7 @@ def _AddDelta(a, delta):
       return a+1
     else:
       return a-1
+  print >>sys.stderr, "Bad Delta: ", delta
   raise "Bad Delta"
 
 def _MakeRanges(pairs):
@@ -63,15 +64,40 @@ def _MakeRanges(pairs):
      into [(65, 90, +32)]."""
   ranges = []
   last = -100
+
+  def evenodd(last, a, b, r):
+    if a != last+1 or b != _AddDelta(a, r[2]):
+      return False
+    r[1] = a
+    return True
+
+  def evenoddpair(last, a, b, r):
+    if a != last+2:
+      return False
+    delta = r[2]
+    d = delta
+    if type(delta) is not str:
+      return False
+    if delta.endswith('Skip'):
+      d = delta[:-4]
+    else:
+      delta = d + 'Skip'
+    if b != _AddDelta(a, d):
+      return False
+    r[1] = a
+    r[2] = delta
+    return True
+
   for a, b in pairs:
-    if a == last+1 and b == _AddDelta(a, ranges[-1][2]):
-      ranges[-1][1] = a
+    if ranges and evenodd(last, a, b, ranges[-1]):
+      pass
+    elif ranges and evenoddpair(last, a, b, ranges[-1]):
+      pass
     else:
       ranges.append([a, a, _Delta(a, b)])
     last = a
   return ranges
 
-
 # The maximum size of a case-folding group.
 # Case folding is implemented in parse.cc by a recursive process
 # with a recursion depth equal to the size of the largest
@@ -82,7 +108,7 @@ def _MakeRanges(pairs):
 MaxCasefoldGroup = 4
 
 def main():
-  casegroups = unicode.CaseGroups()
+  lowergroups, casegroups = unicode.CaseGroups()
   foldpairs = []
   seen = {}
   for c in casegroups:
@@ -93,16 +119,27 @@ def main():
         raise unicode.Error("bad casegroups %d -> %d" % (c[i-1], c[i]))
       seen[c[i-1]] = True
       foldpairs.append([c[i-1], c[i]])
-  foldpairs.sort()
-  foldranges = _MakeRanges(foldpairs)
+
+  lowerpairs = []
+  for lower, group in lowergroups.iteritems():
+    for g in group:
+      if g != lower:
+        lowerpairs.append([g, lower])
+
+  def printpairs(name, foldpairs):
+    foldpairs.sort()
+    foldranges = _MakeRanges(foldpairs)
+    print "// %d groups, %d pairs, %d ranges" % (len(casegroups), len(foldpairs), len(foldranges))
+    print "CaseFold unicode_%s[] = {" % (name,)
+    for lo, hi, delta in foldranges:
+      print "\t{ %d, %d, %s }," % (lo, hi, delta)
+    print "};"
+    print "int num_unicode_%s = %d;" % (name, len(foldranges),)
+    print ""
 
   print _header
-  print "// %d groups, %d pairs, %d ranges" % (len(casegroups), len(foldpairs), len(foldranges))
-  print "CaseFold unicode_casefold[] = {"
-  for lo, hi, delta in foldranges:
-    print "\t{ %d, %d, %s }," % (lo, hi, delta)
-  print "};"
-  print "int num_unicode_casefold = %d;" % (len(foldranges),)
+  printpairs("casefold", foldpairs)
+  printpairs("tolower", lowerpairs)
   print _trailer
 
 if __name__ == '__main__':
diff --git a/re2/parse.cc b/re2/parse.cc
index 605a320..9ae96eb 100644
--- a/re2/parse.cc
+++ b/re2/parse.cc
@@ -240,12 +240,8 @@ bool Regexp::ParseState::PushRegexp(Regexp* re) {
 // Searches the case folding tables and returns the CaseFold* that contains r.
 // If there isn't one, returns the CaseFold* with smallest f->lo bigger than r.
 // If there isn't one, returns NULL.
-CaseFold* LookupCaseFold(Rune r) {
-  CaseFold *f;
-  int n, m;
-
-  f = unicode_casefold;
-  n = num_unicode_casefold;
+CaseFold* LookupCaseFold(CaseFold *f, int n, Rune r) {
+  int m;
 
   // Binary search for entry containing r.
   while (n > 0) {
@@ -272,16 +268,24 @@ CaseFold* LookupCaseFold(Rune r) {
 }
 
 // Returns the result of applying the fold f to the rune r.
-static Rune ApplyFold(CaseFold *f, Rune r) {
+Rune ApplyFold(CaseFold *f, Rune r) {
   switch (f->delta) {
     default:
       return r + f->delta;
 
+    case EvenOddSkip:  // even <-> odd but only applies to every other
+      if ((r - f->lo) % 2)
+        return r;
+      // fall through
     case EvenOdd:  // even <-> odd
       if (r%2 == 0)
         return r + 1;
       return r - 1;
 
+    case OddEvenSkip:  // odd <-> even but only applies to every other
+      if ((r - f->lo) % 2)
+        return r;
+      // fall through
     case OddEven:  // odd <-> even
       if (r%2 == 1)
         return r + 1;
@@ -300,7 +304,7 @@ static Rune ApplyFold(CaseFold *f, Rune r) {
 //
 //   CycleFoldRune('?') = '?'
 Rune CycleFoldRune(Rune r) {
-  CaseFold* f = LookupCaseFold(r);
+  CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, r);
   if (f == NULL || r < f->lo)
     return r;
   return ApplyFold(f, r);
@@ -323,7 +327,7 @@ static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) {
     return;
 
   while (lo <= hi) {
-    CaseFold* f = LookupCaseFold(lo);
+    CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, lo);
     if (f == NULL)  // lo has no fold, nor does anything above lo
       break;
     if (lo < f->lo) {  // lo has no fold; next rune with a fold is f->lo
@@ -1275,8 +1279,7 @@ static bool ParseEscape(StringPiece* s, Rune* rp,
     case '0':
       // consume up to three octal digits; already have one.
       code = c - '0';
-      c = (*s)[0];
-      if (s->size() > 0 && '0' <= c && c <= '7') {
+      if (s->size() > 0 && '0' <= (c = (*s)[0]) && c <= '7') {
         code = code * 8 + c - '0';
         s->remove_prefix(1);  // digit
         if (s->size() > 0) {
@@ -1360,7 +1363,7 @@ static bool ParseEscape(StringPiece* s, Rune* rp,
     // in Perl, \b means word-boundary but [\b]
     // means backspace.  We don't support that:
     // if you want a backspace embed a literal
-    // backspace character or use \008.
+    // backspace character or use \x08.
     //
     // case 'b':
     //   *rp = '\b';
diff --git a/re2/prefilter.cc b/re2/prefilter.cc
index 93aea42..1367315 100644
--- a/re2/prefilter.cc
+++ b/re2/prefilter.cc
@@ -5,6 +5,7 @@
 #include "util/util.h"
 #include "re2/prefilter.h"
 #include "re2/re2.h"
+#include "re2/unicode_casefold.h"
 #include "re2/walker-inl.h"
 
 namespace re2 {
@@ -167,17 +168,22 @@ Prefilter* Prefilter::OrStrings(set<string>* ss) {
   return or_prefilter;
 }
 
+static Rune ToLowerRune(Rune r) {
+  if (r < Runeself) {
+    if ('A' <= r && r <= 'Z')
+      r += 'a' - 'A';
+    return r;
+  }
+
+  CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
+  if (f == NULL)
+    return r;
+  return ApplyFold(f, r);
+}
+
 Prefilter* Prefilter::FromString(const string& str) {
   Prefilter* m = new Prefilter(Prefilter::ATOM);
-  string s = str;
-  for (int i = 0; i < s.size(); i++) {
-    int c = s[i];
-    if ('A' <= c && c <= 'Z') {
-      c += 'a' - 'A';
-      s[i] = c;
-    }
-  }
-  m->atom_ = s;
+  m->atom_ = str;
   return m;
 }
 
@@ -270,7 +276,7 @@ string Prefilter::Info::ToString() {
     }
     return s;
   }
-  
+
   if (match_)
     return match_->DebugString();
 
@@ -387,7 +393,7 @@ static string RuneToString(Rune r) {
 // Constructs Info for literal rune.
 Prefilter::Info* Prefilter::Info::Literal(Rune r) {
   Info* info = new Info();
-  info->exact_.insert(RuneToString(r));
+  info->exact_.insert(RuneToString(ToLowerRune(r)));
   info->is_exact_ = true;
   return info;
 }
@@ -440,7 +446,7 @@ Prefilter::Info* Prefilter::Info::CClass(CharClass *cc) {
   Prefilter::Info *a = new Prefilter::Info();
   for (CCIter i = cc->begin(); i != cc->end(); ++i)
     for (Rune r = i->lo; r <= i->hi; r++)
-      a->exact_.insert(RuneToString(r));
+      a->exact_.insert(RuneToString(ToLowerRune(r)));
 
   a->is_exact_ = true;
 
diff --git a/re2/re2.cc b/re2/re2.cc
index 5b0a11a..e19692f 100644
--- a/re2/re2.cc
+++ b/re2/re2.cc
@@ -509,12 +509,12 @@ bool RE2::Match(const StringPiece& text,
       LOG(ERROR) << "Invalid RE2: " << *error_;
     return false;
   }
-  
+
   if (startpos < 0 || startpos > endpos || endpos > text.size()) {
     LOG(ERROR) << "RE2: invalid startpos, endpos pair.";
     return false;
   }
-
+  
   StringPiece subtext = text;
   subtext.remove_prefix(startpos);
   subtext.remove_suffix(text.size() - endpos);
@@ -941,32 +941,53 @@ bool RE2::Arg::parse_uchar(const char* str, int n, void* dest) {
 static const int kMaxNumberLength = 32;
 
 // REQUIRES "buf" must have length at least kMaxNumberLength+1
-// REQUIRES "n > 0"
-// Copies "str" into "buf" and null-terminates if necessary.
-// Returns one of:
-//      a. "str" if no termination is needed
-//      b. "buf" if the string was copied and null-terminated
-//      c. "" if the input was invalid and has no hope of being parsed
-static const char* TerminateNumber(char* buf, const char* str, int n) {
-  if ((n > 0) && isspace(*str)) {
+// Copies "str" into "buf" and null-terminates.
+// Overwrites *np with the new length.
+static const char* TerminateNumber(char* buf, const char* str, int* np) {
+  int n = *np;
+  if (n <= 0) return "";
+  if (n > 0 && isspace(*str)) {
     // We are less forgiving than the strtoxxx() routines and do not
     // allow leading spaces.
     return "";
   }
 
-  // See if the character right after the input text may potentially
-  // look like a digit.
-  if (isdigit(str[n]) ||
-      ((str[n] >= 'a') && (str[n] <= 'f')) ||
-      ((str[n] >= 'A') && (str[n] <= 'F'))) {
-    if (n > kMaxNumberLength) return ""; // Input too big to be a valid number
-    memcpy(buf, str, n);
-    buf[n] = '\0';
-    return buf;
-  } else {
-    // We can parse right out of the supplied string, so return it.
-    return str;
+  // Although buf has a fixed maximum size, we can still handle
+  // arbitrarily large integers correctly by omitting leading zeros.
+  // (Numbers that are still too long will be out of range.)
+  // Before deciding whether str is too long,
+  // remove leading zeros with s/000+/00/.
+  // Leaving the leading two zeros in place means that
+  // we don't change 0000x123 (invalid) into 0x123 (valid).
+  // Skip over leading - before replacing.
+  bool neg = false;
+  if (n >= 1 && str[0] == '-') {
+    neg = true;
+    n--;
+    str++;
+  }
+
+  if (n >= 3 && str[0] == '0' && str[1] == '0') {
+    while (n >= 3 && str[2] == '0') {
+      n--;
+      str++;
+    }
+  }
+
+  if (neg) {  // make room in buf for -
+    n++;
+    str--;
+  }
+
+  if (n > kMaxNumberLength) return "";
+
+  memmove(buf, str, n);
+  if (neg) {
+    buf[0] = '-';
   }
+  buf[n] = '\0';
+  *np = n;
+  return buf;
 }
 
 bool RE2::Arg::parse_long_radix(const char* str,
@@ -975,7 +996,7 @@ bool RE2::Arg::parse_long_radix(const char* str,
                                int radix) {
   if (n == 0) return false;
   char buf[kMaxNumberLength+1];
-  str = TerminateNumber(buf, str, n);
+  str = TerminateNumber(buf, str, &n);
   char* end;
   errno = 0;
   long r = strtol(str, &end, radix);
@@ -992,7 +1013,7 @@ bool RE2::Arg::parse_ulong_radix(const char* str,
                                 int radix) {
   if (n == 0) return false;
   char buf[kMaxNumberLength+1];
-  str = TerminateNumber(buf, str, n);
+  str = TerminateNumber(buf, str, &n);
   if (str[0] == '-') {
    // strtoul() will silently accept negative numbers and parse
    // them.  This module is more strict and treats them as errors.
@@ -1063,7 +1084,7 @@ bool RE2::Arg::parse_longlong_radix(const char* str,
                                    int radix) {
   if (n == 0) return false;
   char buf[kMaxNumberLength+1];
-  str = TerminateNumber(buf, str, n);
+  str = TerminateNumber(buf, str, &n);
   char* end;
   errno = 0;
   int64 r = strtoll(str, &end, radix);
@@ -1080,7 +1101,7 @@ bool RE2::Arg::parse_ulonglong_radix(const char* str,
                                     int radix) {
   if (n == 0) return false;
   char buf[kMaxNumberLength+1];
-  str = TerminateNumber(buf, str, n);
+  str = TerminateNumber(buf, str, &n);
   if (str[0] == '-') {
     // strtoull() will silently accept negative numbers and parse
     // them.  This module is more strict and treats them as errors.
@@ -1096,7 +1117,7 @@ bool RE2::Arg::parse_ulonglong_radix(const char* str,
   return true;
 }
 
-bool RE2::Arg::parse_double(const char* str, int n, void* dest) {
+static bool parse_double_float(const char* str, int n, bool isfloat, void *dest) {
   if (n == 0) return false;
   static const int kMaxLength = 200;
   char buf[kMaxLength];
@@ -1105,20 +1126,29 @@ bool RE2::Arg::parse_double(const char* str, int n, void* dest) {
   buf[n] = '\0';
   errno = 0;
   char* end;
-  double r = strtod(buf, &end);
+  double r;
+  if (isfloat) {
+    r = strtof(buf, &end);
+  } else {
+    r = strtod(buf, &end);
+  }
   if (end != buf + n) return false;   // Leftover junk
   if (errno) return false;
   if (dest == NULL) return true;
-  *(reinterpret_cast<double*>(dest)) = r;
+  if (isfloat) {
+    *(reinterpret_cast<float*>(dest)) = r;
+  } else {
+    *(reinterpret_cast<double*>(dest)) = r;
+  }
   return true;
 }
 
+bool RE2::Arg::parse_double(const char* str, int n, void* dest) {
+  return parse_double_float(str, n, false, dest);
+}
+
 bool RE2::Arg::parse_float(const char* str, int n, void* dest) {
-  double r;
-  if (!parse_double(str, n, &r)) return false;
-  if (dest == NULL) return true;
-  *(reinterpret_cast<float*>(dest)) = static_cast<float>(r);
-  return true;
+  return parse_double_float(str, n, true, dest);
 }
 
 
diff --git a/re2/re2.h b/re2/re2.h
index b1f71e5..9dbc99c 100644
--- a/re2/re2.h
+++ b/re2/re2.h
@@ -17,6 +17,9 @@
 // some of the more complicated things thrown away.  In particular,
 // backreferences and generalized assertions are not available, nor is \Z.
 //
+// See http://code.google.com/p/re2/wiki/Syntax for the syntax
+// supported by RE2, and a comparison with PCRE and PERL regexps.
+//
 // For those not familiar with Perl's regular expressions,
 // here are some examples of the most commonly used extensions:
 //
@@ -478,12 +481,13 @@ class RE2 {
     //   never_nl         (false) never match \n, even if it is in regexp
     //   case_sensitive   (true)  match is case-sensitive (regexp can override
     //                              with (?i) unless in posix_syntax mode)
-    //   perl_classes     (false) allow Perl's \d \s \w \D \S \W when in
-    //                              posix_syntax mode
-    //   word_boundary    (false) allow \b \B (word boundary and not) when in
-    //                              posix_syntax mode
+    //
+    // The following options are only consulted when posix_syntax == true.
+    // (When posix_syntax == false these features are always enabled and
+    // cannot be turned off.)
+    //   perl_classes     (false) allow Perl's \d \s \w \D \S \W
+    //   word_boundary    (false) allow Perl's \b \B (word boundary and not)
     //   one_line         (false) ^ and $ only match beginning and end of text
-    //                              when in posix_syntax mode
     //
     // The max_mem option controls how much memory can be used
     // to hold the compiled form of the regexp (the Prog) and
@@ -738,6 +742,7 @@ class RE2::Arg {
 
 
   MAKE_PARSER(char,               parse_char);
+  MAKE_PARSER(signed char,        parse_char);
   MAKE_PARSER(unsigned char,      parse_uchar);
   MAKE_PARSER(short,              parse_short);
   MAKE_PARSER(unsigned short,     parse_ushort);
diff --git a/re2/testing/filtered_re2_test.cc b/re2/testing/filtered_re2_test.cc
index 2eee57d..7755d30 100644
--- a/re2/testing/filtered_re2_test.cc
+++ b/re2/testing/filtered_re2_test.cc
@@ -99,13 +99,23 @@ AtomTest atom_tests[] = {
       "xbcdea", "xbcdeb",
       "ybcdea", "ybcdeb"
     }
-  }
+  }, {
+    // Test upper/lower of non-ASCII.
+    "UnicodeLower", {
+      "(?i)ΔδΠϖπΣςσ",
+      "ΛΜΝΟΠ",
+      "ψρστυ",
+    }, {
+      "δδπππσσσ",
+      "λμνοπ",
+      "ψρστυ",
+    },
+  },
 };
 
 void AddRegexpsAndCompile(const char* regexps[],
                           int n,
                           struct FilterTestVars* v) {
-  v->opts.set_utf8(false);
   for (int i = 0; i < n; i++) {
     int id;
     v->f.Add(regexps[i], v->opts, &id);
diff --git a/re2/testing/re2_test.cc b/re2/testing/re2_test.cc
index b050680..d9a2567 100644
--- a/re2/testing/re2_test.cc
+++ b/re2/testing/re2_test.cc
@@ -6,10 +6,13 @@
 // TODO: Test extractions for PartialMatch/Consume
 
 #include <sys/types.h>
+#include <sys/mman.h>
 #include <sys/stat.h>
+#include <errno.h>
 #include <vector>
 #include "util/test.h"
 #include "re2/re2.h"
+#include "re2/regexp.h"
 
 DECLARE_bool(logtostderr);
 
@@ -657,8 +660,30 @@ TEST(RE2, FullMatchTypedNullArg) {
   CHECK(!RE2::FullMatch("hello", "(.*)", (float*)NULL));
 }
 
+// Check that numeric parsing code does not read past the end of
+// the number being parsed.
+TEST(RE2, NULTerminated) {
+  char *v;
+  int x;
+
+#ifndef MAP_ANONYMOUS
+#define MAP_ANONYMOUS MAP_ANON
+#endif
+  v = static_cast<char*>(mmap(NULL, 16*1024, PROT_READ|PROT_WRITE,
+                              MAP_ANONYMOUS|MAP_PRIVATE, -1, 0));
+  CHECK(v != reinterpret_cast<char*>(-1));
+  LOG(INFO) << "Memory at " << (void*)v;
+  CHECK_EQ(munmap(v + 8*1024, 8*1024), 0) << " error " << errno;
+  v[8*1024 - 1] = '1';
+
+  x = 0;
+  CHECK(RE2::FullMatch(StringPiece(v + 8*1024 - 1, 1), "(.*)", &x));
+  CHECK_EQ(x, 1);
+}
+
 TEST(RE2, FullMatchTypeTests) {
   // Type tests
+  string zeros(100, '0');
   {
     char c;
     CHECK(RE2::FullMatch("Hello", "(H)ello", &c));
@@ -687,74 +712,120 @@ TEST(RE2, FullMatchTypeTests) {
   }
   {
     int32 v;
-    static const int32 max_value = 0x7fffffff;
-    static const int32 min_value = -max_value - 1;
-    CHECK(RE2::FullMatch("100",         "(-?\\d+)",&v)); CHECK_EQ(v, 100);
-    CHECK(RE2::FullMatch("-100",        "(-?\\d+)",&v)); CHECK_EQ(v, -100);
-    CHECK(RE2::FullMatch("2147483647",  "(-?\\d+)",&v)); CHECK_EQ(v, max_value);
-    CHECK(RE2::FullMatch("-2147483648", "(-?\\d+)",&v)); CHECK_EQ(v, min_value);
-    CHECK(!RE2::FullMatch("-2147483649", "(-?\\d+)",&v));
-    CHECK(!RE2::FullMatch("2147483648",  "(-?\\d+)",&v));
+    static const int32 max = 0x7fffffff;
+    static const int32 min = -max - 1;
+    CHECK(RE2::FullMatch("100",          "(-?\\d+)", &v)); CHECK_EQ(v, 100);
+    CHECK(RE2::FullMatch("-100",         "(-?\\d+)", &v)); CHECK_EQ(v, -100);
+    CHECK(RE2::FullMatch("2147483647",   "(-?\\d+)", &v)); CHECK_EQ(v, max);
+    CHECK(RE2::FullMatch("-2147483648",  "(-?\\d+)", &v)); CHECK_EQ(v, min);
+    CHECK(!RE2::FullMatch("-2147483649", "(-?\\d+)", &v));
+    CHECK(!RE2::FullMatch("2147483648",  "(-?\\d+)", &v));
+
+    CHECK(RE2::FullMatch(zeros + "2147483647", "(-?\\d+)", &v));
+    CHECK_EQ(v, max);
+    CHECK(RE2::FullMatch("-" + zeros + "2147483648", "(-?\\d+)", &v));
+    CHECK_EQ(v, min);
+
+    CHECK(!RE2::FullMatch("-" + zeros + "2147483649", "(-?\\d+)", &v));
+    CHECK(RE2::FullMatch("0x7fffffff", "(.*)", RE2::CRadix(&v)));
+    CHECK_EQ(v, max);
+    CHECK(!RE2::FullMatch("000x7fffffff", "(.*)", RE2::CRadix(&v)));
   }
   {
     uint32 v;
-    static const uint32 max_value = 0xfffffffful;
+    static const uint32 max = 0xfffffffful;
     CHECK(RE2::FullMatch("100",         "(\\d+)", &v)); CHECK_EQ(v, 100);
-    CHECK(RE2::FullMatch("4294967295",  "(\\d+)", &v)); CHECK_EQ(v, max_value);
+    CHECK(RE2::FullMatch("4294967295",  "(\\d+)", &v)); CHECK_EQ(v, max);
     CHECK(!RE2::FullMatch("4294967296", "(\\d+)", &v));
+    CHECK(!RE2::FullMatch("-1",         "(\\d+)", &v));
+
+    CHECK(RE2::FullMatch(zeros + "4294967295", "(\\d+)", &v)); CHECK_EQ(v, max);
   }
   {
     int64 v;
-    static const int64 max_value = 0x7fffffffffffffffull;
-    static const int64 min_value = -max_value - 1;
+    static const int64 max = 0x7fffffffffffffffull;
+    static const int64 min = -max - 1;
     char buf[32];
 
-    CHECK(RE2::FullMatch("100",         "(-?\\d+)",&v)); CHECK_EQ(v, 100);
-    CHECK(RE2::FullMatch("-100",        "(-?\\d+)",&v)); CHECK_EQ(v, -100);
+    CHECK(RE2::FullMatch("100",  "(-?\\d+)", &v)); CHECK_EQ(v, 100);
+    CHECK(RE2::FullMatch("-100", "(-?\\d+)", &v)); CHECK_EQ(v, -100);
 
-    snprintf(buf, sizeof(buf), "%lld", (long long)max_value);
-    CHECK(RE2::FullMatch(buf,  "(-?\\d+)",&v)); CHECK_EQ(v, max_value);
+    snprintf(buf, sizeof(buf), "%lld", max);
+    CHECK(RE2::FullMatch(buf,    "(-?\\d+)", &v)); CHECK_EQ(v, max);
 
-    snprintf(buf, sizeof(buf), "%lld", (long long)min_value);
-    CHECK(RE2::FullMatch(buf,  "(-?\\d+)",&v)); CHECK_EQ(v, min_value);
+    snprintf(buf, sizeof(buf), "%lld", min);
+    CHECK(RE2::FullMatch(buf,    "(-?\\d+)", &v)); CHECK_EQ(v, min);
 
-    snprintf(buf, sizeof(buf), "%lld", (long long)max_value);
+    snprintf(buf, sizeof(buf), "%lld", max);
     assert(buf[strlen(buf)-1] != '9');
     buf[strlen(buf)-1]++;
-    CHECK(!RE2::FullMatch(buf, "(-?\\d+)", &v));
+    CHECK(!RE2::FullMatch(buf,   "(-?\\d+)", &v));
 
-    snprintf(buf, sizeof(buf), "%lld", (long long)min_value);
+    snprintf(buf, sizeof(buf), "%lld", min);
     assert(buf[strlen(buf)-1] != '9');
     buf[strlen(buf)-1]++;
-    CHECK(!RE2::FullMatch(buf, "(-?\\d+)", &v));
+    CHECK(!RE2::FullMatch(buf,   "(-?\\d+)", &v));
   }
   {
     uint64 v;
     int64 v2;
-    static const uint64 max_value = 0xffffffffffffffffull;
+    static const uint64 max = 0xffffffffffffffffull;
     char buf[32];
 
-    CHECK(RE2::FullMatch("100",         "(-?\\d+)",&v)); CHECK_EQ(v, 100);
-    CHECK(RE2::FullMatch("-100",        "(-?\\d+)",&v2)); CHECK_EQ(v2, -100);
+    CHECK(RE2::FullMatch("100",  "(-?\\d+)", &v));  CHECK_EQ(v, 100);
+    CHECK(RE2::FullMatch("-100", "(-?\\d+)", &v2)); CHECK_EQ(v2, -100);
 
-    snprintf(buf, sizeof(buf), "%llu", (unsigned long long)max_value);
-    CHECK(RE2::FullMatch(buf,  "(-?\\d+)",&v)); CHECK_EQ(v, max_value);
+    snprintf(buf, sizeof(buf), "%llu", max);
+    CHECK(RE2::FullMatch(buf,    "(-?\\d+)", &v)); CHECK_EQ(v, max);
 
     assert(buf[strlen(buf)-1] != '9');
     buf[strlen(buf)-1]++;
-    CHECK(!RE2::FullMatch(buf, "(-?\\d+)", &v));
+    CHECK(!RE2::FullMatch(buf,   "(-?\\d+)", &v));
   }
+}
+
+TEST(RE2, FloatingPointFullMatchTypes) {
+  string zeros(100, '0');
   {
     float v;
-    CHECK(RE2::FullMatch("100",   "(.*)", &v));
-    CHECK(RE2::FullMatch("-100.", "(.*)", &v));
-    CHECK(RE2::FullMatch("1e23",  "(.*)", &v));
+    CHECK(RE2::FullMatch("100",   "(.*)", &v));  CHECK_EQ(v, 100);
+    CHECK(RE2::FullMatch("-100.", "(.*)", &v));  CHECK_EQ(v, -100);
+    CHECK(RE2::FullMatch("1e23",  "(.*)", &v));  CHECK_EQ(v, float(1e23));
+
+    CHECK(RE2::FullMatch(zeros + "1e23",  "(.*)", &v));
+    CHECK_EQ(v, float(1e23));
+
+    // 6700000000081920.1 is an edge case.
+    // 6700000000081920 is exactly halfway between
+    // two float32s, so the .1 should make it round up.
+    // However, the .1 is outside the precision possible with
+    // a float64: the nearest float64 is 6700000000081920.
+    // So if the code uses strtod and then converts to float32,
+    // round-to-even will make it round down instead of up.
+    // To pass the test, the parser must call strtof directly.
+    // This test case is carefully chosen to use only a 17-digit
+    // number, since C does not guarantee to get the correctly
+    // rounded answer for strtod and strtof unless the input is
+    // short.
+    CHECK(RE2::FullMatch("0.1", "(.*)", &v));
+    CHECK_EQ(v, 0.1f) << StringPrintf("%.8g != %.8g", v, 0.1f);
+    CHECK(RE2::FullMatch("6700000000081920.1", "(.*)", &v));
+    CHECK_EQ(v, 6700000000081920.1f)
+      << StringPrintf("%.8g != %.8g", v, 6700000000081920.1f);
   }
   {
     double v;
-    CHECK(RE2::FullMatch("100",   "(.*)", &v));
-    CHECK(RE2::FullMatch("-100.", "(.*)", &v));
-    CHECK(RE2::FullMatch("1e23",  "(.*)", &v));
+    CHECK(RE2::FullMatch("100",   "(.*)", &v));  CHECK_EQ(v, 100);
+    CHECK(RE2::FullMatch("-100.", "(.*)", &v));  CHECK_EQ(v, -100);
+    CHECK(RE2::FullMatch("1e23",  "(.*)", &v));  CHECK_EQ(v, 1e23);
+    CHECK(RE2::FullMatch(zeros + "1e23", "(.*)", &v));
+    CHECK_EQ(v, double(1e23));
+
+    CHECK(RE2::FullMatch("0.1", "(.*)", &v));
+    CHECK_EQ(v, 0.1) << StringPrintf("%.17g != %.17g", v, 0.1);
+    CHECK(RE2::FullMatch("1.00000005960464485", "(.*)", &v));
+    CHECK_EQ(v, 1.0000000596046448)
+      << StringPrintf("%.17g != %.17g", v, 1.0000000596046448);
   }
 }
 
@@ -1184,8 +1255,7 @@ TEST(RE2, BitstateCaptureBug) {
   opt.set_max_mem(20000);
   RE2 re("(_________$)", opt);
   StringPiece s = "xxxxxxxxxxxxxxxxxxxxxxxxxx_________x";
-  EXPECT_FALSE(re.Match(
-    s, 0, s.size(), RE2::UNANCHORED, NULL, 0));
+  EXPECT_FALSE(re.Match(s, 0, s.size(), RE2::UNANCHORED, NULL, 0));
 }
 
 // C++ version of bug 609710.
diff --git a/re2/unicode.py b/re2/unicode.py
index a7fb778..8d78312 100755
--- a/re2/unicode.py
+++ b/re2/unicode.py
@@ -247,7 +247,7 @@ def CaseGroups(unicode_dir=_UNICODE_DIR):
   for g in groups:
     g.sort()
   groups.sort()
-  return groups
+  return togroup, groups
 
 
 def Scripts(unicode_dir=_UNICODE_DIR):
diff --git a/re2/unicode_casefold.cc b/re2/unicode_casefold.cc
index cb35f7a..6d4e878 100644
--- a/re2/unicode_casefold.cc
+++ b/re2/unicode_casefold.cc
@@ -294,6 +294,175 @@ CaseFold unicode_casefold[] = {
 };
 int num_unicode_casefold = 282;
 
+// 1029 groups, 1050 pairs, 163 ranges
+CaseFold unicode_tolower[] = {
+	{ 65, 90, 32 },
+	{ 181, 181, 775 },
+	{ 192, 214, 32 },
+	{ 216, 222, 32 },
+	{ 256, 302, EvenOddSkip },
+	{ 306, 310, EvenOddSkip },
+	{ 313, 327, OddEvenSkip },
+	{ 330, 374, EvenOddSkip },
+	{ 376, 376, -121 },
+	{ 377, 381, OddEvenSkip },
+	{ 383, 383, -268 },
+	{ 385, 385, 210 },
+	{ 386, 388, EvenOddSkip },
+	{ 390, 390, 206 },
+	{ 391, 391, OddEven },
+	{ 393, 394, 205 },
+	{ 395, 395, OddEven },
+	{ 398, 398, 79 },
+	{ 399, 399, 202 },
+	{ 400, 400, 203 },
+	{ 401, 401, OddEven },
+	{ 403, 403, 205 },
+	{ 404, 404, 207 },
+	{ 406, 406, 211 },
+	{ 407, 407, 209 },
+	{ 408, 408, EvenOdd },
+	{ 412, 412, 211 },
+	{ 413, 413, 213 },
+	{ 415, 415, 214 },
+	{ 416, 420, EvenOddSkip },
+	{ 422, 422, 218 },
+	{ 423, 423, OddEven },
+	{ 425, 425, 218 },
+	{ 428, 428, EvenOdd },
+	{ 430, 430, 218 },
+	{ 431, 431, OddEven },
+	{ 433, 434, 217 },
+	{ 435, 437, OddEvenSkip },
+	{ 439, 439, 219 },
+	{ 440, 440, EvenOdd },
+	{ 444, 444, EvenOdd },
+	{ 452, 452, 2 },
+	{ 453, 453, OddEven },
+	{ 455, 455, 2 },
+	{ 456, 456, EvenOdd },
+	{ 458, 458, 2 },
+	{ 459, 475, OddEvenSkip },
+	{ 478, 494, EvenOddSkip },
+	{ 497, 497, 2 },
+	{ 498, 500, EvenOddSkip },
+	{ 502, 502, -97 },
+	{ 503, 503, -56 },
+	{ 504, 542, EvenOddSkip },
+	{ 544, 544, -130 },
+	{ 546, 562, EvenOddSkip },
+	{ 570, 570, 10795 },
+	{ 571, 571, OddEven },
+	{ 573, 573, -163 },
+	{ 574, 574, 10792 },
+	{ 577, 577, OddEven },
+	{ 579, 579, -195 },
+	{ 580, 580, 69 },
+	{ 581, 581, 71 },
+	{ 582, 590, EvenOddSkip },
+	{ 837, 837, 116 },
+	{ 880, 882, EvenOddSkip },
+	{ 886, 886, EvenOdd },
+	{ 902, 902, 38 },
+	{ 904, 906, 37 },
+	{ 908, 908, 64 },
+	{ 910, 911, 63 },
+	{ 913, 929, 32 },
+	{ 931, 939, 32 },
+	{ 962, 962, EvenOdd },
+	{ 975, 975, 8 },
+	{ 976, 976, -30 },
+	{ 977, 977, -25 },
+	{ 981, 981, -15 },
+	{ 982, 982, -22 },
+	{ 984, 1006, EvenOddSkip },
+	{ 1008, 1008, -54 },
+	{ 1009, 1009, -48 },
+	{ 1012, 1012, -60 },
+	{ 1013, 1013, -64 },
+	{ 1015, 1015, OddEven },
+	{ 1017, 1017, -7 },
+	{ 1018, 1018, EvenOdd },
+	{ 1021, 1023, -130 },
+	{ 1024, 1039, 80 },
+	{ 1040, 1071, 32 },
+	{ 1120, 1152, EvenOddSkip },
+	{ 1162, 1214, EvenOddSkip },
+	{ 1216, 1216, 15 },
+	{ 1217, 1229, OddEvenSkip },
+	{ 1232, 1318, EvenOddSkip },
+	{ 1329, 1366, 48 },
+	{ 4256, 4293, 7264 },
+	{ 7680, 7828, EvenOddSkip },
+	{ 7835, 7835, -58 },
+	{ 7838, 7838, -7615 },
+	{ 7840, 7934, EvenOddSkip },
+	{ 7944, 7951, -8 },
+	{ 7960, 7965, -8 },
+	{ 7976, 7983, -8 },
+	{ 7992, 7999, -8 },
+	{ 8008, 8013, -8 },
+	{ 8025, 8025, -8 },
+	{ 8027, 8027, -8 },
+	{ 8029, 8029, -8 },
+	{ 8031, 8031, -8 },
+	{ 8040, 8047, -8 },
+	{ 8072, 8079, -8 },
+	{ 8088, 8095, -8 },
+	{ 8104, 8111, -8 },
+	{ 8120, 8121, -8 },
+	{ 8122, 8123, -74 },
+	{ 8124, 8124, -9 },
+	{ 8126, 8126, -7173 },
+	{ 8136, 8139, -86 },
+	{ 8140, 8140, -9 },
+	{ 8152, 8153, -8 },
+	{ 8154, 8155, -100 },
+	{ 8168, 8169, -8 },
+	{ 8170, 8171, -112 },
+	{ 8172, 8172, -7 },
+	{ 8184, 8185, -128 },
+	{ 8186, 8187, -126 },
+	{ 8188, 8188, -9 },
+	{ 8486, 8486, -7517 },
+	{ 8490, 8490, -8383 },
+	{ 8491, 8491, -8262 },
+	{ 8498, 8498, 28 },
+	{ 8544, 8559, 16 },
+	{ 8579, 8579, OddEven },
+	{ 9398, 9423, 26 },
+	{ 11264, 11310, 48 },
+	{ 11360, 11360, EvenOdd },
+	{ 11362, 11362, -10743 },
+	{ 11363, 11363, -3814 },
+	{ 11364, 11364, -10727 },
+	{ 11367, 11371, OddEvenSkip },
+	{ 11373, 11373, -10780 },
+	{ 11374, 11374, -10749 },
+	{ 11375, 11375, -10783 },
+	{ 11376, 11376, -10782 },
+	{ 11378, 11378, EvenOdd },
+	{ 11381, 11381, OddEven },
+	{ 11390, 11391, -10815 },
+	{ 11392, 11490, EvenOddSkip },
+	{ 11499, 11501, OddEvenSkip },
+	{ 42560, 42604, EvenOddSkip },
+	{ 42624, 42646, EvenOddSkip },
+	{ 42786, 42798, EvenOddSkip },
+	{ 42802, 42862, EvenOddSkip },
+	{ 42873, 42875, OddEvenSkip },
+	{ 42877, 42877, -35332 },
+	{ 42878, 42886, EvenOddSkip },
+	{ 42891, 42891, OddEven },
+	{ 42893, 42893, -42280 },
+	{ 42896, 42896, EvenOdd },
+	{ 42912, 42920, EvenOddSkip },
+	{ 65313, 65338, 32 },
+	{ 66560, 66599, 40 },
+};
+int num_unicode_tolower = 163;
+
+
 
 } // namespace re2
 
diff --git a/re2/unicode_casefold.h b/re2/unicode_casefold.h
index 12744b7..160b07e 100644
--- a/re2/unicode_casefold.h
+++ b/re2/unicode_casefold.h
@@ -45,7 +45,9 @@ namespace re2 {
 
 enum {
   EvenOdd = 1,
-  OddEven = -1
+  OddEven = -1,
+  EvenOddSkip = 1<<30,
+  OddEvenSkip,
 };
 
 struct CaseFold {
@@ -57,10 +59,16 @@ struct CaseFold {
 extern CaseFold unicode_casefold[];
 extern int num_unicode_casefold;
 
+extern CaseFold unicode_tolower[];
+extern int num_unicode_tolower;
+
 // Returns the CaseFold* in the tables that contains rune.
 // If rune is not in the tables, returns the first CaseFold* after rune.
 // If rune is larger than any value in the tables, returns NULL.
-extern CaseFold* LookupCaseFold(uint32 rune);
+extern CaseFold* LookupCaseFold(CaseFold*, int, Rune rune);
+
+// Returns the result of applying the fold f to the rune r.
+extern Rune ApplyFold(CaseFold *f, Rune r);
 
 }  // namespace re2
author	Russ Cox <rsc@swtch.com>	2011-06-21 15:17:24 -0400
committer	Russ Cox <rsc@swtch.com>	2011-06-21 15:17:24 -0400
commit	295ae016889386fec74048b186b95ac61b7f5802 (patch)
tree	b088d52e4a342af46db6cbd673d228a7c5d4e017 /re2
parent	e6ca1cb28c789fc47f501cffe309a04c9da07cfe (diff)
download	regex-re2-295ae016889386fec74048b186b95ac61b7f5802.tar.gz