1 files changed, 1181 insertions, 0 deletions
diff --git a/re2/re2.cc b/re2/re2.cc
new file mode 100644
index 0000000..448f28e
--- /dev/null
+++ b/re2/re2.cc
@@ -0,0 +1,1181 @@
+// Copyright 2003-2009 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Regular expression interface RE2.
+//
+// Originally the PCRE C++ wrapper, but adapted to use
+// the new automata-based regular expression engines.
+
+#include "re2/re2.h"
+
+#include <stdio.h>
+#include <ctype.h>
+#include <string>
+#include <pthread.h>
+#include <errno.h>
+#include "util/util.h"
+#include "util/flags.h"
+#include "re2/prog.h"
+#include "re2/regexp.h"
+
+DEFINE_bool(trace_re2, false, "trace RE2 execution");
+
+namespace re2 {
+
+// Maximum number of args we can set
+static const int kMaxArgs = 16;
+static const int kVecSize = 1+kMaxArgs;
+
+const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::FullMatchN> RE2::FullMatch;
+const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::PartialMatchN> RE2::PartialMatch;
+const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::ConsumeN> RE2::Consume;
+const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::FindAndConsumeN> RE2::FindAndConsume;
+
+const int RE2::Options::kDefaultMaxMem;  // initialized in re2.h
+
+// Commonly-used option sets; arguments to constructor are:
+//   utf8 input
+//   posix syntax
+//   longest match
+//   log errors
+const RE2::Options RE2::DefaultOptions;  // EncodingUTF8, false, false, true
+const RE2::Options RE2::Latin1(RE2::Options::EncodingLatin1, false, false, true);
+const RE2::Options RE2::POSIX(RE2::Options::EncodingUTF8, true, true, true);
+const RE2::Options RE2::Quiet(RE2::Options::EncodingUTF8, false, false, false);
+
+// If a regular expression has no error, its error_ field points here
+static const string empty_string;
+
+// Converts from Regexp error code to RE2 error code.
+// Maybe some day they will diverge.  In any event, this
+// hides the existence of Regexp from RE2 users.
+static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) {
+  switch (code) {
+    case re2::kRegexpSuccess:
+      return RE2::NoError;
+    case re2::kRegexpInternalError:
+      return RE2::ErrorInternal;
+    case re2::kRegexpBadEscape:
+      return RE2::ErrorBadEscape;
+    case re2::kRegexpBadCharClass:
+      return RE2::ErrorBadCharClass;
+    case re2::kRegexpBadCharRange:
+      return RE2::ErrorBadCharRange;
+    case re2::kRegexpMissingBracket:
+      return RE2::ErrorMissingBracket;
+    case re2::kRegexpMissingParen:
+      return RE2::ErrorMissingParen;
+    case re2::kRegexpTrailingBackslash:
+      return RE2::ErrorTrailingBackslash;
+    case re2::kRegexpRepeatArgument:
+      return RE2::ErrorRepeatArgument;
+    case re2::kRegexpRepeatSize:
+      return RE2::ErrorRepeatSize;
+    case re2::kRegexpRepeatOp:
+      return RE2::ErrorRepeatOp;
+    case re2::kRegexpBadPerlOp:
+      return RE2::ErrorBadPerlOp;
+    case re2::kRegexpBadUTF8:
+      return RE2::ErrorBadUTF8;
+    case re2::kRegexpBadNamedCapture:
+      return RE2::ErrorBadNamedCapture;
+  }
+  return RE2::ErrorInternal;
+}
+
+static string trunc(const StringPiece& pattern) {
+  if (pattern.size() < 100)
+    return pattern.as_string();
+  return pattern.substr(0, 100).as_string() + "...";
+}
+
+
+RE2::RE2(const char* pattern) {
+  Init(pattern, DefaultOptions);
+}
+
+RE2::RE2(const string& pattern) {
+  Init(pattern, DefaultOptions);
+}
+
+RE2::RE2(const StringPiece& pattern) {
+  Init(pattern, DefaultOptions);
+}
+
+RE2::RE2(const StringPiece& pattern, const Options& options) {
+  Init(pattern, options);
+}
+
+int RE2::Options::ParseFlags() const {
+  int flags = Regexp::ClassNL;
+  switch (encoding()) {
+    default:
+      LOG(ERROR) << "Unknown encoding " << encoding();
+      break;
+    case RE2::Options::EncodingUTF8:
+      break;
+    case RE2::Options::EncodingLatin1:
+      flags |= Regexp::Latin1;
+      break;
+  }
+
+  if (!posix_syntax())
+    flags |= Regexp::LikePerl;
+
+  if (literal())
+    flags |= Regexp::Literal;
+
+  if (never_nl())
+    flags |= Regexp::NeverNL;
+
+  if (!case_sensitive())
+    flags |= Regexp::FoldCase;
+
+  if (perl_classes())
+    flags |= Regexp::PerlClasses;
+
+  if (word_boundary())
+    flags |= Regexp::PerlB;
+
+  if (one_line())
+    flags |= Regexp::OneLine;
+
+  return flags;
+}
+
+void RE2::Init(const StringPiece& pattern, const Options& options) {
+  mutex_ = new Mutex;
+  pattern_ = pattern.as_string();
+  options_.Copy(options);
+  error_ = &empty_string;
+  error_code_ = NoError;
+  suffix_regexp_ = NULL;
+  entire_regexp_ = NULL;
+  prog_ = NULL;
+  rprog_ = NULL;
+  named_groups_ = NULL;
+  group_names_ = NULL;
+  num_captures_ = -1;
+
+  RegexpStatus status;
+  entire_regexp_ = Regexp::Parse(
+    pattern_,
+    static_cast<Regexp::ParseFlags>(options_.ParseFlags()),
+    &status);
+  if (entire_regexp_ == NULL) {
+    if (error_ == &empty_string)
+      error_ = new string(status.Text());
+    if (options_.log_errors()) {
+      LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': "
+                 << status.Text();
+    }
+    error_arg_ = status.error_arg().as_string();
+    error_code_ = RegexpErrorToRE2(status.code());
+    return;
+  }
+
+  prefix_.clear();
+  prefix_foldcase_ = false;
+  re2::Regexp* suffix;
+  if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix))
+    suffix_regexp_ = suffix;
+  else
+    suffix_regexp_ = entire_regexp_->Incref();
+
+  // Two thirds of the memory goes to the forward Prog,
+  // one third to the reverse prog, because the forward
+  // Prog has two DFAs but the reverse prog has one.
+  prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3);
+  if (prog_ == NULL) {
+    if (options_.log_errors())
+      LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'";
+    error_ = new string("pattern too large - compile failed");
+    error_code_ = RE2::ErrorPatternTooLarge;
+    return;
+  }
+
+  // Could delay this until the first match call that
+  // cares about submatch information, but the one-pass
+  // machine's memory gets cut from the DFA memory budget,
+  // and that is harder to do if the DFA has already
+  // been built.
+  is_one_pass_ = prog_->IsOnePass();
+}
+
+// Returns rprog_, computing it if needed.
+re2::Prog* RE2::ReverseProg() const {
+  MutexLock l(mutex_);
+  if (rprog_ == NULL && error_ == &empty_string) {
+    rprog_ = suffix_regexp_->CompileToReverseProg(options_.max_mem()/3);
+    if (rprog_ == NULL) {
+      if (options_.log_errors())
+        LOG(ERROR) << "Error reverse compiling '" << trunc(pattern_) << "'";
+      error_ = new string("pattern too large - reverse compile failed");
+      error_code_ = RE2::ErrorPatternTooLarge;
+      return NULL;
+    }
+  }
+  return rprog_;
+}
+
+static const map<string, int> empty_named_groups;
+static const map<int, string> empty_group_names;
+
+RE2::~RE2() {
+  if (suffix_regexp_)
+    suffix_regexp_->Decref();
+  if (entire_regexp_)
+    entire_regexp_->Decref();
+  delete mutex_;
+  delete prog_;
+  delete rprog_;
+  if (error_ != &empty_string)
+    delete error_;
+  if (named_groups_ != NULL && named_groups_ != &empty_named_groups)
+    delete named_groups_;
+  if (group_names_ != NULL &&  group_names_ != &empty_group_names)
+    delete group_names_;
+}
+
+int RE2::ProgramSize() const {
+  if (prog_ == NULL)
+    return -1;
+  return prog_->size();
+}
+
+// Returns named_groups_, computing it if needed.
+const map<string, int>&  RE2::NamedCapturingGroups() const {
+  MutexLock l(mutex_);
+  if (!ok())
+    return empty_named_groups;
+  if (named_groups_ == NULL) {
+    named_groups_ = suffix_regexp_->NamedCaptures();
+    if (named_groups_ == NULL)
+      named_groups_ = &empty_named_groups;
+  }
+  return *named_groups_;
+}
+
+// Returns group_names_, computing it if needed.
+const map<int, string>&  RE2::CapturingGroupNames() const {
+  MutexLock l(mutex_);
+  if (!ok())
+    return empty_group_names;
+  if (group_names_ == NULL) {
+    group_names_ = suffix_regexp_->CaptureNames();
+    if (group_names_ == NULL)
+      group_names_ = &empty_group_names;
+  }
+  return *group_names_;
+}
+
+/***** Convenience interfaces *****/
+
+bool RE2::FullMatchN(const StringPiece& text, const RE2& re,
+                     const Arg* const args[], int n) {
+  return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n);
+}
+
+bool RE2::PartialMatchN(const StringPiece& text, const RE2& re,
+                        const Arg* const args[], int n) {
+  return re.DoMatch(text, UNANCHORED, NULL, args, n);
+}
+
+bool RE2::ConsumeN(StringPiece* input, const RE2& re,
+                   const Arg* const args[], int n) {
+  int consumed;
+  if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) {
+    input->remove_prefix(consumed);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re,
+                          const Arg* const args[], int n) {
+  int consumed;
+  if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) {
+    input->remove_prefix(consumed);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+// Returns the maximum submatch needed for the rewrite to be done by Replace().
+// E.g. if rewrite == "foo \\2,\\1", returns 2.
+static int MaxSubmatch(const StringPiece& rewrite) {
+  int max = 0;
+  for (const char *s = rewrite.data(), *end = s + rewrite.size();
+       s < end; s++) {
+    if (*s == '\\') {
+      s++;
+      int c = (s < end) ? *s : -1;
+      if (isdigit(c)) {
+        int n = (c - '0');
+        if (n > max)
+          max = n;
+      }
+    }
+  }
+  return max;
+}
+
+bool RE2::Replace(string *str,
+                 const RE2& re,
+                 const StringPiece& rewrite) {
+  StringPiece vec[kVecSize];
+  int nvec = 1 + MaxSubmatch(rewrite);
+  if (nvec > arraysize(vec))
+    return false;
+  if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec))
+    return false;
+
+  string s;
+  if (!re.Rewrite(&s, rewrite, vec, nvec))
+    return false;
+
+  assert(vec[0].begin() >= str->data());
+  assert(vec[0].end() <= str->data()+str->size());
+  str->replace(vec[0].data() - str->data(), vec[0].size(), s);
+  return true;
+}
+
+int RE2::GlobalReplace(string *str,
+                      const RE2& re,
+                      const StringPiece& rewrite) {
+  StringPiece vec[kVecSize];
+  int nvec = 1 + MaxSubmatch(rewrite);
+  if (nvec > arraysize(vec))
+    return false;
+
+  const char* p = str->data();
+  const char* ep = p + str->size();
+  const char* lastend = NULL;
+  string out;
+  int count = 0;
+  while (p <= ep) {
+    if (!re.Match(*str, p - str->data(), str->size(), UNANCHORED, vec, nvec))
+      break;
+    if (p < vec[0].begin())
+      out.append(p, vec[0].begin() - p);
+    if (vec[0].begin() == lastend && vec[0].size() == 0) {
+      // Disallow empty match at end of last match: skip ahead.
+      if (p < ep)
+        out.append(p, 1);
+      p++;
+      continue;
+    }
+    re.Rewrite(&out, rewrite, vec, nvec);
+    p = vec[0].end();
+    lastend = p;
+    count++;
+  }
+
+  if (count == 0)
+    return 0;
+
+  if (p < ep)
+    out.append(p, ep - p);
+  swap(out, *str);
+  return count;
+}
+
+bool RE2::Extract(const StringPiece &text,
+                 const RE2& re,
+                 const StringPiece &rewrite,
+                 string *out) {
+  StringPiece vec[kVecSize];
+  int nvec = 1 + MaxSubmatch(rewrite);
+  if (nvec > arraysize(vec))
+    return false;
+
+  if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec))
+    return false;
+
+  out->clear();
+  return re.Rewrite(out, rewrite, vec, nvec);
+}
+
+string RE2::QuoteMeta(const StringPiece& unquoted) {
+  string result;
+  result.reserve(unquoted.size() << 1);
+
+  // Escape any ascii character not in [A-Za-z_0-9].
+  //
+  // Note that it's legal to escape a character even if it has no
+  // special meaning in a regular expression -- so this function does
+  // that.  (This also makes it identical to the perl function of the
+  // same name except for the null-character special case;
+  // see `perldoc -f quotemeta`.)
+  for (int ii = 0; ii < unquoted.length(); ++ii) {
+    // Note that using 'isalnum' here raises the benchmark time from
+    // 32ns to 58ns:
+    if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
+        (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
+        (unquoted[ii] < '0' || unquoted[ii] > '9') &&
+        unquoted[ii] != '_' &&
+        // If this is the part of a UTF8 or Latin1 character, we need
+        // to copy this byte without escaping.  Experimentally this is
+        // what works correctly with the regexp library.
+        !(unquoted[ii] & 128)) {
+      if (unquoted[ii] == '\0') {  // Special handling for null chars.
+        // Note that this special handling is not strictly required for RE2,
+        // but this quoting is required for other regexp libraries such as
+        // PCRE.
+        // Can't use "\\0" since the next character might be a digit.
+        result += "\\x00";
+        continue;
+      }
+      result += '\\';
+    }
+    result += unquoted[ii];
+  }
+
+  return result;
+}
+
+bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const {
+  if (prog_ == NULL)
+    return false;
+
+  int n = prefix_.size();
+  if (n > maxlen)
+    n = maxlen;
+
+  // Determine initial min max from prefix_ literal.
+  string pmin, pmax;
+  pmin = prefix_.substr(0, n);
+  pmax = prefix_.substr(0, n);
+  if (prefix_foldcase_) {
+    // prefix is ASCII lowercase; change pmin to uppercase.
+    for (int i = 0; i < n; i++) {
+      if ('a' <= pmin[i] && pmin[i] <= 'z')
+        pmin[i] += 'A' - 'a';
+    }
+  }
+
+  // Add to prefix min max using PossibleMatchRange on regexp.
+  string dmin, dmax;
+  maxlen -= n;
+  if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) {
+    pmin += dmin;
+    pmax += dmax;
+  } else if (pmax.size() > 0) {
+    // prog_->PossibleMatchRange has failed us,
+    // but we still have useful information from prefix_.
+    // Round up pmax to allow any possible suffix.
+    pmax = PrefixSuccessor(pmax);
+  } else {
+    // Nothing useful.
+    *min = "";
+    *max = "";
+    return false;
+  }
+
+  *min = pmin;
+  *max = pmax;
+  return true;
+}
+
+// Avoid possible locale nonsense in standard strcasecmp.
+// The string a is known to be all lowercase.
+static int ascii_strcasecmp(const char* a, const char* b, int len) {
+  const char *ae = a + len;
+
+  for (; a < ae; a++, b++) {
+    uint8 x = *a;
+    uint8 y = *b;
+    if ('A' <= y && y <= 'Z')
+      y += 'a' - 'A';
+    if (x != y)
+      return x - y;
+  }
+  return 0;
+}
+
+
+/***** Actual matching and rewriting code *****/
+
+bool RE2::Match(const StringPiece& text,
+                int startpos,
+                int endpos,
+                Anchor re_anchor,
+                StringPiece* submatch,
+                int nsubmatch) const {
+  if (!ok() || suffix_regexp_ == NULL) {
+    if (options_.log_errors())
+      LOG(ERROR) << "Invalid RE2: " << *error_;
+    return false;
+  }
+
+  if (startpos < 0 || startpos > endpos || endpos > text.size()) {
+    LOG(ERROR) << "RE2: invalid startpos, endpos pair.";
+    return false;
+  }
+  
+  StringPiece subtext = text;
+  subtext.remove_prefix(startpos);
+  subtext.remove_suffix(text.size() - endpos);
+
+  // Use DFAs to find exact location of match, filter out non-matches.
+
+  // Don't ask for the location if we won't use it.
+  // SearchDFA can do extra optimizations in that case.
+  StringPiece match;
+  StringPiece* matchp = &match;
+  if (nsubmatch == 0)
+    matchp = NULL;
+
+  int ncap = 1 + NumberOfCapturingGroups();
+  if (ncap > nsubmatch)
+    ncap = nsubmatch;
+
+  // If the regexp is anchored explicitly, must not be in middle of text.
+  if (prog_->anchor_start() && startpos != 0)
+    return false;
+
+  // If the regexp is anchored explicitly, update re_anchor
+  // so that we can potentially fall into a faster case below.
+  if (prog_->anchor_start() && prog_->anchor_end())
+    re_anchor = ANCHOR_BOTH;
+  else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH)
+    re_anchor = ANCHOR_START;
+
+  // Check for the required prefix, if any.
+  int prefixlen = 0;
+  if (!prefix_.empty()) {
+    if (startpos != 0)
+      return false;
+    prefixlen = prefix_.size();
+    if (prefixlen > subtext.size())
+      return false;
+    if (prefix_foldcase_) {
+      if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0)
+        return false;
+    } else {
+      if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0)
+        return false;
+    }
+    subtext.remove_prefix(prefixlen);
+    // If there is a required prefix, the anchor must be at least ANCHOR_START.
+    if (re_anchor != ANCHOR_BOTH)
+      re_anchor = ANCHOR_START;
+  }
+
+  Prog::Anchor anchor = Prog::kUnanchored;
+  Prog::MatchKind kind = Prog::kFirstMatch;
+  if (options_.longest_match())
+    kind = Prog::kLongestMatch;
+  bool skipped_test = false;
+
+  bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture);
+
+  // SearchBitState allocates a bit vector of size prog_->size() * text.size().
+  // It also allocates a stack of 3-word structures which could potentially
+  // grow as large as prog_->size() * text.size() but in practice is much
+  // smaller.
+  // Conditions for using SearchBitState:
+  const int MaxBitStateProg = 500;   // prog_->size() <= Max.
+  const int MaxBitStateVector = 256*1024;  // bit vector size <= Max (bits)
+  bool can_bit_state = prog_->size() <= MaxBitStateProg;
+  int bit_state_text_max = MaxBitStateVector / prog_->size();
+
+  bool dfa_failed = false;
+  switch (re_anchor) {
+    default:
+    case UNANCHORED: {
+      if (!prog_->SearchDFA(subtext, text, anchor, kind,
+                            matchp, &dfa_failed, NULL)) {
+        if (dfa_failed) {
+          // Fall back to NFA below.
+          skipped_test = true;
+          if (FLAGS_trace_re2)
+            LOG(INFO) << "Match " << trunc(pattern_)
+                      << " [" << CEscape(subtext) << "]"
+                      << " DFA failed.";
+          break;
+        }
+        if (FLAGS_trace_re2)
+          LOG(INFO) << "Match " << trunc(pattern_)
+                    << " [" << CEscape(subtext) << "]"
+                    << " used DFA - no match.";
+        return false;
+      }
+      if (FLAGS_trace_re2)
+        LOG(INFO) << "Match " << trunc(pattern_)
+                  << " [" << CEscape(subtext) << "]"
+                  << " used DFA - match";
+      if (matchp == NULL)  // Matched.  Don't care where
+        return true;
+      // SearchDFA set match[0].end() but didn't know where the
+      // match started.  Run the regexp backward from match[0].end()
+      // to find the longest possible match -- that's where it started.
+      Prog* prog = ReverseProg();
+      if (prog == NULL)
+        return false;
+      if (!prog->SearchDFA(match, text, Prog::kAnchored,
+                           Prog::kLongestMatch, &match, &dfa_failed, NULL)) {
+        if (dfa_failed) {
+          // Fall back to NFA below.
+          skipped_test = true;
+          if (FLAGS_trace_re2)
+            LOG(INFO) << "Match " << trunc(pattern_)
+                      << " [" << CEscape(subtext) << "]"
+                      << " reverse DFA failed.";
+          break;
+        }
+        if (FLAGS_trace_re2)
+          LOG(INFO) << "Match " << trunc(pattern_)
+                    << " [" << CEscape(subtext) << "]"
+                    << " DFA inconsistency.";
+        LOG(ERROR) << "DFA inconsistency";
+        return false;
+      }
+      if (FLAGS_trace_re2)
+        LOG(INFO) << "Match " << trunc(pattern_)
+                  << " [" << CEscape(subtext) << "]"
+                  << " used reverse DFA.";
+      break;
+    }
+
+    case ANCHOR_BOTH:
+    case ANCHOR_START:
+      if (re_anchor == ANCHOR_BOTH)
+        kind = Prog::kFullMatch;
+      anchor = Prog::kAnchored;
+
+      // If only a small amount of text and need submatch
+      // information anyway and we're going to use OnePass or BitState
+      // to get it, we might as well not even bother with the DFA:
+      // OnePass or BitState will be fast enough.
+      // On tiny texts, OnePass outruns even the DFA, and
+      // it doesn't have the shared state and occasional mutex that
+      // the DFA does.
+      if (can_one_pass && text.size() <= 4096 &&
+          (ncap > 1 || text.size() <= 8)) {
+        if (FLAGS_trace_re2)
+          LOG(INFO) << "Match " << trunc(pattern_)
+                    << " [" << CEscape(subtext) << "]"
+                    << " skipping DFA for OnePass.";
+        skipped_test = true;
+        break;
+      }
+      if (can_bit_state && text.size() <= bit_state_text_max && ncap > 1) {
+        if (FLAGS_trace_re2)
+          LOG(INFO) << "Match " << trunc(pattern_)
+                    << " [" << CEscape(subtext) << "]"
+                    << " skipping DFA for BitState.";
+        skipped_test = true;
+        break;
+      }
+      if (!prog_->SearchDFA(subtext, text, anchor, kind,
+                            &match, &dfa_failed, NULL)) {
+        if (dfa_failed) {
+          if (FLAGS_trace_re2)
+            LOG(INFO) << "Match " << trunc(pattern_)
+                      << " [" << CEscape(subtext) << "]"
+                      << " DFA failed.";
+          skipped_test = true;
+          break;
+        }
+        if (FLAGS_trace_re2)
+          LOG(INFO) << "Match " << trunc(pattern_)
+                    << " [" << CEscape(subtext) << "]"
+                    << " used DFA - no match.";
+        return false;
+      }
+      break;
+  }
+
+  if (!skipped_test && ncap <= 1) {
+    // We know exactly where it matches.  That's enough.
+    if (ncap == 1)
+      submatch[0] = match;
+  } else {
+    StringPiece subtext1;
+    if (skipped_test) {
+      // DFA ran out of memory or was skipped:
+      // need to search in entire original text.
+      subtext1 = subtext;
+    } else {
+      // DFA found the exact match location:
+      // let NFA run an anchored, full match search
+      // to find submatch locations.
+      subtext1 = match;
+      anchor = Prog::kAnchored;
+      kind = Prog::kFullMatch;
+    }
+
+    if (can_one_pass && anchor != Prog::kUnanchored) {
+      if (FLAGS_trace_re2)
+        LOG(INFO) << "Match " << trunc(pattern_)
+                  << " [" << CEscape(subtext) << "]"
+                  << " using OnePass.";
+      if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) {
+        if (!skipped_test)
+          LOG(ERROR) << "SearchOnePass inconsistency";
+        return false;
+      }
+    } else if (can_bit_state && subtext1.size() <= bit_state_text_max) {
+      if (FLAGS_trace_re2)
+        LOG(INFO) << "Match " << trunc(pattern_)
+                  << " [" << CEscape(subtext) << "]"
+                  << " using BitState.";
+      if (!prog_->SearchBitState(subtext1, text, anchor,
+                                 kind, submatch, ncap)) {
+        if (!skipped_test)
+          LOG(ERROR) << "SearchBitState inconsistency";
+        return false;
+      }
+    } else {
+      if (FLAGS_trace_re2)
+        LOG(INFO) << "Match " << trunc(pattern_)
+                  << " [" << CEscape(subtext) << "]"
+                  << " using NFA.";
+      if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) {
+        if (!skipped_test)
+          LOG(ERROR) << "SearchNFA inconsistency";
+        return false;
+      }
+    }
+  }
+
+  // Adjust overall match for required prefix that we stripped off.
+  if (prefixlen > 0 && nsubmatch > 0)
+    submatch[0] = StringPiece(submatch[0].begin() - prefixlen,
+                              submatch[0].size() + prefixlen);
+
+  // Zero submatches that don't exist in the regexp.
+  for (int i = ncap; i < nsubmatch; i++)
+    submatch[i] = NULL;
+  return true;
+}
+
+// Internal matcher - like Match() but takes Args not StringPieces.
+bool RE2::DoMatch(const StringPiece& text,
+                  Anchor anchor,
+                  int* consumed,
+                  const Arg* const* args,
+                  int n) const {
+  if (!ok()) {
+    if (options_.log_errors())
+      LOG(ERROR) << "Invalid RE2: " << *error_;
+    return false;
+  }
+
+  // Count number of capture groups needed.
+  int nvec;
+  if (n == 0 && consumed == NULL)
+    nvec = 0;
+  else
+    nvec = n+1;
+
+  StringPiece* vec;
+  StringPiece stkvec[kVecSize];
+  StringPiece* heapvec = NULL;
+
+  if (nvec <= arraysize(stkvec)) {
+    vec = stkvec;
+  } else {
+    vec = new StringPiece[nvec];
+    heapvec = vec;
+  }
+
+  if (!Match(text, 0, text.size(), anchor, vec, nvec)) {
+    delete[] heapvec;
+    return false;
+  }
+
+  if(consumed != NULL)
+    *consumed = vec[0].end() - text.begin();
+
+  if (n == 0 || args == NULL) {
+    // We are not interested in results
+    delete[] heapvec;
+    return true;
+  }
+
+  int ncap = NumberOfCapturingGroups();
+  if (ncap < n) {
+    // RE has fewer capturing groups than number of arg pointers passed in
+    VLOG(1) << "Asked for " << n << " but only have " << ncap;
+    delete[] heapvec;
+    return false;
+  }
+
+  // If we got here, we must have matched the whole pattern.
+  for (int i = 0; i < n; i++) {
+    const StringPiece& s = vec[i+1];
+    if (!args[i]->Parse(s.data(), s.size())) {
+      // TODO: Should we indicate what the error was?
+      VLOG(1) << "Parse error on #" << i << " " << s << " "
+	      << (void*)s.data() << "/" << s.size();
+      delete[] heapvec;
+      return false;
+    }
+  }
+
+  delete[] heapvec;
+  return true;
+}
+
+// Append the "rewrite" string, with backslash subsitutions from "vec",
+// to string "out".
+bool RE2::Rewrite(string *out, const StringPiece &rewrite,
+                 const StringPiece *vec, int veclen) const {
+  for (const char *s = rewrite.data(), *end = s + rewrite.size();
+       s < end; s++) {
+    int c = *s;
+    if (c == '\\') {
+      s++;
+      c = (s < end) ? *s : -1;
+      if (isdigit(c)) {
+        int n = (c - '0');
+        if (n >= veclen) {
+          LOG(ERROR) << "requested group " << n
+                     << " in regexp " << rewrite.data();
+          return false;
+        }
+        StringPiece snip = vec[n];
+        if (snip.size() > 0)
+          out->append(snip.data(), snip.size());
+      } else if (c == '\\') {
+        out->push_back('\\');
+      } else {
+        LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data();
+        return false;
+      }
+    } else {
+      out->push_back(c);
+    }
+  }
+  return true;
+}
+
+// Return the number of capturing subpatterns, or -1 if the
+// regexp wasn't valid on construction.
+int RE2::NumberOfCapturingGroups() const {
+  if (suffix_regexp_ == NULL)
+    return -1;
+  ANNOTATE_BENIGN_RACE(&num_captures_, "benign race: in the worst case"
+    " multiple threads end up doing the same work in parallel.");
+  if (num_captures_ == -1)
+    num_captures_ = suffix_regexp_->NumCaptures();
+  return num_captures_;
+}
+
+// Checks that the rewrite string is well-formed with respect to this
+// regular expression.
+bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const {
+  int max_token = -1;
+  for (const char *s = rewrite.data(), *end = s + rewrite.size();
+       s < end; s++) {
+    int c = *s;
+    if (c != '\\') {
+      continue;
+    }
+    if (++s == end) {
+      *error = "Rewrite schema error: '\\' not allowed at end.";
+      return false;
+    }
+    c = *s;
+    if (c == '\\') {
+      continue;
+    }
+    if (!isdigit(c)) {
+      *error = "Rewrite schema error: "
+               "'\\' must be followed by a digit or '\\'.";
+      return false;
+    }
+    int n = (c - '0');
+    if (max_token < n) {
+      max_token = n;
+    }
+  }
+
+  if (max_token > NumberOfCapturingGroups()) {
+    SStringPrintf(error, "Rewrite schema requests %d matches, "
+                  "but the regexp only has %d parenthesized subexpressions.",
+                  max_token, NumberOfCapturingGroups());
+    return false;
+  }
+  return true;
+}
+
+/***** Parsers for various types *****/
+
+bool RE2::Arg::parse_null(const char* str, int n, void* dest) {
+  // We fail if somebody asked us to store into a non-NULL void* pointer
+  return (dest == NULL);
+}
+
+bool RE2::Arg::parse_string(const char* str, int n, void* dest) {
+  if (dest == NULL) return true;
+  reinterpret_cast<string*>(dest)->assign(str, n);
+  return true;
+}
+
+bool RE2::Arg::parse_stringpiece(const char* str, int n, void* dest) {
+  if (dest == NULL) return true;
+  reinterpret_cast<StringPiece*>(dest)->set(str, n);
+  return true;
+}
+
+bool RE2::Arg::parse_char(const char* str, int n, void* dest) {
+  if (n != 1) return false;
+  if (dest == NULL) return true;
+  *(reinterpret_cast<char*>(dest)) = str[0];
+  return true;
+}
+
+bool RE2::Arg::parse_uchar(const char* str, int n, void* dest) {
+  if (n != 1) return false;
+  if (dest == NULL) return true;
+  *(reinterpret_cast<unsigned char*>(dest)) = str[0];
+  return true;
+}
+
+// Largest number spec that we are willing to parse
+static const int kMaxNumberLength = 32;
+
+// REQUIRES "buf" must have length at least kMaxNumberLength+1
+// Copies "str" into "buf" and null-terminates.
+// Overwrites *np with the new length.
+static const char* TerminateNumber(char* buf, const char* str, int* np) {
+  int n = *np;
+  if (n <= 0) return "";
+  if (n > 0 && isspace(*str)) {
+    // We are less forgiving than the strtoxxx() routines and do not
+    // allow leading spaces.
+    return "";
+  }
+
+  // Although buf has a fixed maximum size, we can still handle
+  // arbitrarily large integers correctly by omitting leading zeros.
+  // (Numbers that are still too long will be out of range.)
+  // Before deciding whether str is too long,
+  // remove leading zeros with s/000+/00/.
+  // Leaving the leading two zeros in place means that
+  // we don't change 0000x123 (invalid) into 0x123 (valid).
+  // Skip over leading - before replacing.
+  bool neg = false;
+  if (n >= 1 && str[0] == '-') {
+    neg = true;
+    n--;
+    str++;
+  }
+
+  if (n >= 3 && str[0] == '0' && str[1] == '0') {
+    while (n >= 3 && str[2] == '0') {
+      n--;
+      str++;
+    }
+  }
+
+  if (neg) {  // make room in buf for -
+    n++;
+    str--;
+  }
+
+  if (n > kMaxNumberLength) return "";
+
+  memmove(buf, str, n);
+  if (neg) {
+    buf[0] = '-';
+  }
+  buf[n] = '\0';
+  *np = n;
+  return buf;
+}
+
+bool RE2::Arg::parse_long_radix(const char* str,
+                               int n,
+                               void* dest,
+                               int radix) {
+  if (n == 0) return false;
+  char buf[kMaxNumberLength+1];
+  str = TerminateNumber(buf, str, &n);
+  char* end;
+  errno = 0;
+  long r = strtol(str, &end, radix);
+  if (end != str + n) return false;   // Leftover junk
+  if (errno) return false;
+  if (dest == NULL) return true;
+  *(reinterpret_cast<long*>(dest)) = r;
+  return true;
+}
+
+bool RE2::Arg::parse_ulong_radix(const char* str,
+                                int n,
+                                void* dest,
+                                int radix) {
+  if (n == 0) return false;
+  char buf[kMaxNumberLength+1];
+  str = TerminateNumber(buf, str, &n);
+  if (str[0] == '-') {
+   // strtoul() will silently accept negative numbers and parse
+   // them.  This module is more strict and treats them as errors.
+   return false;
+  }
+
+  char* end;
+  errno = 0;
+  unsigned long r = strtoul(str, &end, radix);
+  if (end != str + n) return false;   // Leftover junk
+  if (errno) return false;
+  if (dest == NULL) return true;
+  *(reinterpret_cast<unsigned long*>(dest)) = r;
+  return true;
+}
+
+bool RE2::Arg::parse_short_radix(const char* str,
+                                int n,
+                                void* dest,
+                                int radix) {
+  long r;
+  if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
+  if ((short)r != r) return false;       // Out of range
+  if (dest == NULL) return true;
+  *(reinterpret_cast<short*>(dest)) = r;
+  return true;
+}
+
+bool RE2::Arg::parse_ushort_radix(const char* str,
+                                 int n,
+                                 void* dest,
+                                 int radix) {
+  unsigned long r;
+  if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
+  if ((ushort)r != r) return false;                      // Out of range
+  if (dest == NULL) return true;
+  *(reinterpret_cast<unsigned short*>(dest)) = r;
+  return true;
+}
+
+bool RE2::Arg::parse_int_radix(const char* str,
+                              int n,
+                              void* dest,
+                              int radix) {
+  long r;
+  if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
+  if ((int)r != r) return false;         // Out of range
+  if (dest == NULL) return true;
+  *(reinterpret_cast<int*>(dest)) = r;
+  return true;
+}
+
+bool RE2::Arg::parse_uint_radix(const char* str,
+                               int n,
+                               void* dest,
+                               int radix) {
+  unsigned long r;
+  if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
+  if ((uint)r != r) return false;                       // Out of range
+  if (dest == NULL) return true;
+  *(reinterpret_cast<unsigned int*>(dest)) = r;
+  return true;
+}
+
+bool RE2::Arg::parse_longlong_radix(const char* str,
+                                   int n,
+                                   void* dest,
+                                   int radix) {
+  if (n == 0) return false;
+  char buf[kMaxNumberLength+1];
+  str = TerminateNumber(buf, str, &n);
+  char* end;
+  errno = 0;
+  int64 r = strtoll(str, &end, radix);
+  if (end != str + n) return false;   // Leftover junk
+  if (errno) return false;
+  if (dest == NULL) return true;
+  *(reinterpret_cast<int64*>(dest)) = r;
+  return true;
+}
+
+bool RE2::Arg::parse_ulonglong_radix(const char* str,
+                                    int n,
+                                    void* dest,
+                                    int radix) {
+  if (n == 0) return false;
+  char buf[kMaxNumberLength+1];
+  str = TerminateNumber(buf, str, &n);
+  if (str[0] == '-') {
+    // strtoull() will silently accept negative numbers and parse
+    // them.  This module is more strict and treats them as errors.
+    return false;
+  }
+  char* end;
+  errno = 0;
+  uint64 r = strtoull(str, &end, radix);
+  if (end != str + n) return false;   // Leftover junk
+  if (errno) return false;
+  if (dest == NULL) return true;
+  *(reinterpret_cast<uint64*>(dest)) = r;
+  return true;
+}
+
+static bool parse_double_float(const char* str, int n, bool isfloat, void *dest) {
+  if (n == 0) return false;
+  static const int kMaxLength = 200;
+  char buf[kMaxLength];
+  if (n >= kMaxLength) return false;
+  memcpy(buf, str, n);
+  buf[n] = '\0';
+  errno = 0;
+  char* end;
+  double r;
+  if (isfloat) {
+    r = strtof(buf, &end);
+  } else {
+    r = strtod(buf, &end);
+  }
+  if (end != buf + n) return false;   // Leftover junk
+  if (errno) return false;
+  if (dest == NULL) return true;
+  if (isfloat) {
+    *(reinterpret_cast<float*>(dest)) = r;
+  } else {
+    *(reinterpret_cast<double*>(dest)) = r;
+  }
+  return true;
+}
+
+bool RE2::Arg::parse_double(const char* str, int n, void* dest) {
+  return parse_double_float(str, n, false, dest);
+}
+
+bool RE2::Arg::parse_float(const char* str, int n, void* dest) {
+  return parse_double_float(str, n, true, dest);
+}
+
+
+#define DEFINE_INTEGER_PARSERS(name)                                        \
+  bool RE2::Arg::parse_##name(const char* str, int n, void* dest) {          \
+    return parse_##name##_radix(str, n, dest, 10);                          \
+  }                                                                         \
+  bool RE2::Arg::parse_##name##_hex(const char* str, int n, void* dest) {    \
+    return parse_##name##_radix(str, n, dest, 16);                          \
+  }                                                                         \
+  bool RE2::Arg::parse_##name##_octal(const char* str, int n, void* dest) {  \
+    return parse_##name##_radix(str, n, dest, 8);                           \
+  }                                                                         \
+  bool RE2::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
+    return parse_##name##_radix(str, n, dest, 0);                           \
+  }
+
+DEFINE_INTEGER_PARSERS(short);
+DEFINE_INTEGER_PARSERS(ushort);
+DEFINE_INTEGER_PARSERS(int);
+DEFINE_INTEGER_PARSERS(uint);
+DEFINE_INTEGER_PARSERS(long);
+DEFINE_INTEGER_PARSERS(ulong);
+DEFINE_INTEGER_PARSERS(longlong);
+DEFINE_INTEGER_PARSERS(ulonglong);
+
+#undef DEFINE_INTEGER_PARSERS
+
+}  // namespace re2