diff options
Diffstat (limited to 're2/re2.cc')
-rw-r--r-- | re2/re2.cc | 1181 |
1 files changed, 1181 insertions, 0 deletions
diff --git a/re2/re2.cc b/re2/re2.cc new file mode 100644 index 0000000..448f28e --- /dev/null +++ b/re2/re2.cc @@ -0,0 +1,1181 @@ +// Copyright 2003-2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Regular expression interface RE2. +// +// Originally the PCRE C++ wrapper, but adapted to use +// the new automata-based regular expression engines. + +#include "re2/re2.h" + +#include <stdio.h> +#include <ctype.h> +#include <string> +#include <pthread.h> +#include <errno.h> +#include "util/util.h" +#include "util/flags.h" +#include "re2/prog.h" +#include "re2/regexp.h" + +DEFINE_bool(trace_re2, false, "trace RE2 execution"); + +namespace re2 { + +// Maximum number of args we can set +static const int kMaxArgs = 16; +static const int kVecSize = 1+kMaxArgs; + +const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::FullMatchN> RE2::FullMatch; +const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::PartialMatchN> RE2::PartialMatch; +const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::ConsumeN> RE2::Consume; +const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::FindAndConsumeN> RE2::FindAndConsume; + +const int RE2::Options::kDefaultMaxMem; // initialized in re2.h + +// Commonly-used option sets; arguments to constructor are: +// utf8 input +// posix syntax +// longest match +// log errors +const RE2::Options RE2::DefaultOptions; // EncodingUTF8, false, false, true +const RE2::Options RE2::Latin1(RE2::Options::EncodingLatin1, false, false, true); +const RE2::Options RE2::POSIX(RE2::Options::EncodingUTF8, true, true, true); +const RE2::Options RE2::Quiet(RE2::Options::EncodingUTF8, false, false, false); + +// If a regular expression has no error, its error_ field points here +static const string empty_string; + +// Converts from Regexp error code to RE2 error code. +// Maybe some day they will diverge. In any event, this +// hides the existence of Regexp from RE2 users. +static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) { + switch (code) { + case re2::kRegexpSuccess: + return RE2::NoError; + case re2::kRegexpInternalError: + return RE2::ErrorInternal; + case re2::kRegexpBadEscape: + return RE2::ErrorBadEscape; + case re2::kRegexpBadCharClass: + return RE2::ErrorBadCharClass; + case re2::kRegexpBadCharRange: + return RE2::ErrorBadCharRange; + case re2::kRegexpMissingBracket: + return RE2::ErrorMissingBracket; + case re2::kRegexpMissingParen: + return RE2::ErrorMissingParen; + case re2::kRegexpTrailingBackslash: + return RE2::ErrorTrailingBackslash; + case re2::kRegexpRepeatArgument: + return RE2::ErrorRepeatArgument; + case re2::kRegexpRepeatSize: + return RE2::ErrorRepeatSize; + case re2::kRegexpRepeatOp: + return RE2::ErrorRepeatOp; + case re2::kRegexpBadPerlOp: + return RE2::ErrorBadPerlOp; + case re2::kRegexpBadUTF8: + return RE2::ErrorBadUTF8; + case re2::kRegexpBadNamedCapture: + return RE2::ErrorBadNamedCapture; + } + return RE2::ErrorInternal; +} + +static string trunc(const StringPiece& pattern) { + if (pattern.size() < 100) + return pattern.as_string(); + return pattern.substr(0, 100).as_string() + "..."; +} + + +RE2::RE2(const char* pattern) { + Init(pattern, DefaultOptions); +} + +RE2::RE2(const string& pattern) { + Init(pattern, DefaultOptions); +} + +RE2::RE2(const StringPiece& pattern) { + Init(pattern, DefaultOptions); +} + +RE2::RE2(const StringPiece& pattern, const Options& options) { + Init(pattern, options); +} + +int RE2::Options::ParseFlags() const { + int flags = Regexp::ClassNL; + switch (encoding()) { + default: + LOG(ERROR) << "Unknown encoding " << encoding(); + break; + case RE2::Options::EncodingUTF8: + break; + case RE2::Options::EncodingLatin1: + flags |= Regexp::Latin1; + break; + } + + if (!posix_syntax()) + flags |= Regexp::LikePerl; + + if (literal()) + flags |= Regexp::Literal; + + if (never_nl()) + flags |= Regexp::NeverNL; + + if (!case_sensitive()) + flags |= Regexp::FoldCase; + + if (perl_classes()) + flags |= Regexp::PerlClasses; + + if (word_boundary()) + flags |= Regexp::PerlB; + + if (one_line()) + flags |= Regexp::OneLine; + + return flags; +} + +void RE2::Init(const StringPiece& pattern, const Options& options) { + mutex_ = new Mutex; + pattern_ = pattern.as_string(); + options_.Copy(options); + error_ = &empty_string; + error_code_ = NoError; + suffix_regexp_ = NULL; + entire_regexp_ = NULL; + prog_ = NULL; + rprog_ = NULL; + named_groups_ = NULL; + group_names_ = NULL; + num_captures_ = -1; + + RegexpStatus status; + entire_regexp_ = Regexp::Parse( + pattern_, + static_cast<Regexp::ParseFlags>(options_.ParseFlags()), + &status); + if (entire_regexp_ == NULL) { + if (error_ == &empty_string) + error_ = new string(status.Text()); + if (options_.log_errors()) { + LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': " + << status.Text(); + } + error_arg_ = status.error_arg().as_string(); + error_code_ = RegexpErrorToRE2(status.code()); + return; + } + + prefix_.clear(); + prefix_foldcase_ = false; + re2::Regexp* suffix; + if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix)) + suffix_regexp_ = suffix; + else + suffix_regexp_ = entire_regexp_->Incref(); + + // Two thirds of the memory goes to the forward Prog, + // one third to the reverse prog, because the forward + // Prog has two DFAs but the reverse prog has one. + prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3); + if (prog_ == NULL) { + if (options_.log_errors()) + LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'"; + error_ = new string("pattern too large - compile failed"); + error_code_ = RE2::ErrorPatternTooLarge; + return; + } + + // Could delay this until the first match call that + // cares about submatch information, but the one-pass + // machine's memory gets cut from the DFA memory budget, + // and that is harder to do if the DFA has already + // been built. + is_one_pass_ = prog_->IsOnePass(); +} + +// Returns rprog_, computing it if needed. +re2::Prog* RE2::ReverseProg() const { + MutexLock l(mutex_); + if (rprog_ == NULL && error_ == &empty_string) { + rprog_ = suffix_regexp_->CompileToReverseProg(options_.max_mem()/3); + if (rprog_ == NULL) { + if (options_.log_errors()) + LOG(ERROR) << "Error reverse compiling '" << trunc(pattern_) << "'"; + error_ = new string("pattern too large - reverse compile failed"); + error_code_ = RE2::ErrorPatternTooLarge; + return NULL; + } + } + return rprog_; +} + +static const map<string, int> empty_named_groups; +static const map<int, string> empty_group_names; + +RE2::~RE2() { + if (suffix_regexp_) + suffix_regexp_->Decref(); + if (entire_regexp_) + entire_regexp_->Decref(); + delete mutex_; + delete prog_; + delete rprog_; + if (error_ != &empty_string) + delete error_; + if (named_groups_ != NULL && named_groups_ != &empty_named_groups) + delete named_groups_; + if (group_names_ != NULL && group_names_ != &empty_group_names) + delete group_names_; +} + +int RE2::ProgramSize() const { + if (prog_ == NULL) + return -1; + return prog_->size(); +} + +// Returns named_groups_, computing it if needed. +const map<string, int>& RE2::NamedCapturingGroups() const { + MutexLock l(mutex_); + if (!ok()) + return empty_named_groups; + if (named_groups_ == NULL) { + named_groups_ = suffix_regexp_->NamedCaptures(); + if (named_groups_ == NULL) + named_groups_ = &empty_named_groups; + } + return *named_groups_; +} + +// Returns group_names_, computing it if needed. +const map<int, string>& RE2::CapturingGroupNames() const { + MutexLock l(mutex_); + if (!ok()) + return empty_group_names; + if (group_names_ == NULL) { + group_names_ = suffix_regexp_->CaptureNames(); + if (group_names_ == NULL) + group_names_ = &empty_group_names; + } + return *group_names_; +} + +/***** Convenience interfaces *****/ + +bool RE2::FullMatchN(const StringPiece& text, const RE2& re, + const Arg* const args[], int n) { + return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n); +} + +bool RE2::PartialMatchN(const StringPiece& text, const RE2& re, + const Arg* const args[], int n) { + return re.DoMatch(text, UNANCHORED, NULL, args, n); +} + +bool RE2::ConsumeN(StringPiece* input, const RE2& re, + const Arg* const args[], int n) { + int consumed; + if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) { + input->remove_prefix(consumed); + return true; + } else { + return false; + } +} + +bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re, + const Arg* const args[], int n) { + int consumed; + if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) { + input->remove_prefix(consumed); + return true; + } else { + return false; + } +} + +// Returns the maximum submatch needed for the rewrite to be done by Replace(). +// E.g. if rewrite == "foo \\2,\\1", returns 2. +static int MaxSubmatch(const StringPiece& rewrite) { + int max = 0; + for (const char *s = rewrite.data(), *end = s + rewrite.size(); + s < end; s++) { + if (*s == '\\') { + s++; + int c = (s < end) ? *s : -1; + if (isdigit(c)) { + int n = (c - '0'); + if (n > max) + max = n; + } + } + } + return max; +} + +bool RE2::Replace(string *str, + const RE2& re, + const StringPiece& rewrite) { + StringPiece vec[kVecSize]; + int nvec = 1 + MaxSubmatch(rewrite); + if (nvec > arraysize(vec)) + return false; + if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec)) + return false; + + string s; + if (!re.Rewrite(&s, rewrite, vec, nvec)) + return false; + + assert(vec[0].begin() >= str->data()); + assert(vec[0].end() <= str->data()+str->size()); + str->replace(vec[0].data() - str->data(), vec[0].size(), s); + return true; +} + +int RE2::GlobalReplace(string *str, + const RE2& re, + const StringPiece& rewrite) { + StringPiece vec[kVecSize]; + int nvec = 1 + MaxSubmatch(rewrite); + if (nvec > arraysize(vec)) + return false; + + const char* p = str->data(); + const char* ep = p + str->size(); + const char* lastend = NULL; + string out; + int count = 0; + while (p <= ep) { + if (!re.Match(*str, p - str->data(), str->size(), UNANCHORED, vec, nvec)) + break; + if (p < vec[0].begin()) + out.append(p, vec[0].begin() - p); + if (vec[0].begin() == lastend && vec[0].size() == 0) { + // Disallow empty match at end of last match: skip ahead. + if (p < ep) + out.append(p, 1); + p++; + continue; + } + re.Rewrite(&out, rewrite, vec, nvec); + p = vec[0].end(); + lastend = p; + count++; + } + + if (count == 0) + return 0; + + if (p < ep) + out.append(p, ep - p); + swap(out, *str); + return count; +} + +bool RE2::Extract(const StringPiece &text, + const RE2& re, + const StringPiece &rewrite, + string *out) { + StringPiece vec[kVecSize]; + int nvec = 1 + MaxSubmatch(rewrite); + if (nvec > arraysize(vec)) + return false; + + if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec)) + return false; + + out->clear(); + return re.Rewrite(out, rewrite, vec, nvec); +} + +string RE2::QuoteMeta(const StringPiece& unquoted) { + string result; + result.reserve(unquoted.size() << 1); + + // Escape any ascii character not in [A-Za-z_0-9]. + // + // Note that it's legal to escape a character even if it has no + // special meaning in a regular expression -- so this function does + // that. (This also makes it identical to the perl function of the + // same name except for the null-character special case; + // see `perldoc -f quotemeta`.) + for (int ii = 0; ii < unquoted.length(); ++ii) { + // Note that using 'isalnum' here raises the benchmark time from + // 32ns to 58ns: + if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') && + (unquoted[ii] < 'A' || unquoted[ii] > 'Z') && + (unquoted[ii] < '0' || unquoted[ii] > '9') && + unquoted[ii] != '_' && + // If this is the part of a UTF8 or Latin1 character, we need + // to copy this byte without escaping. Experimentally this is + // what works correctly with the regexp library. + !(unquoted[ii] & 128)) { + if (unquoted[ii] == '\0') { // Special handling for null chars. + // Note that this special handling is not strictly required for RE2, + // but this quoting is required for other regexp libraries such as + // PCRE. + // Can't use "\\0" since the next character might be a digit. + result += "\\x00"; + continue; + } + result += '\\'; + } + result += unquoted[ii]; + } + + return result; +} + +bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const { + if (prog_ == NULL) + return false; + + int n = prefix_.size(); + if (n > maxlen) + n = maxlen; + + // Determine initial min max from prefix_ literal. + string pmin, pmax; + pmin = prefix_.substr(0, n); + pmax = prefix_.substr(0, n); + if (prefix_foldcase_) { + // prefix is ASCII lowercase; change pmin to uppercase. + for (int i = 0; i < n; i++) { + if ('a' <= pmin[i] && pmin[i] <= 'z') + pmin[i] += 'A' - 'a'; + } + } + + // Add to prefix min max using PossibleMatchRange on regexp. + string dmin, dmax; + maxlen -= n; + if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) { + pmin += dmin; + pmax += dmax; + } else if (pmax.size() > 0) { + // prog_->PossibleMatchRange has failed us, + // but we still have useful information from prefix_. + // Round up pmax to allow any possible suffix. + pmax = PrefixSuccessor(pmax); + } else { + // Nothing useful. + *min = ""; + *max = ""; + return false; + } + + *min = pmin; + *max = pmax; + return true; +} + +// Avoid possible locale nonsense in standard strcasecmp. +// The string a is known to be all lowercase. +static int ascii_strcasecmp(const char* a, const char* b, int len) { + const char *ae = a + len; + + for (; a < ae; a++, b++) { + uint8 x = *a; + uint8 y = *b; + if ('A' <= y && y <= 'Z') + y += 'a' - 'A'; + if (x != y) + return x - y; + } + return 0; +} + + +/***** Actual matching and rewriting code *****/ + +bool RE2::Match(const StringPiece& text, + int startpos, + int endpos, + Anchor re_anchor, + StringPiece* submatch, + int nsubmatch) const { + if (!ok() || suffix_regexp_ == NULL) { + if (options_.log_errors()) + LOG(ERROR) << "Invalid RE2: " << *error_; + return false; + } + + if (startpos < 0 || startpos > endpos || endpos > text.size()) { + LOG(ERROR) << "RE2: invalid startpos, endpos pair."; + return false; + } + + StringPiece subtext = text; + subtext.remove_prefix(startpos); + subtext.remove_suffix(text.size() - endpos); + + // Use DFAs to find exact location of match, filter out non-matches. + + // Don't ask for the location if we won't use it. + // SearchDFA can do extra optimizations in that case. + StringPiece match; + StringPiece* matchp = &match; + if (nsubmatch == 0) + matchp = NULL; + + int ncap = 1 + NumberOfCapturingGroups(); + if (ncap > nsubmatch) + ncap = nsubmatch; + + // If the regexp is anchored explicitly, must not be in middle of text. + if (prog_->anchor_start() && startpos != 0) + return false; + + // If the regexp is anchored explicitly, update re_anchor + // so that we can potentially fall into a faster case below. + if (prog_->anchor_start() && prog_->anchor_end()) + re_anchor = ANCHOR_BOTH; + else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH) + re_anchor = ANCHOR_START; + + // Check for the required prefix, if any. + int prefixlen = 0; + if (!prefix_.empty()) { + if (startpos != 0) + return false; + prefixlen = prefix_.size(); + if (prefixlen > subtext.size()) + return false; + if (prefix_foldcase_) { + if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0) + return false; + } else { + if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0) + return false; + } + subtext.remove_prefix(prefixlen); + // If there is a required prefix, the anchor must be at least ANCHOR_START. + if (re_anchor != ANCHOR_BOTH) + re_anchor = ANCHOR_START; + } + + Prog::Anchor anchor = Prog::kUnanchored; + Prog::MatchKind kind = Prog::kFirstMatch; + if (options_.longest_match()) + kind = Prog::kLongestMatch; + bool skipped_test = false; + + bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture); + + // SearchBitState allocates a bit vector of size prog_->size() * text.size(). + // It also allocates a stack of 3-word structures which could potentially + // grow as large as prog_->size() * text.size() but in practice is much + // smaller. + // Conditions for using SearchBitState: + const int MaxBitStateProg = 500; // prog_->size() <= Max. + const int MaxBitStateVector = 256*1024; // bit vector size <= Max (bits) + bool can_bit_state = prog_->size() <= MaxBitStateProg; + int bit_state_text_max = MaxBitStateVector / prog_->size(); + + bool dfa_failed = false; + switch (re_anchor) { + default: + case UNANCHORED: { + if (!prog_->SearchDFA(subtext, text, anchor, kind, + matchp, &dfa_failed, NULL)) { + if (dfa_failed) { + // Fall back to NFA below. + skipped_test = true; + if (FLAGS_trace_re2) + LOG(INFO) << "Match " << trunc(pattern_) + << " [" << CEscape(subtext) << "]" + << " DFA failed."; + break; + } + if (FLAGS_trace_re2) + LOG(INFO) << "Match " << trunc(pattern_) + << " [" << CEscape(subtext) << "]" + << " used DFA - no match."; + return false; + } + if (FLAGS_trace_re2) + LOG(INFO) << "Match " << trunc(pattern_) + << " [" << CEscape(subtext) << "]" + << " used DFA - match"; + if (matchp == NULL) // Matched. Don't care where + return true; + // SearchDFA set match[0].end() but didn't know where the + // match started. Run the regexp backward from match[0].end() + // to find the longest possible match -- that's where it started. + Prog* prog = ReverseProg(); + if (prog == NULL) + return false; + if (!prog->SearchDFA(match, text, Prog::kAnchored, + Prog::kLongestMatch, &match, &dfa_failed, NULL)) { + if (dfa_failed) { + // Fall back to NFA below. + skipped_test = true; + if (FLAGS_trace_re2) + LOG(INFO) << "Match " << trunc(pattern_) + << " [" << CEscape(subtext) << "]" + << " reverse DFA failed."; + break; + } + if (FLAGS_trace_re2) + LOG(INFO) << "Match " << trunc(pattern_) + << " [" << CEscape(subtext) << "]" + << " DFA inconsistency."; + LOG(ERROR) << "DFA inconsistency"; + return false; + } + if (FLAGS_trace_re2) + LOG(INFO) << "Match " << trunc(pattern_) + << " [" << CEscape(subtext) << "]" + << " used reverse DFA."; + break; + } + + case ANCHOR_BOTH: + case ANCHOR_START: + if (re_anchor == ANCHOR_BOTH) + kind = Prog::kFullMatch; + anchor = Prog::kAnchored; + + // If only a small amount of text and need submatch + // information anyway and we're going to use OnePass or BitState + // to get it, we might as well not even bother with the DFA: + // OnePass or BitState will be fast enough. + // On tiny texts, OnePass outruns even the DFA, and + // it doesn't have the shared state and occasional mutex that + // the DFA does. + if (can_one_pass && text.size() <= 4096 && + (ncap > 1 || text.size() <= 8)) { + if (FLAGS_trace_re2) + LOG(INFO) << "Match " << trunc(pattern_) + << " [" << CEscape(subtext) << "]" + << " skipping DFA for OnePass."; + skipped_test = true; + break; + } + if (can_bit_state && text.size() <= bit_state_text_max && ncap > 1) { + if (FLAGS_trace_re2) + LOG(INFO) << "Match " << trunc(pattern_) + << " [" << CEscape(subtext) << "]" + << " skipping DFA for BitState."; + skipped_test = true; + break; + } + if (!prog_->SearchDFA(subtext, text, anchor, kind, + &match, &dfa_failed, NULL)) { + if (dfa_failed) { + if (FLAGS_trace_re2) + LOG(INFO) << "Match " << trunc(pattern_) + << " [" << CEscape(subtext) << "]" + << " DFA failed."; + skipped_test = true; + break; + } + if (FLAGS_trace_re2) + LOG(INFO) << "Match " << trunc(pattern_) + << " [" << CEscape(subtext) << "]" + << " used DFA - no match."; + return false; + } + break; + } + + if (!skipped_test && ncap <= 1) { + // We know exactly where it matches. That's enough. + if (ncap == 1) + submatch[0] = match; + } else { + StringPiece subtext1; + if (skipped_test) { + // DFA ran out of memory or was skipped: + // need to search in entire original text. + subtext1 = subtext; + } else { + // DFA found the exact match location: + // let NFA run an anchored, full match search + // to find submatch locations. + subtext1 = match; + anchor = Prog::kAnchored; + kind = Prog::kFullMatch; + } + + if (can_one_pass && anchor != Prog::kUnanchored) { + if (FLAGS_trace_re2) + LOG(INFO) << "Match " << trunc(pattern_) + << " [" << CEscape(subtext) << "]" + << " using OnePass."; + if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) { + if (!skipped_test) + LOG(ERROR) << "SearchOnePass inconsistency"; + return false; + } + } else if (can_bit_state && subtext1.size() <= bit_state_text_max) { + if (FLAGS_trace_re2) + LOG(INFO) << "Match " << trunc(pattern_) + << " [" << CEscape(subtext) << "]" + << " using BitState."; + if (!prog_->SearchBitState(subtext1, text, anchor, + kind, submatch, ncap)) { + if (!skipped_test) + LOG(ERROR) << "SearchBitState inconsistency"; + return false; + } + } else { + if (FLAGS_trace_re2) + LOG(INFO) << "Match " << trunc(pattern_) + << " [" << CEscape(subtext) << "]" + << " using NFA."; + if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) { + if (!skipped_test) + LOG(ERROR) << "SearchNFA inconsistency"; + return false; + } + } + } + + // Adjust overall match for required prefix that we stripped off. + if (prefixlen > 0 && nsubmatch > 0) + submatch[0] = StringPiece(submatch[0].begin() - prefixlen, + submatch[0].size() + prefixlen); + + // Zero submatches that don't exist in the regexp. + for (int i = ncap; i < nsubmatch; i++) + submatch[i] = NULL; + return true; +} + +// Internal matcher - like Match() but takes Args not StringPieces. +bool RE2::DoMatch(const StringPiece& text, + Anchor anchor, + int* consumed, + const Arg* const* args, + int n) const { + if (!ok()) { + if (options_.log_errors()) + LOG(ERROR) << "Invalid RE2: " << *error_; + return false; + } + + // Count number of capture groups needed. + int nvec; + if (n == 0 && consumed == NULL) + nvec = 0; + else + nvec = n+1; + + StringPiece* vec; + StringPiece stkvec[kVecSize]; + StringPiece* heapvec = NULL; + + if (nvec <= arraysize(stkvec)) { + vec = stkvec; + } else { + vec = new StringPiece[nvec]; + heapvec = vec; + } + + if (!Match(text, 0, text.size(), anchor, vec, nvec)) { + delete[] heapvec; + return false; + } + + if(consumed != NULL) + *consumed = vec[0].end() - text.begin(); + + if (n == 0 || args == NULL) { + // We are not interested in results + delete[] heapvec; + return true; + } + + int ncap = NumberOfCapturingGroups(); + if (ncap < n) { + // RE has fewer capturing groups than number of arg pointers passed in + VLOG(1) << "Asked for " << n << " but only have " << ncap; + delete[] heapvec; + return false; + } + + // If we got here, we must have matched the whole pattern. + for (int i = 0; i < n; i++) { + const StringPiece& s = vec[i+1]; + if (!args[i]->Parse(s.data(), s.size())) { + // TODO: Should we indicate what the error was? + VLOG(1) << "Parse error on #" << i << " " << s << " " + << (void*)s.data() << "/" << s.size(); + delete[] heapvec; + return false; + } + } + + delete[] heapvec; + return true; +} + +// Append the "rewrite" string, with backslash subsitutions from "vec", +// to string "out". +bool RE2::Rewrite(string *out, const StringPiece &rewrite, + const StringPiece *vec, int veclen) const { + for (const char *s = rewrite.data(), *end = s + rewrite.size(); + s < end; s++) { + int c = *s; + if (c == '\\') { + s++; + c = (s < end) ? *s : -1; + if (isdigit(c)) { + int n = (c - '0'); + if (n >= veclen) { + LOG(ERROR) << "requested group " << n + << " in regexp " << rewrite.data(); + return false; + } + StringPiece snip = vec[n]; + if (snip.size() > 0) + out->append(snip.data(), snip.size()); + } else if (c == '\\') { + out->push_back('\\'); + } else { + LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data(); + return false; + } + } else { + out->push_back(c); + } + } + return true; +} + +// Return the number of capturing subpatterns, or -1 if the +// regexp wasn't valid on construction. +int RE2::NumberOfCapturingGroups() const { + if (suffix_regexp_ == NULL) + return -1; + ANNOTATE_BENIGN_RACE(&num_captures_, "benign race: in the worst case" + " multiple threads end up doing the same work in parallel."); + if (num_captures_ == -1) + num_captures_ = suffix_regexp_->NumCaptures(); + return num_captures_; +} + +// Checks that the rewrite string is well-formed with respect to this +// regular expression. +bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const { + int max_token = -1; + for (const char *s = rewrite.data(), *end = s + rewrite.size(); + s < end; s++) { + int c = *s; + if (c != '\\') { + continue; + } + if (++s == end) { + *error = "Rewrite schema error: '\\' not allowed at end."; + return false; + } + c = *s; + if (c == '\\') { + continue; + } + if (!isdigit(c)) { + *error = "Rewrite schema error: " + "'\\' must be followed by a digit or '\\'."; + return false; + } + int n = (c - '0'); + if (max_token < n) { + max_token = n; + } + } + + if (max_token > NumberOfCapturingGroups()) { + SStringPrintf(error, "Rewrite schema requests %d matches, " + "but the regexp only has %d parenthesized subexpressions.", + max_token, NumberOfCapturingGroups()); + return false; + } + return true; +} + +/***** Parsers for various types *****/ + +bool RE2::Arg::parse_null(const char* str, int n, void* dest) { + // We fail if somebody asked us to store into a non-NULL void* pointer + return (dest == NULL); +} + +bool RE2::Arg::parse_string(const char* str, int n, void* dest) { + if (dest == NULL) return true; + reinterpret_cast<string*>(dest)->assign(str, n); + return true; +} + +bool RE2::Arg::parse_stringpiece(const char* str, int n, void* dest) { + if (dest == NULL) return true; + reinterpret_cast<StringPiece*>(dest)->set(str, n); + return true; +} + +bool RE2::Arg::parse_char(const char* str, int n, void* dest) { + if (n != 1) return false; + if (dest == NULL) return true; + *(reinterpret_cast<char*>(dest)) = str[0]; + return true; +} + +bool RE2::Arg::parse_uchar(const char* str, int n, void* dest) { + if (n != 1) return false; + if (dest == NULL) return true; + *(reinterpret_cast<unsigned char*>(dest)) = str[0]; + return true; +} + +// Largest number spec that we are willing to parse +static const int kMaxNumberLength = 32; + +// REQUIRES "buf" must have length at least kMaxNumberLength+1 +// Copies "str" into "buf" and null-terminates. +// Overwrites *np with the new length. +static const char* TerminateNumber(char* buf, const char* str, int* np) { + int n = *np; + if (n <= 0) return ""; + if (n > 0 && isspace(*str)) { + // We are less forgiving than the strtoxxx() routines and do not + // allow leading spaces. + return ""; + } + + // Although buf has a fixed maximum size, we can still handle + // arbitrarily large integers correctly by omitting leading zeros. + // (Numbers that are still too long will be out of range.) + // Before deciding whether str is too long, + // remove leading zeros with s/000+/00/. + // Leaving the leading two zeros in place means that + // we don't change 0000x123 (invalid) into 0x123 (valid). + // Skip over leading - before replacing. + bool neg = false; + if (n >= 1 && str[0] == '-') { + neg = true; + n--; + str++; + } + + if (n >= 3 && str[0] == '0' && str[1] == '0') { + while (n >= 3 && str[2] == '0') { + n--; + str++; + } + } + + if (neg) { // make room in buf for - + n++; + str--; + } + + if (n > kMaxNumberLength) return ""; + + memmove(buf, str, n); + if (neg) { + buf[0] = '-'; + } + buf[n] = '\0'; + *np = n; + return buf; +} + +bool RE2::Arg::parse_long_radix(const char* str, + int n, + void* dest, + int radix) { + if (n == 0) return false; + char buf[kMaxNumberLength+1]; + str = TerminateNumber(buf, str, &n); + char* end; + errno = 0; + long r = strtol(str, &end, radix); + if (end != str + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; + *(reinterpret_cast<long*>(dest)) = r; + return true; +} + +bool RE2::Arg::parse_ulong_radix(const char* str, + int n, + void* dest, + int radix) { + if (n == 0) return false; + char buf[kMaxNumberLength+1]; + str = TerminateNumber(buf, str, &n); + if (str[0] == '-') { + // strtoul() will silently accept negative numbers and parse + // them. This module is more strict and treats them as errors. + return false; + } + + char* end; + errno = 0; + unsigned long r = strtoul(str, &end, radix); + if (end != str + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; + *(reinterpret_cast<unsigned long*>(dest)) = r; + return true; +} + +bool RE2::Arg::parse_short_radix(const char* str, + int n, + void* dest, + int radix) { + long r; + if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse + if ((short)r != r) return false; // Out of range + if (dest == NULL) return true; + *(reinterpret_cast<short*>(dest)) = r; + return true; +} + +bool RE2::Arg::parse_ushort_radix(const char* str, + int n, + void* dest, + int radix) { + unsigned long r; + if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse + if ((ushort)r != r) return false; // Out of range + if (dest == NULL) return true; + *(reinterpret_cast<unsigned short*>(dest)) = r; + return true; +} + +bool RE2::Arg::parse_int_radix(const char* str, + int n, + void* dest, + int radix) { + long r; + if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse + if ((int)r != r) return false; // Out of range + if (dest == NULL) return true; + *(reinterpret_cast<int*>(dest)) = r; + return true; +} + +bool RE2::Arg::parse_uint_radix(const char* str, + int n, + void* dest, + int radix) { + unsigned long r; + if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse + if ((uint)r != r) return false; // Out of range + if (dest == NULL) return true; + *(reinterpret_cast<unsigned int*>(dest)) = r; + return true; +} + +bool RE2::Arg::parse_longlong_radix(const char* str, + int n, + void* dest, + int radix) { + if (n == 0) return false; + char buf[kMaxNumberLength+1]; + str = TerminateNumber(buf, str, &n); + char* end; + errno = 0; + int64 r = strtoll(str, &end, radix); + if (end != str + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; + *(reinterpret_cast<int64*>(dest)) = r; + return true; +} + +bool RE2::Arg::parse_ulonglong_radix(const char* str, + int n, + void* dest, + int radix) { + if (n == 0) return false; + char buf[kMaxNumberLength+1]; + str = TerminateNumber(buf, str, &n); + if (str[0] == '-') { + // strtoull() will silently accept negative numbers and parse + // them. This module is more strict and treats them as errors. + return false; + } + char* end; + errno = 0; + uint64 r = strtoull(str, &end, radix); + if (end != str + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; + *(reinterpret_cast<uint64*>(dest)) = r; + return true; +} + +static bool parse_double_float(const char* str, int n, bool isfloat, void *dest) { + if (n == 0) return false; + static const int kMaxLength = 200; + char buf[kMaxLength]; + if (n >= kMaxLength) return false; + memcpy(buf, str, n); + buf[n] = '\0'; + errno = 0; + char* end; + double r; + if (isfloat) { + r = strtof(buf, &end); + } else { + r = strtod(buf, &end); + } + if (end != buf + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; + if (isfloat) { + *(reinterpret_cast<float*>(dest)) = r; + } else { + *(reinterpret_cast<double*>(dest)) = r; + } + return true; +} + +bool RE2::Arg::parse_double(const char* str, int n, void* dest) { + return parse_double_float(str, n, false, dest); +} + +bool RE2::Arg::parse_float(const char* str, int n, void* dest) { + return parse_double_float(str, n, true, dest); +} + + +#define DEFINE_INTEGER_PARSERS(name) \ + bool RE2::Arg::parse_##name(const char* str, int n, void* dest) { \ + return parse_##name##_radix(str, n, dest, 10); \ + } \ + bool RE2::Arg::parse_##name##_hex(const char* str, int n, void* dest) { \ + return parse_##name##_radix(str, n, dest, 16); \ + } \ + bool RE2::Arg::parse_##name##_octal(const char* str, int n, void* dest) { \ + return parse_##name##_radix(str, n, dest, 8); \ + } \ + bool RE2::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \ + return parse_##name##_radix(str, n, dest, 0); \ + } + +DEFINE_INTEGER_PARSERS(short); +DEFINE_INTEGER_PARSERS(ushort); +DEFINE_INTEGER_PARSERS(int); +DEFINE_INTEGER_PARSERS(uint); +DEFINE_INTEGER_PARSERS(long); +DEFINE_INTEGER_PARSERS(ulong); +DEFINE_INTEGER_PARSERS(longlong); +DEFINE_INTEGER_PARSERS(ulonglong); + +#undef DEFINE_INTEGER_PARSERS + +} // namespace re2 |