1 files changed, 632 insertions, 0 deletions
diff --git a/re2/regexp.h b/re2/regexp.h
new file mode 100644
index 0000000..1aebc16
--- /dev/null
+++ b/re2/regexp.h
@@ -0,0 +1,632 @@
+// Copyright 2006 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// --- SPONSORED LINK --------------------------------------------------
+// If you want to use this library for regular expression matching,
+// you should use re2/re2.h, which provides a class RE2 that
+// mimics the PCRE interface provided by PCRE's C++ wrappers.
+// This header describes the low-level interface used to implement RE2
+// and may change in backwards-incompatible ways from time to time.
+// In contrast, RE2's interface will not.
+// ---------------------------------------------------------------------
+
+// Regular expression library: parsing, execution, and manipulation
+// of regular expressions.
+//
+// Any operation that traverses the Regexp structures should be written
+// using Regexp::Walker (see walker-inl.h), not recursively, because deeply nested
+// regular expressions such as x++++++++++++++++++++... might cause recursive
+// traversals to overflow the stack.
+//
+// It is the caller's responsibility to provide appropriate mutual exclusion
+// around manipulation of the regexps.  RE2 does this.
+//
+// PARSING
+//
+// Regexp::Parse parses regular expressions encoded in UTF-8.
+// The default syntax is POSIX extended regular expressions,
+// with the following changes:
+//
+//   1.  Backreferences (optional in POSIX EREs) are not supported.
+//         (Supporting them precludes the use of DFA-based
+//          matching engines.)
+//
+//   2.  Collating elements and collation classes are not supported.
+//         (No one has needed or wanted them.)
+//
+// The exact syntax accepted can be modified by passing flags to
+// Regexp::Parse.  In particular, many of the basic Perl additions
+// are available.  The flags are documented below (search for LikePerl).
+//
+// If parsed with the flag Regexp::Latin1, both the regular expression
+// and the input to the matching routines are assumed to be encoded in
+// Latin-1, not UTF-8.
+//
+// EXECUTION
+//
+// Once Regexp has parsed a regular expression, it provides methods
+// to search text using that regular expression.  These methods are
+// implemented via calling out to other regular expression libraries.
+// (Let's call them the sublibraries.)
+//
+// To call a sublibrary, Regexp does not simply prepare a
+// string version of the regular expression and hand it to the
+// sublibrary.  Instead, Regexp prepares, from its own parsed form, the
+// corresponding internal representation used by the sublibrary.
+// This has the drawback of needing to know the internal representation
+// used by the sublibrary, but it has two important benefits:
+//
+//   1. The syntax and meaning of regular expressions is guaranteed
+//      to be that used by Regexp's parser, not the syntax expected
+//      by the sublibrary.  Regexp might accept a restricted or
+//      expanded syntax for regular expressions as compared with
+//      the sublibrary.  As long as Regexp can translate from its
+//      internal form into the sublibrary's, clients need not know
+//      exactly which sublibrary they are using.
+//
+//   2. The sublibrary parsers are bypassed.  For whatever reason,
+//      sublibrary regular expression parsers often have security
+//      problems.  For example, plan9grep's regular expression parser
+//      has a buffer overflow in its handling of large character
+//      classes, and PCRE's parser has had buffer overflow problems
+//      in the past.  Security-team requires sandboxing of sublibrary
+//      regular expression parsers.  Avoiding the sublibrary parsers
+//      avoids the sandbox.
+//
+// The execution methods we use now are provided by the compiled form,
+// Prog, described in prog.h
+//
+// MANIPULATION
+//
+// Unlike other regular expression libraries, Regexp makes its parsed
+// form accessible to clients, so that client code can analyze the
+// parsed regular expressions.
+
+#ifndef RE2_REGEXP_H__
+#define RE2_REGEXP_H__
+
+#include "util/util.h"
+#include "re2/stringpiece.h"
+
+namespace re2 {
+
+// Keep in sync with string list kOpcodeNames[] in testing/dump.cc
+enum RegexpOp {
+  // Matches no strings.
+  kRegexpNoMatch = 1,
+
+  // Matches empty string.
+  kRegexpEmptyMatch,
+
+  // Matches rune_.
+  kRegexpLiteral,
+
+  // Matches runes_.
+  kRegexpLiteralString,
+
+  // Matches concatenation of sub_[0..nsub-1].
+  kRegexpConcat,
+  // Matches union of sub_[0..nsub-1].
+  kRegexpAlternate,
+
+  // Matches sub_[0] zero or more times.
+  kRegexpStar,
+  // Matches sub_[0] one or more times.
+  kRegexpPlus,
+  // Matches sub_[0] zero or one times.
+  kRegexpQuest,
+
+  // Matches sub_[0] at least min_ times, at most max_ times.
+  // max_ == -1 means no upper limit.
+  kRegexpRepeat,
+
+  // Parenthesized (capturing) subexpression.  Index is cap_.
+  // Optionally, capturing name is name_.
+  kRegexpCapture,
+
+  // Matches any character.
+  kRegexpAnyChar,
+
+  // Matches any byte [sic].
+  kRegexpAnyByte,
+
+  // Matches empty string at beginning of line.
+  kRegexpBeginLine,
+  // Matches empty string at end of line.
+  kRegexpEndLine,
+
+  // Matches word boundary "\b".
+  kRegexpWordBoundary,
+  // Matches not-a-word boundary "\B".
+  kRegexpNoWordBoundary,
+
+  // Matches empty string at beginning of text.
+  kRegexpBeginText,
+  // Matches empty string at end of text.
+  kRegexpEndText,
+
+  // Matches character class given by cc_.
+  kRegexpCharClass,
+
+  // Forces match of entire expression right now,
+  // with match ID match_id_ (used by RE2::Set).
+  kRegexpHaveMatch,
+
+  kMaxRegexpOp = kRegexpHaveMatch,
+};
+
+// Keep in sync with string list in regexp.cc
+enum RegexpStatusCode {
+  // No error
+  kRegexpSuccess = 0,
+
+  // Unexpected error
+  kRegexpInternalError,
+
+  // Parse errors
+  kRegexpBadEscape,          // bad escape sequence
+  kRegexpBadCharClass,       // bad character class
+  kRegexpBadCharRange,       // bad character class range
+  kRegexpMissingBracket,     // missing closing ]
+  kRegexpMissingParen,       // missing closing )
+  kRegexpTrailingBackslash,  // at end of regexp
+  kRegexpRepeatArgument,     // repeat argument missing, e.g. "*"
+  kRegexpRepeatSize,         // bad repetition argument
+  kRegexpRepeatOp,           // bad repetition operator
+  kRegexpBadPerlOp,          // bad perl operator
+  kRegexpBadUTF8,            // invalid UTF-8 in regexp
+  kRegexpBadNamedCapture,    // bad named capture
+};
+
+// Error status for certain operations.
+class RegexpStatus {
+ public:
+  RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {}
+  ~RegexpStatus() { delete tmp_; }
+
+  void set_code(enum RegexpStatusCode code) { code_ = code; }
+  void set_error_arg(const StringPiece& error_arg) { error_arg_ = error_arg; }
+  void set_tmp(string* tmp) { delete tmp_; tmp_ = tmp; }
+  enum RegexpStatusCode code() const { return code_; }
+  const StringPiece& error_arg() const { return error_arg_; }
+  bool ok() const { return code() == kRegexpSuccess; }
+
+  // Copies state from status.
+  void Copy(const RegexpStatus& status);
+
+  // Returns text equivalent of code, e.g.:
+  //   "Bad character class"
+  static const string& CodeText(enum RegexpStatusCode code);
+
+  // Returns text describing error, e.g.:
+  //   "Bad character class: [z-a]"
+  string Text() const;
+
+ private:
+  enum RegexpStatusCode code_;  // Kind of error
+  StringPiece error_arg_;       // Piece of regexp containing syntax error.
+  string* tmp_;                 // Temporary storage, possibly where error_arg_ is.
+
+  DISALLOW_EVIL_CONSTRUCTORS(RegexpStatus);
+};
+
+// Walker to implement Simplify.
+class SimplifyWalker;
+
+// Compiled form; see prog.h
+class Prog;
+
+struct RuneRange {
+  RuneRange() : lo(0), hi(0) { }
+  RuneRange(int l, int h) : lo(l), hi(h) { }
+  Rune lo;
+  Rune hi;
+};
+
+// Less-than on RuneRanges treats a == b if they overlap at all.
+// This lets us look in a set to find the range covering a particular Rune.
+struct RuneRangeLess {
+  bool operator()(const RuneRange& a, const RuneRange& b) const {
+    return a.hi < b.lo;
+  }
+};
+
+class CharClassBuilder;
+
+class CharClass {
+ public:
+  void Delete();
+
+  typedef RuneRange* iterator;
+  iterator begin() { return ranges_; }
+  iterator end() { return ranges_ + nranges_; }
+
+  int size() { return nrunes_; }
+  bool empty() { return nrunes_ == 0; }
+  bool full() { return nrunes_ == Runemax+1; }
+  bool FoldsASCII() { return folds_ascii_; }
+
+  bool Contains(Rune r);
+  CharClass* Negate();
+
+ private:
+  CharClass();  // not implemented
+  ~CharClass();  // not implemented
+  static CharClass* New(int maxranges);
+
+  friend class CharClassBuilder;
+
+  bool folds_ascii_;
+  int nrunes_;
+  RuneRange *ranges_;
+  int nranges_;
+  DISALLOW_EVIL_CONSTRUCTORS(CharClass);
+};
+
+class Regexp {
+ public:
+
+  // Flags for parsing.  Can be ORed together.
+  enum ParseFlags {
+    NoParseFlags = 0,
+    FoldCase     = 1<<0,   // Fold case during matching (case-insensitive).
+    Literal      = 1<<1,   // Treat s as literal string instead of a regexp.
+    ClassNL      = 1<<2,   // Allow char classes like [^a-z] and \D and \s
+                           // and [[:space:]] to match newline.
+    DotNL        = 1<<3,   // Allow . to match newline.
+    MatchNL      = ClassNL | DotNL,
+    OneLine      = 1<<4,   // Treat ^ and $ as only matching at beginning and
+                           // end of text, not around embedded newlines.
+                           // (Perl's default)
+    Latin1       = 1<<5,   // Regexp and text are in Latin1, not UTF-8.
+    NonGreedy    = 1<<6,   // Repetition operators are non-greedy by default.
+    PerlClasses  = 1<<7,   // Allow Perl character classes like \d.
+    PerlB        = 1<<8,   // Allow Perl's \b and \B.
+    PerlX        = 1<<9,   // Perl extensions:
+                           //   non-capturing parens - (?: )
+                           //   non-greedy operators - *? +? ?? {}?
+                           //   flag edits - (?i) (?-i) (?i: )
+                           //     i - FoldCase
+                           //     m - !OneLine
+                           //     s - DotNL
+                           //     U - NonGreedy
+                           //   line ends: \A \z
+                           //   \Q and \E to disable/enable metacharacters
+                           //   (?P<name>expr) for named captures
+                           //   \C to match any single byte
+    UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group
+                           //   and \P{Han} for its negation.
+    NeverNL      = 1<<11,  // Never match NL, even if the regexp mentions
+                           //   it explicitly.
+
+    // As close to Perl as we can get.
+    LikePerl     = ClassNL | OneLine | PerlClasses | PerlB | PerlX |
+                   UnicodeGroups,
+
+    // Internal use only.
+    WasDollar    = 1<<15,  // on kRegexpEndText: was $ in regexp text
+  };
+
+  // Get.  No set, Regexps are logically immutable once created.
+  RegexpOp op() { return static_cast<RegexpOp>(op_); }
+  int nsub() { return nsub_; }
+  bool simple() { return simple_; }
+  enum ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); }
+  int Ref();  // For testing.
+
+  Regexp** sub() {
+    if(nsub_ <= 1)
+      return &subone_;
+    else
+      return submany_;
+  }
+
+  int min() { DCHECK_EQ(op_, kRegexpRepeat); return min_; }
+  int max() { DCHECK_EQ(op_, kRegexpRepeat); return max_; }
+  Rune rune() { DCHECK_EQ(op_, kRegexpLiteral); return rune_; }
+  CharClass* cc() { DCHECK_EQ(op_, kRegexpCharClass); return cc_; }
+  int cap() { DCHECK_EQ(op_, kRegexpCapture); return cap_; }
+  const string* name() { DCHECK_EQ(op_, kRegexpCapture); return name_; }
+  Rune* runes() { DCHECK_EQ(op_, kRegexpLiteralString); return runes_; }
+  int nrunes() { DCHECK_EQ(op_, kRegexpLiteralString); return nrunes_; }
+  int match_id() { DCHECK_EQ(op_, kRegexpHaveMatch); return match_id_; }
+
+  // Increments reference count, returns object as convenience.
+  Regexp* Incref();
+
+  // Decrements reference count and deletes this object if count reaches 0.
+  void Decref();
+
+  // Parses string s to produce regular expression, returned.
+  // Caller must release return value with re->Decref().
+  // On failure, sets *status (if status != NULL) and returns NULL.
+  static Regexp* Parse(const StringPiece& s, ParseFlags flags,
+                       RegexpStatus* status);
+
+  // Returns a _new_ simplified version of the current regexp.
+  // Does not edit the current regexp.
+  // Caller must release return value with re->Decref().
+  // Simplified means that counted repetition has been rewritten
+  // into simpler terms and all Perl/POSIX features have been
+  // removed.  The result will capture exactly the same
+  // subexpressions the original did, unless formatted with ToString.
+  Regexp* Simplify();
+  friend class SimplifyWalker;
+
+  // Parses the regexp src and then simplifies it and sets *dst to the
+  // string representation of the simplified form.  Returns true on success.
+  // Returns false and sets *status (if status != NULL) on parse error.
+  static bool SimplifyRegexp(const StringPiece& src, ParseFlags flags,
+                             string* dst,
+                             RegexpStatus* status);
+
+  // Returns the number of capturing groups in the regexp.
+  int NumCaptures();
+  friend class NumCapturesWalker;
+
+  // Returns a map from names to capturing group indices,
+  // or NULL if the regexp contains no named capture groups.
+  // The caller is responsible for deleting the map.
+  map<string, int>* NamedCaptures();
+
+  // Returns a map from capturing group indices to capturing group
+  // names or NULL if the regexp contains no named capture groups. The
+  // caller is responsible for deleting the map.
+  map<int, string>* CaptureNames();
+
+  // Returns a string representation of the current regexp,
+  // using as few parentheses as possible.
+  string ToString();
+
+  // Convenience functions.  They consume the passed reference,
+  // so in many cases you should use, e.g., Plus(re->Incref(), flags).
+  // They do not consume allocated arrays like subs or runes.
+  static Regexp* Plus(Regexp* sub, ParseFlags flags);
+  static Regexp* Star(Regexp* sub, ParseFlags flags);
+  static Regexp* Quest(Regexp* sub, ParseFlags flags);
+  static Regexp* Concat(Regexp** subs, int nsubs, ParseFlags flags);
+  static Regexp* Alternate(Regexp** subs, int nsubs, ParseFlags flags);
+  static Regexp* Capture(Regexp* sub, ParseFlags flags, int cap);
+  static Regexp* Repeat(Regexp* sub, ParseFlags flags, int min, int max);
+  static Regexp* NewLiteral(Rune rune, ParseFlags flags);
+  static Regexp* NewCharClass(CharClass* cc, ParseFlags flags);
+  static Regexp* LiteralString(Rune* runes, int nrunes, ParseFlags flags);
+  static Regexp* HaveMatch(int match_id, ParseFlags flags);
+
+  // Like Alternate but does not factor out common prefixes.
+  static Regexp* AlternateNoFactor(Regexp** subs, int nsubs, ParseFlags flags);
+
+  // Debugging function.  Returns string format for regexp
+  // that makes structure clear.  Does NOT use regexp syntax.
+  string Dump();
+
+  // Helper traversal class, defined fully in walker-inl.h.
+  template<typename T> class Walker;
+
+  // Compile to Prog.  See prog.h
+  // Reverse prog expects to be run over text backward.
+  // Construction and execution of prog will
+  // stay within approximately max_mem bytes of memory.
+  // If max_mem <= 0, a reasonable default is used.
+  Prog* CompileToProg(int64 max_mem);
+  Prog* CompileToReverseProg(int64 max_mem);
+
+  // Whether to expect this library to find exactly the same answer as PCRE
+  // when running this regexp.  Most regexps do mimic PCRE exactly, but a few
+  // obscure cases behave differently.  Technically this is more a property
+  // of the Prog than the Regexp, but the computation is much easier to do
+  // on the Regexp.  See mimics_pcre.cc for the exact conditions.
+  bool MimicsPCRE();
+
+  // Benchmarking function.
+  void NullWalk();
+
+  // Whether every match of this regexp must be anchored and
+  // begin with a non-empty fixed string (perhaps after ASCII
+  // case-folding).  If so, returns the prefix and the sub-regexp that
+  // follows it.
+  bool RequiredPrefix(string* prefix, bool *foldcase, Regexp** suffix);
+
+ private:
+  // Constructor allocates vectors as appropriate for operator.
+  explicit Regexp(RegexpOp op, ParseFlags parse_flags);
+
+  // Use Decref() instead of delete to release Regexps.
+  // This is private to catch deletes at compile time.
+  ~Regexp();
+  void Destroy();
+  bool QuickDestroy();
+
+  // Helpers for Parse.  Listed here so they can edit Regexps.
+  class ParseState;
+  friend class ParseState;
+  friend bool ParseCharClass(StringPiece* s, Regexp** out_re,
+                             RegexpStatus* status);
+
+  // Helper for testing [sic].
+  friend bool RegexpEqualTestingOnly(Regexp*, Regexp*);
+
+  // Computes whether Regexp is already simple.
+  bool ComputeSimple();
+
+  // Constructor that generates a concatenation or alternation,
+  // enforcing the limit on the number of subexpressions for
+  // a particular Regexp.
+  static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs,
+                                   ParseFlags flags, bool can_factor);
+
+  // Returns the leading string that re starts with.
+  // The returned Rune* points into a piece of re,
+  // so it must not be used after the caller calls re->Decref().
+  static Rune* LeadingString(Regexp* re, int* nrune, ParseFlags* flags);
+
+  // Removes the first n leading runes from the beginning of re.
+  // Edits re in place.
+  static void RemoveLeadingString(Regexp* re, int n);
+
+  // Returns the leading regexp in re's top-level concatenation.
+  // The returned Regexp* points at re or a sub-expression of re,
+  // so it must not be used after the caller calls re->Decref().
+  static Regexp* LeadingRegexp(Regexp* re);
+
+  // Removes LeadingRegexp(re) from re and returns the remainder.
+  // Might edit re in place.
+  static Regexp* RemoveLeadingRegexp(Regexp* re);
+
+  // Simplifies an alternation of literal strings by factoring out
+  // common prefixes.
+  static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags);
+  static int FactorAlternationRecursive(Regexp** sub, int nsub,
+                                        ParseFlags flags, int maxdepth);
+
+  // Is a == b?  Only efficient on regexps that have not been through
+  // Simplify yet - the expansion of a kRegexpRepeat will make this
+  // take a long time.  Do not call on such regexps, hence private.
+  static bool Equal(Regexp* a, Regexp* b);
+
+  // Allocate space for n sub-regexps.
+  void AllocSub(int n) {
+    if (n < 0 || static_cast<uint16>(n) != n)
+      LOG(FATAL) << "Cannot AllocSub " << n;
+    if (n > 1)
+      submany_ = new Regexp*[n];
+    nsub_ = n;
+  }
+
+  // Add Rune to LiteralString
+  void AddRuneToString(Rune r);
+
+  // Swaps this with that, in place.
+  void Swap(Regexp *that);
+
+  // Operator.  See description of operators above.
+  // uint8 instead of RegexpOp to control space usage.
+  uint8 op_;
+
+  // Is this regexp structure already simple
+  // (has it been returned by Simplify)?
+  // uint8 instead of bool to control space usage.
+  uint8 simple_;
+
+  // Flags saved from parsing and used during execution.
+  // (Only FoldCase is used.)
+  // uint16 instead of ParseFlags to control space usage.
+  uint16 parse_flags_;
+
+  // Reference count.  Exists so that SimplifyRegexp can build
+  // regexp structures that are dags rather than trees to avoid
+  // exponential blowup in space requirements.
+  // uint16 to control space usage.
+  // The standard regexp routines will never generate a
+  // ref greater than the maximum repeat count (100),
+  // but even so, Incref and Decref consult an overflow map
+  // when ref_ reaches kMaxRef.
+  uint16 ref_;
+  static const uint16 kMaxRef = 0xffff;
+
+  // Subexpressions.
+  // uint16 to control space usage.
+  // Concat and Alternate handle larger numbers of subexpressions
+  // by building concatenation or alternation trees.
+  // Other routines should call Concat or Alternate instead of
+  // filling in sub() by hand.
+  uint16 nsub_;
+  static const uint16 kMaxNsub = 0xffff;
+  union {
+    Regexp** submany_;  // if nsub_ > 1
+    Regexp* subone_;  // if nsub_ == 1
+  };
+
+  // Extra space for parse and teardown stacks.
+  Regexp* down_;
+
+  // Arguments to operator.  See description of operators above.
+  union {
+    struct {  // Repeat
+      int max_;
+      int min_;
+    };
+    struct {  // Capture
+      int cap_;
+      string* name_;
+    };
+    struct {  // LiteralString
+      int nrunes_;
+      Rune* runes_;
+    };
+    struct {  // CharClass
+      // These two could be in separate union members,
+      // but it wouldn't save any space (there are other two-word structs)
+      // and keeping them separate avoids confusion during parsing.
+      CharClass* cc_;
+      CharClassBuilder* ccb_;
+    };
+    Rune rune_;  // Literal
+    int match_id_;  // HaveMatch
+    void *the_union_[2];  // as big as any other element, for memset
+  };
+
+  DISALLOW_EVIL_CONSTRUCTORS(Regexp);
+};
+
+// Character class set: contains non-overlapping, non-abutting RuneRanges.
+typedef set<RuneRange, RuneRangeLess> RuneRangeSet;
+
+class CharClassBuilder {
+ public:
+  CharClassBuilder();
+
+  typedef RuneRangeSet::iterator iterator;
+  iterator begin() { return ranges_.begin(); }
+  iterator end() { return ranges_.end(); }
+
+  int size() { return nrunes_; }
+  bool empty() { return nrunes_ == 0; }
+  bool full() { return nrunes_ == Runemax+1; }
+
+  bool Contains(Rune r);
+  bool FoldsASCII();
+  bool AddRange(Rune lo, Rune hi);  // returns whether class changed
+  CharClassBuilder* Copy();
+  void AddCharClass(CharClassBuilder* cc);
+  void Negate();
+  void RemoveAbove(Rune r);
+  CharClass* GetCharClass();
+  void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags);
+
+ private:
+  static const uint32 AlphaMask = (1<<26) - 1;
+  uint32 upper_;  // bitmap of A-Z
+  uint32 lower_;  // bitmap of a-z
+  int nrunes_;
+  RuneRangeSet ranges_;
+  DISALLOW_EVIL_CONSTRUCTORS(CharClassBuilder);
+};
+
+// Tell g++ that bitwise ops on ParseFlags produce ParseFlags.
+inline Regexp::ParseFlags operator|(Regexp::ParseFlags a, Regexp::ParseFlags b)
+{
+  return static_cast<Regexp::ParseFlags>(static_cast<int>(a) | static_cast<int>(b));
+}
+
+inline Regexp::ParseFlags operator^(Regexp::ParseFlags a, Regexp::ParseFlags b)
+{
+  return static_cast<Regexp::ParseFlags>(static_cast<int>(a) ^ static_cast<int>(b));
+}
+
+inline Regexp::ParseFlags operator&(Regexp::ParseFlags a, Regexp::ParseFlags b)
+{
+  return static_cast<Regexp::ParseFlags>(static_cast<int>(a) & static_cast<int>(b));
+}
+
+inline Regexp::ParseFlags operator~(Regexp::ParseFlags a)
+{
+  return static_cast<Regexp::ParseFlags>(~static_cast<int>(a));
+}
+
+
+
+}  // namespace re2
+
+#endif  // RE2_REGEXP_H__