diff options
Diffstat (limited to 're2/compile.cc')
-rw-r--r-- | re2/compile.cc | 1138 |
1 files changed, 1138 insertions, 0 deletions
diff --git a/re2/compile.cc b/re2/compile.cc new file mode 100644 index 0000000..67c4c2c --- /dev/null +++ b/re2/compile.cc @@ -0,0 +1,1138 @@ +// Copyright 2007 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Compile regular expression to Prog. +// +// Prog and Inst are defined in prog.h. +// This file's external interface is just Regexp::CompileToProg. +// The Compiler class defined in this file is private. + +#include "re2/prog.h" +#include "re2/re2.h" +#include "re2/regexp.h" +#include "re2/walker-inl.h" + +namespace re2 { + +// List of pointers to Inst* that need to be filled in (patched). +// Because the Inst* haven't been filled in yet, +// we can use the Inst* word to hold the list's "next" pointer. +// It's kind of sleazy, but it works well in practice. +// See http://swtch.com/~rsc/regexp/regexp1.html for inspiration. +// +// Because the out and out1 fields in Inst are no longer pointers, +// we can't use pointers directly here either. Instead, p refers +// to inst_[p>>1].out (p&1 == 0) or inst_[p>>1].out1 (p&1 == 1). +// p == 0 represents the NULL list. This is okay because instruction #0 +// is always the fail instruction, which never appears on a list. + +struct PatchList { + uint32 p; + + // Returns patch list containing just p. + static PatchList Mk(uint32 p); + + // Patches all the entries on l to have value v. + // Caller must not ever use patch list again. + static void Patch(Prog::Inst *inst0, PatchList l, uint32 v); + + // Deref returns the next pointer pointed at by p. + static PatchList Deref(Prog::Inst *inst0, PatchList l); + + // Appends two patch lists and returns result. + static PatchList Append(Prog::Inst *inst0, PatchList l1, PatchList l2); +}; + +static PatchList nullPatchList = { 0 }; + +// Returns patch list containing just p. +PatchList PatchList::Mk(uint32 p) { + PatchList l; + l.p = p; + return l; +} + +// Returns the next pointer pointed at by l. +PatchList PatchList::Deref(Prog::Inst* inst0, PatchList l) { + Prog::Inst* ip = &inst0[l.p>>1]; + if (l.p&1) + l.p = ip->out1(); + else + l.p = ip->out(); + return l; +} + +// Patches all the entries on l to have value v. +void PatchList::Patch(Prog::Inst *inst0, PatchList l, uint32 val) { + while (l.p != 0) { + Prog::Inst* ip = &inst0[l.p>>1]; + if (l.p&1) { + l.p = ip->out1(); + ip->out1_ = val; + } else { + l.p = ip->out(); + ip->set_out(val); + } + } +} + +// Appends two patch lists and returns result. +PatchList PatchList::Append(Prog::Inst* inst0, PatchList l1, PatchList l2) { + if (l1.p == 0) + return l2; + if (l2.p == 0) + return l1; + + PatchList l = l1; + for (;;) { + PatchList next = PatchList::Deref(inst0, l); + if (next.p == 0) + break; + l = next; + } + + Prog::Inst* ip = &inst0[l.p>>1]; + if (l.p&1) + ip->out1_ = l2.p; + else + ip->set_out(l2.p); + + return l1; +} + +// Compiled program fragment. +struct Frag { + uint32 begin; + PatchList end; + + Frag() : begin(0) { end.p = 0; } // needed so Frag can go in vector + Frag(uint32 begin, PatchList end) : begin(begin), end(end) {} +}; + +static Frag kNullFrag; + +// Input encodings. +enum Encoding { + kEncodingUTF8 = 1, // UTF-8 (0-10FFFF) + kEncodingLatin1, // Latin1 (0-FF) +}; + +class Compiler : public Regexp::Walker<Frag> { + public: + explicit Compiler(); + ~Compiler(); + + // Compiles Regexp to a new Prog. + // Caller is responsible for deleting Prog when finished with it. + // If reversed is true, compiles for walking over the input + // string backward (reverses all concatenations). + static Prog *Compile(Regexp* re, bool reversed, int64 max_mem); + + // Compiles alternation of all the re to a new Prog. + // Each re has a match with an id equal to its index in the vector. + static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor, + Regexp* re); + + // Interface for Regexp::Walker, which helps traverse the Regexp. + // The walk is purely post-recursive: given the machines for the + // children, PostVisit combines them to create the machine for + // the current node. The child_args are Frags. + // The Compiler traverses the Regexp parse tree, visiting + // each node in depth-first order. It invokes PreVisit before + // visiting the node's children and PostVisit after visiting + // the children. + Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop); + Frag PostVisit(Regexp* re, Frag parent_arg, Frag pre_arg, Frag* child_args, + int nchild_args); + Frag ShortVisit(Regexp* re, Frag parent_arg); + Frag Copy(Frag arg); + + // Given fragment a, returns a+ or a+?; a* or a*?; a? or a?? + Frag Plus(Frag a, bool nongreedy); + Frag Star(Frag a, bool nongreedy); + Frag Quest(Frag a, bool nongreedy); + + // Given fragment a, returns (a) capturing as \n. + Frag Capture(Frag a, int n); + + // Given fragments a and b, returns ab; a|b + Frag Cat(Frag a, Frag b); + Frag Alt(Frag a, Frag b); + + // Returns a fragment that can't match anything. + Frag NoMatch(); + + // Returns a fragment that matches the empty string. + Frag Match(int32 id); + + // Returns a no-op fragment. + Frag Nop(); + + // Returns a fragment matching the byte range lo-hi. + Frag ByteRange(int lo, int hi, bool foldcase); + + // Returns a fragment matching an empty-width special op. + Frag EmptyWidth(EmptyOp op); + + // Adds n instructions to the program. + // Returns the index of the first one. + // Returns -1 if no more instructions are available. + int AllocInst(int n); + + // Deletes unused instructions. + void Trim(); + + // Rune range compiler. + + // Begins a new alternation. + void BeginRange(); + + // Adds a fragment matching the rune range lo-hi. + void AddRuneRange(Rune lo, Rune hi, bool foldcase); + void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase); + void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase); + void Add_80_10ffff(); + + // New suffix that matches the byte range lo-hi, then goes to next. + int RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next); + int UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next); + + // Adds a suffix to alternation. + void AddSuffix(int id); + + // Returns the alternation of all the added suffixes. + Frag EndRange(); + + // Single rune. + Frag Literal(Rune r, bool foldcase); + + void Setup(Regexp::ParseFlags, int64, RE2::Anchor); + Prog* Finish(); + + // Returns .* where dot = any byte + Frag DotStar(); + + private: + Prog* prog_; // Program being built. + bool failed_; // Did we give up compiling? + Encoding encoding_; // Input encoding + bool reversed_; // Should program run backward over text? + + int max_inst_; // Maximum number of instructions. + + Prog::Inst* inst_; // Pointer to first instruction. + int inst_len_; // Number of instructions used. + int inst_cap_; // Number of instructions allocated. + + int64 max_mem_; // Total memory budget. + + map<uint64, int> rune_cache_; + Frag rune_range_; + + RE2::Anchor anchor_; // anchor mode for RE2::Set + + DISALLOW_EVIL_CONSTRUCTORS(Compiler); +}; + +Compiler::Compiler() { + prog_ = new Prog(); + failed_ = false; + encoding_ = kEncodingUTF8; + reversed_ = false; + inst_ = NULL; + inst_len_ = 0; + inst_cap_ = 0; + max_inst_ = 1; // make AllocInst for fail instruction okay + max_mem_ = 0; + int fail = AllocInst(1); + inst_[fail].InitFail(); + max_inst_ = 0; // Caller must change +} + +Compiler::~Compiler() { + delete prog_; + delete[] inst_; +} + +int Compiler::AllocInst(int n) { + if (failed_ || inst_len_ + n > max_inst_) { + failed_ = true; + return -1; + } + + if (inst_len_ + n > inst_cap_) { + if (inst_cap_ == 0) + inst_cap_ = 8; + while (inst_len_ + n > inst_cap_) + inst_cap_ *= 2; + Prog::Inst* ip = new Prog::Inst[inst_cap_]; + memmove(ip, inst_, inst_len_ * sizeof ip[0]); + memset(ip + inst_len_, 0, (inst_cap_ - inst_len_) * sizeof ip[0]); + delete[] inst_; + inst_ = ip; + } + int id = inst_len_; + inst_len_ += n; + return id; +} + +void Compiler::Trim() { + if (inst_len_ < inst_cap_) { + Prog::Inst* ip = new Prog::Inst[inst_len_]; + memmove(ip, inst_, inst_len_ * sizeof ip[0]); + delete[] inst_; + inst_ = ip; + inst_cap_ = inst_len_; + } +} + +// These routines are somewhat hard to visualize in text -- +// see http://swtch.com/~rsc/regexp/regexp1.html for +// pictures explaining what is going on here. + +// Returns an unmatchable fragment. +Frag Compiler::NoMatch() { + return Frag(0, nullPatchList); +} + +// Is a an unmatchable fragment? +static bool IsNoMatch(Frag a) { + return a.begin == 0; +} + +// Given fragments a and b, returns fragment for ab. +Frag Compiler::Cat(Frag a, Frag b) { + if (IsNoMatch(a) || IsNoMatch(b)) + return NoMatch(); + + // Elide no-op. + Prog::Inst* begin = &inst_[a.begin]; + if (begin->opcode() == kInstNop && + a.end.p == (a.begin << 1) && + begin->out() == 0) { + PatchList::Patch(inst_, a.end, b.begin); // in case refs to a somewhere + return b; + } + + // To run backward over string, reverse all concatenations. + if (reversed_) { + PatchList::Patch(inst_, b.end, a.begin); + return Frag(b.begin, a.end); + } + + PatchList::Patch(inst_, a.end, b.begin); + return Frag(a.begin, b.end); +} + +// Given fragments for a and b, returns fragment for a|b. +Frag Compiler::Alt(Frag a, Frag b) { + // Special case for convenience in loops. + if (IsNoMatch(a)) + return b; + if (IsNoMatch(b)) + return a; + + int id = AllocInst(1); + if (id < 0) + return NoMatch(); + + inst_[id].InitAlt(a.begin, b.begin); + return Frag(id, PatchList::Append(inst_, a.end, b.end)); +} + +// When capturing submatches in like-Perl mode, a kOpAlt Inst +// treats out_ as the first choice, out1_ as the second. +// +// For *, +, and ?, if out_ causes another repetition, +// then the operator is greedy. If out1_ is the repetition +// (and out_ moves forward), then the operator is non-greedy. + +// Given a fragment a, returns a fragment for a* or a*? (if nongreedy) +Frag Compiler::Star(Frag a, bool nongreedy) { + int id = AllocInst(1); + if (id < 0) + return NoMatch(); + inst_[id].InitAlt(0, 0); + PatchList::Patch(inst_, a.end, id); + if (nongreedy) { + inst_[id].out1_ = a.begin; + return Frag(id, PatchList::Mk(id << 1)); + } else { + inst_[id].set_out(a.begin); + return Frag(id, PatchList::Mk((id << 1) | 1)); + } +} + +// Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy) +Frag Compiler::Plus(Frag a, bool nongreedy) { + // a+ is just a* with a different entry point. + Frag f = Star(a, nongreedy); + return Frag(a.begin, f.end); +} + +// Given a fragment for a, returns a fragment for a? or a?? (if nongreedy) +Frag Compiler::Quest(Frag a, bool nongreedy) { + int id = AllocInst(1); + if (id < 0) + return NoMatch(); + PatchList pl; + if (nongreedy) { + inst_[id].InitAlt(0, a.begin); + pl = PatchList::Mk(id << 1); + } else { + inst_[id].InitAlt(a.begin, 0); + pl = PatchList::Mk((id << 1) | 1); + } + return Frag(id, PatchList::Append(inst_, pl, a.end)); +} + +// Returns a fragment for the byte range lo-hi. +Frag Compiler::ByteRange(int lo, int hi, bool foldcase) { + int id = AllocInst(1); + if (id < 0) + return NoMatch(); + inst_[id].InitByteRange(lo, hi, foldcase, 0); + prog_->byte_inst_count_++; + prog_->MarkByteRange(lo, hi); + if (foldcase && lo <= 'z' && hi >= 'a') { + if (lo < 'a') + lo = 'a'; + if (hi > 'z') + hi = 'z'; + if (lo <= hi) + prog_->MarkByteRange(lo + 'A' - 'a', hi + 'A' - 'a'); + } + return Frag(id, PatchList::Mk(id << 1)); +} + +// Returns a no-op fragment. Sometimes unavoidable. +Frag Compiler::Nop() { + int id = AllocInst(1); + if (id < 0) + return NoMatch(); + inst_[id].InitNop(0); + return Frag(id, PatchList::Mk(id << 1)); +} + +// Returns a fragment that signals a match. +Frag Compiler::Match(int32 match_id) { + int id = AllocInst(1); + if (id < 0) + return NoMatch(); + inst_[id].InitMatch(match_id); + return Frag(id, nullPatchList); +} + +// Returns a fragment matching a particular empty-width op (like ^ or $) +Frag Compiler::EmptyWidth(EmptyOp empty) { + int id = AllocInst(1); + if (id < 0) + return NoMatch(); + inst_[id].InitEmptyWidth(empty, 0); + if (empty & (kEmptyBeginLine|kEmptyEndLine)) + prog_->MarkByteRange('\n', '\n'); + if (empty & (kEmptyWordBoundary|kEmptyNonWordBoundary)) { + int j; + for (int i = 0; i < 256; i = j) { + for (j = i+1; j < 256 && Prog::IsWordChar(i) == Prog::IsWordChar(j); j++) + ; + prog_->MarkByteRange(i, j-1); + } + } + return Frag(id, PatchList::Mk(id << 1)); +} + +// Given a fragment a, returns a fragment with capturing parens around a. +Frag Compiler::Capture(Frag a, int n) { + int id = AllocInst(2); + if (id < 0) + return NoMatch(); + inst_[id].InitCapture(2*n, a.begin); + inst_[id+1].InitCapture(2*n+1, 0); + PatchList::Patch(inst_, a.end, id+1); + + return Frag(id, PatchList::Mk((id+1) << 1)); +} + +// A Rune is a name for a Unicode code point. +// Returns maximum rune encoded by UTF-8 sequence of length len. +static int MaxRune(int len) { + int b; // number of Rune blents lenn len-byte UTF-8 sequence (len < UTFmax) + if (len == 1) + b = 7; + else + b = 8-(len+1) + 6*(len-1); + return (1<<b) - 1; // maximum Rune for b bits. +} + +// The rune range compiler caches common suffix fragments, +// which are very common in UTF-8 (e.g., [80-bf]). +// The fragment suffixes are identified by their start +// instructions. NULL denotes the eventual end match. +// The Frag accumulates in rune_range_. Caching common +// suffixes reduces the UTF-8 "." from 32 to 24 instructions, +// and it reduces the corresponding one-pass NFA from 16 nodes to 8. + +void Compiler::BeginRange() { + rune_cache_.clear(); + rune_range_.begin = 0; + rune_range_.end = nullPatchList; +} + +int Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, + int next) { + Frag f = ByteRange(lo, hi, foldcase); + if (next != 0) { + PatchList::Patch(inst_, f.end, next); + } else { + rune_range_.end = PatchList::Append(inst_, rune_range_.end, f.end); + } + return f.begin; +} + +int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) { + // In Latin1 mode, there's no point in caching. + // In forward UTF-8 mode, only need to cache continuation bytes. + if (encoding_ == kEncodingLatin1 || + (encoding_ == kEncodingUTF8 && + !reversed_ && + !(0x80 <= lo && hi <= 0xbf))) { + return UncachedRuneByteSuffix(lo, hi, foldcase, next); + } + + uint64 key = ((uint64)next << 17) | (lo<<9) | (hi<<1) | foldcase; + map<uint64, int>::iterator it = rune_cache_.find(key); + if (it != rune_cache_.end()) + return it->second; + int id = UncachedRuneByteSuffix(lo, hi, foldcase, next); + rune_cache_[key] = id; + return id; +} + +void Compiler::AddSuffix(int id) { + if (rune_range_.begin == 0) { + rune_range_.begin = id; + return; + } + + int alt = AllocInst(1); + if (alt < 0) { + rune_range_.begin = 0; + return; + } + inst_[alt].InitAlt(rune_range_.begin, id); + rune_range_.begin = alt; +} + +Frag Compiler::EndRange() { + return rune_range_; +} + +// Converts rune range lo-hi into a fragment that recognizes +// the bytes that would make up those runes in the current +// encoding (Latin 1 or UTF-8). +// This lets the machine work byte-by-byte even when +// using multibyte encodings. + +void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) { + switch (encoding_) { + default: + case kEncodingUTF8: + AddRuneRangeUTF8(lo, hi, foldcase); + break; + case kEncodingLatin1: + AddRuneRangeLatin1(lo, hi, foldcase); + break; + } +} + +void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) { + // Latin1 is easy: runes *are* bytes. + if (lo > hi || lo > 0xFF) + return; + if (hi > 0xFF) + hi = 0xFF; + AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0)); +} + +// Table describing how to make a UTF-8 matching machine +// for the rune range 80-10FFFF (Runeself-Runemax). +// This range happens frequently enough (for example /./ and /[^a-z]/) +// and the rune_cache_ map is slow enough that this is worth +// special handling. Makes compilation of a small expression +// with a dot in it about 10% faster. +// The * in the comments below mark whole sequences. +static struct ByteRangeProg { + int next; + int lo; + int hi; +} prog_80_10ffff[] = { + // Two-byte + { -1, 0x80, 0xBF, }, // 0: 80-BF + { 0, 0xC2, 0xDF, }, // 1: C2-DF 80-BF* + + // Three-byte + { 0, 0xA0, 0xBF, }, // 2: A0-BF 80-BF + { 2, 0xE0, 0xE0, }, // 3: E0 A0-BF 80-BF* + { 0, 0x80, 0xBF, }, // 4: 80-BF 80-BF + { 4, 0xE1, 0xEF, }, // 5: E1-EF 80-BF 80-BF* + + // Four-byte + { 4, 0x90, 0xBF, }, // 6: 90-BF 80-BF 80-BF + { 6, 0xF0, 0xF0, }, // 7: F0 90-BF 80-BF 80-BF* + { 4, 0x80, 0xBF, }, // 8: 80-BF 80-BF 80-BF + { 8, 0xF1, 0xF3, }, // 9: F1-F3 80-BF 80-BF 80-BF* + { 4, 0x80, 0x8F, }, // 10: 80-8F 80-BF 80-BF + { 10, 0xF4, 0xF4, }, // 11: F4 80-8F 80-BF 80-BF* +}; + +void Compiler::Add_80_10ffff() { + int inst[arraysize(prog_80_10ffff)]; + for (int i = 0; i < arraysize(prog_80_10ffff); i++) { + const ByteRangeProg& p = prog_80_10ffff[i]; + int next = 0; + if (p.next >= 0) + next = inst[p.next]; + inst[i] = UncachedRuneByteSuffix(p.lo, p.hi, false, next); + if ((p.lo & 0xC0) != 0x80) + AddSuffix(inst[i]); + } +} + +void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) { + if (lo > hi) + return; + + // Pick off 80-10FFFF as a common special case + // that can bypass the slow rune_cache_. + if (lo == 0x80 && hi == 0x10ffff && !reversed_) { + Add_80_10ffff(); + return; + } + + // Split range into same-length sized ranges. + for (int i = 1; i < UTFmax; i++) { + Rune max = MaxRune(i); + if (lo <= max && max < hi) { + AddRuneRangeUTF8(lo, max, foldcase); + AddRuneRangeUTF8(max+1, hi, foldcase); + return; + } + } + + // ASCII range is always a special case. + if (hi < Runeself) { + AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0)); + return; + } + + // Split range into sections that agree on leading bytes. + for (int i = 1; i < UTFmax; i++) { + uint m = (1<<(6*i)) - 1; // last i bytes of a UTF-8 sequence + if ((lo & ~m) != (hi & ~m)) { + if ((lo & m) != 0) { + AddRuneRangeUTF8(lo, lo|m, foldcase); + AddRuneRangeUTF8((lo|m)+1, hi, foldcase); + return; + } + if ((hi & m) != m) { + AddRuneRangeUTF8(lo, (hi&~m)-1, foldcase); + AddRuneRangeUTF8(hi&~m, hi, foldcase); + return; + } + } + } + + // Finally. Generate byte matching equivalent for lo-hi. + uint8 ulo[UTFmax], uhi[UTFmax]; + int n = runetochar(reinterpret_cast<char*>(ulo), &lo); + int m = runetochar(reinterpret_cast<char*>(uhi), &hi); + (void)m; // USED(m) + DCHECK_EQ(n, m); + + int id = 0; + if (reversed_) { + for (int i = 0; i < n; i++) + id = RuneByteSuffix(ulo[i], uhi[i], false, id); + } else { + for (int i = n-1; i >= 0; i--) + id = RuneByteSuffix(ulo[i], uhi[i], false, id); + } + AddSuffix(id); +} + +// Should not be called. +Frag Compiler::Copy(Frag arg) { + // We're using WalkExponential; there should be no copying. + LOG(DFATAL) << "Compiler::Copy called!"; + failed_ = true; + return NoMatch(); +} + +// Visits a node quickly; called once WalkExponential has +// decided to cut this walk short. +Frag Compiler::ShortVisit(Regexp* re, Frag) { + failed_ = true; + return NoMatch(); +} + +// Called before traversing a node's children during the walk. +Frag Compiler::PreVisit(Regexp* re, Frag, bool* stop) { + // Cut off walk if we've already failed. + if (failed_) + *stop = true; + + return kNullFrag; // not used by caller +} + +Frag Compiler::Literal(Rune r, bool foldcase) { + switch (encoding_) { + default: + return kNullFrag; + + case kEncodingLatin1: + return ByteRange(r, r, foldcase); + + case kEncodingUTF8: { + if (r < Runeself) // Make common case fast. + return ByteRange(r, r, foldcase); + uint8 buf[UTFmax]; + int n = runetochar(reinterpret_cast<char*>(buf), &r); + Frag f = ByteRange((uint8)buf[0], buf[0], false); + for (int i = 1; i < n; i++) + f = Cat(f, ByteRange((uint8)buf[i], buf[i], false)); + return f; + } + } +} + +// Called after traversing the node's children during the walk. +// Given their frags, build and return the frag for this re. +Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags, + int nchild_frags) { + // If a child failed, don't bother going forward, especially + // since the child_frags might contain Frags with NULLs in them. + if (failed_) + return NoMatch(); + + // Given the child fragments, return the fragment for this node. + switch (re->op()) { + case kRegexpRepeat: + // Should not see; code at bottom of function will print error + break; + + case kRegexpNoMatch: + return NoMatch(); + + case kRegexpEmptyMatch: + return Nop(); + + case kRegexpHaveMatch: { + Frag f = Match(re->match_id()); + // Remember unanchored match to end of string. + if (anchor_ != RE2::ANCHOR_BOTH) + f = Cat(DotStar(), f); + return f; + } + + case kRegexpConcat: { + Frag f = child_frags[0]; + for (int i = 1; i < nchild_frags; i++) + f = Cat(f, child_frags[i]); + return f; + } + + case kRegexpAlternate: { + Frag f = child_frags[0]; + for (int i = 1; i < nchild_frags; i++) + f = Alt(f, child_frags[i]); + return f; + } + + case kRegexpStar: + return Star(child_frags[0], re->parse_flags()&Regexp::NonGreedy); + + case kRegexpPlus: + return Plus(child_frags[0], re->parse_flags()&Regexp::NonGreedy); + + case kRegexpQuest: + return Quest(child_frags[0], re->parse_flags()&Regexp::NonGreedy); + + case kRegexpLiteral: + return Literal(re->rune(), re->parse_flags()&Regexp::FoldCase); + + case kRegexpLiteralString: { + // Concatenation of literals. + if (re->nrunes() == 0) + return Nop(); + Frag f; + for (int i = 0; i < re->nrunes(); i++) { + Frag f1 = Literal(re->runes()[i], re->parse_flags()&Regexp::FoldCase); + if (i == 0) + f = f1; + else + f = Cat(f, f1); + } + return f; + } + + case kRegexpAnyChar: + BeginRange(); + AddRuneRange(0, Runemax, false); + return EndRange(); + + case kRegexpAnyByte: + return ByteRange(0x00, 0xFF, false); + + case kRegexpCharClass: { + CharClass* cc = re->cc(); + if (cc->empty()) { + // This can't happen. + LOG(DFATAL) << "No ranges in char class"; + failed_ = true; + return NoMatch(); + } + + // ASCII case-folding optimization: if the char class + // behaves the same on A-Z as it does on a-z, + // discard any ranges wholly contained in A-Z + // and mark the other ranges as foldascii. + // This reduces the size of a program for + // (?i)abc from 3 insts per letter to 1 per letter. + bool foldascii = cc->FoldsASCII(); + + // Character class is just a big OR of the different + // character ranges in the class. + BeginRange(); + for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) { + // ASCII case-folding optimization (see above). + if (foldascii && 'A' <= i->lo && i->hi <= 'Z') + continue; + + // If this range contains all of A-Za-z or none of it, + // the fold flag is unnecessary; don't bother. + bool fold = foldascii; + if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo) + fold = false; + + AddRuneRange(i->lo, i->hi, fold); + } + return EndRange(); + } + + case kRegexpCapture: + // If this is a non-capturing parenthesis -- (?:foo) -- + // just use the inner expression. + if (re->cap() < 0) + return child_frags[0]; + return Capture(child_frags[0], re->cap()); + + case kRegexpBeginLine: + return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine); + + case kRegexpEndLine: + return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine); + + case kRegexpBeginText: + return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText); + + case kRegexpEndText: + return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText); + + case kRegexpWordBoundary: + return EmptyWidth(kEmptyWordBoundary); + + case kRegexpNoWordBoundary: + return EmptyWidth(kEmptyNonWordBoundary); + } + LOG(DFATAL) << "Missing case in Compiler: " << re->op(); + failed_ = true; + return NoMatch(); +} + +// Is this regexp required to start at the beginning of the text? +// Only approximate; can return false for complicated regexps like (\Aa|\Ab), +// but handles (\A(a|b)). Could use the Walker to write a more exact one. +static bool IsAnchorStart(Regexp** pre, int depth) { + Regexp* re = *pre; + Regexp* sub; + // The depth limit makes sure that we don't overflow + // the stack on a deeply nested regexp. As the comment + // above says, IsAnchorStart is conservative, so returning + // a false negative is okay. The exact limit is somewhat arbitrary. + if (re == NULL || depth >= 4) + return false; + switch (re->op()) { + default: + break; + case kRegexpConcat: + if (re->nsub() > 0) { + sub = re->sub()[0]->Incref(); + if (IsAnchorStart(&sub, depth+1)) { + Regexp** subcopy = new Regexp*[re->nsub()]; + subcopy[0] = sub; // already have reference + for (int i = 1; i < re->nsub(); i++) + subcopy[i] = re->sub()[i]->Incref(); + *pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags()); + delete[] subcopy; + re->Decref(); + return true; + } + sub->Decref(); + } + break; + case kRegexpCapture: + sub = re->sub()[0]->Incref(); + if (IsAnchorStart(&sub, depth+1)) { + *pre = Regexp::Capture(sub, re->parse_flags(), re->cap()); + re->Decref(); + return true; + } + sub->Decref(); + break; + case kRegexpBeginText: + *pre = Regexp::LiteralString(NULL, 0, re->parse_flags()); + re->Decref(); + return true; + } + return false; +} + +// Is this regexp required to start at the end of the text? +// Only approximate; can return false for complicated regexps like (a\z|b\z), +// but handles ((a|b)\z). Could use the Walker to write a more exact one. +static bool IsAnchorEnd(Regexp** pre, int depth) { + Regexp* re = *pre; + Regexp* sub; + // The depth limit makes sure that we don't overflow + // the stack on a deeply nested regexp. As the comment + // above says, IsAnchorEnd is conservative, so returning + // a false negative is okay. The exact limit is somewhat arbitrary. + if (re == NULL || depth >= 4) + return false; + switch (re->op()) { + default: + break; + case kRegexpConcat: + if (re->nsub() > 0) { + sub = re->sub()[re->nsub() - 1]->Incref(); + if (IsAnchorEnd(&sub, depth+1)) { + Regexp** subcopy = new Regexp*[re->nsub()]; + subcopy[re->nsub() - 1] = sub; // already have reference + for (int i = 0; i < re->nsub() - 1; i++) + subcopy[i] = re->sub()[i]->Incref(); + *pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags()); + delete[] subcopy; + re->Decref(); + return true; + } + sub->Decref(); + } + break; + case kRegexpCapture: + sub = re->sub()[0]->Incref(); + if (IsAnchorEnd(&sub, depth+1)) { + *pre = Regexp::Capture(sub, re->parse_flags(), re->cap()); + re->Decref(); + return true; + } + sub->Decref(); + break; + case kRegexpEndText: + *pre = Regexp::LiteralString(NULL, 0, re->parse_flags()); + re->Decref(); + return true; + } + return false; +} + +void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem, + RE2::Anchor anchor) { + prog_->set_flags(flags); + + if (flags & Regexp::Latin1) + encoding_ = kEncodingLatin1; + max_mem_ = max_mem; + if (max_mem <= 0) { + max_inst_ = 100000; // more than enough + } else if (max_mem <= sizeof(Prog)) { + // No room for anything. + max_inst_ = 0; + } else { + int64 m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst); + // Limit instruction count so that inst->id() fits nicely in an int. + // SparseArray also assumes that the indices (inst->id()) are ints. + // The call to WalkExponential uses 2*max_inst_ below, + // and other places in the code use 2 or 3 * prog->size(). + // Limiting to 2^24 should avoid overflow in those places. + // (The point of allowing more than 32 bits of memory is to + // have plenty of room for the DFA states, not to use it up + // on the program.) + if (m >= 1<<24) + m = 1<<24; + + // Inst imposes its own limit (currently bigger than 2^24 but be safe). + if (m > Prog::Inst::kMaxInst) + m = Prog::Inst::kMaxInst; + + max_inst_ = m; + } + + anchor_ = anchor; +} + +// Compiles re, returning program. +// Caller is responsible for deleting prog_. +// If reversed is true, compiles a program that expects +// to run over the input string backward (reverses all concatenations). +// The reversed flag is also recorded in the returned program. +Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) { + Compiler c; + + c.Setup(re->parse_flags(), max_mem, RE2::ANCHOR_BOTH /* unused */); + c.reversed_ = reversed; + + // Simplify to remove things like counted repetitions + // and character classes like \d. + Regexp* sre = re->Simplify(); + if (sre == NULL) + return NULL; + + // Record whether prog is anchored, removing the anchors. + // (They get in the way of other optimizations.) + bool is_anchor_start = IsAnchorStart(&sre, 0); + bool is_anchor_end = IsAnchorEnd(&sre, 0); + + // Generate fragment for entire regexp. + Frag f = c.WalkExponential(sre, kNullFrag, 2*c.max_inst_); + sre->Decref(); + if (c.failed_) + return NULL; + + // Success! Finish by putting Match node at end, and record start. + // Turn off c.reversed_ (if it is set) to force the remaining concatenations + // to behave normally. + c.reversed_ = false; + Frag all = c.Cat(f, c.Match(0)); + c.prog_->set_start(all.begin); + + if (reversed) { + c.prog_->set_anchor_start(is_anchor_end); + c.prog_->set_anchor_end(is_anchor_start); + } else { + c.prog_->set_anchor_start(is_anchor_start); + c.prog_->set_anchor_end(is_anchor_end); + } + + // Also create unanchored version, which starts with a .*? loop. + if (c.prog_->anchor_start()) { + c.prog_->set_start_unanchored(c.prog_->start()); + } else { + Frag unanchored = c.Cat(c.DotStar(), all); + c.prog_->set_start_unanchored(unanchored.begin); + } + + c.prog_->set_reversed(reversed); + + // Hand ownership of prog_ to caller. + return c.Finish(); +} + +Prog* Compiler::Finish() { + if (failed_) + return NULL; + + if (prog_->start() == 0 && prog_->start_unanchored() == 0) { + // No possible matches; keep Fail instruction only. + inst_len_ = 1; + } + + // Trim instruction to minimum array and transfer to Prog. + Trim(); + prog_->inst_ = inst_; + prog_->size_ = inst_len_; + inst_ = NULL; + + // Compute byte map. + prog_->ComputeByteMap(); + + prog_->Optimize(); + + // Record remaining memory for DFA. + if (max_mem_ <= 0) { + prog_->set_dfa_mem(1<<20); + } else { + int64 m = max_mem_ - sizeof(Prog) - inst_len_*sizeof(Prog::Inst); + if (m < 0) + m = 0; + prog_->set_dfa_mem(m); + } + + Prog* p = prog_; + prog_ = NULL; + return p; +} + +// Converts Regexp to Prog. +Prog* Regexp::CompileToProg(int64 max_mem) { + return Compiler::Compile(this, false, max_mem); +} + +Prog* Regexp::CompileToReverseProg(int64 max_mem) { + return Compiler::Compile(this, true, max_mem); +} + +Frag Compiler::DotStar() { + return Star(ByteRange(0x00, 0xff, false), true); +} + +// Compiles RE set to Prog. +Prog* Compiler::CompileSet(const RE2::Options& options, RE2::Anchor anchor, + Regexp* re) { + Compiler c; + + Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(options.ParseFlags()); + c.Setup(pf, options.max_mem(), anchor); + + // Compile alternation of fragments. + Frag all = c.WalkExponential(re, kNullFrag, 2*c.max_inst_); + re->Decref(); + if (c.failed_) + return NULL; + + if (anchor == RE2::UNANCHORED) { + // The trailing .* was added while handling kRegexpHaveMatch. + // We just have to add the leading one. + all = c.Cat(c.DotStar(), all); + } + + c.prog_->set_start(all.begin); + c.prog_->set_start_unanchored(all.begin); + c.prog_->set_anchor_start(true); + c.prog_->set_anchor_end(true); + + Prog* prog = c.Finish(); + if (prog == NULL) + return NULL; + + // Make sure DFA has enough memory to operate, + // since we're not going to fall back to the NFA. + bool failed; + StringPiece sp = "hello, world"; + prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch, + NULL, &failed, NULL); + if (failed) { + delete prog; + return NULL; + } + + return prog; +} + +Prog* Prog::CompileSet(const RE2::Options& options, RE2::Anchor anchor, + Regexp* re) { + return Compiler::CompileSet(options, anchor, re); +} + +} // namespace re2 |