diff options
Diffstat (limited to 're2/prog.h')
-rw-r--r-- | re2/prog.h | 376 |
1 files changed, 376 insertions, 0 deletions
diff --git a/re2/prog.h b/re2/prog.h new file mode 100644 index 0000000..2cf65bc --- /dev/null +++ b/re2/prog.h @@ -0,0 +1,376 @@ +// Copyright 2007 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Compiled representation of regular expressions. +// See regexp.h for the Regexp class, which represents a regular +// expression symbolically. + +#ifndef RE2_PROG_H__ +#define RE2_PROG_H__ + +#include "util/util.h" +#include "re2/re2.h" + +namespace re2 { + +// Simple fixed-size bitmap. +template<int Bits> +class Bitmap { + public: + Bitmap() { Reset(); } + int Size() { return Bits; } + + void Reset() { + for (int i = 0; i < Words; i++) + w_[i] = 0; + } + bool Get(int k) const { + return w_[k >> WordLog] & (1<<(k & 31)); + } + void Set(int k) { + w_[k >> WordLog] |= 1<<(k & 31); + } + void Clear(int k) { + w_[k >> WordLog] &= ~(1<<(k & 31)); + } + uint32 Word(int i) const { + return w_[i]; + } + + private: + static const int WordLog = 5; + static const int Words = (Bits+31)/32; + uint32 w_[Words]; + DISALLOW_EVIL_CONSTRUCTORS(Bitmap); +}; + + +// Opcodes for Inst +enum InstOp { + kInstAlt = 0, // choose between out_ and out1_ + kInstAltMatch, // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa. + kInstByteRange, // next (possible case-folded) byte must be in [lo_, hi_] + kInstCapture, // capturing parenthesis number cap_ + kInstEmptyWidth, // empty-width special (^ $ ...); bit(s) set in empty_ + kInstMatch, // found a match! + kInstNop, // no-op; occasionally unavoidable + kInstFail, // never match; occasionally unavoidable +}; + +// Bit flags for empty-width specials +enum EmptyOp { + kEmptyBeginLine = 1<<0, // ^ - beginning of line + kEmptyEndLine = 1<<1, // $ - end of line + kEmptyBeginText = 1<<2, // \A - beginning of text + kEmptyEndText = 1<<3, // \z - end of text + kEmptyWordBoundary = 1<<4, // \b - word boundary + kEmptyNonWordBoundary = 1<<5, // \B - not \b + kEmptyAllFlags = (1<<6)-1, +}; + +class Regexp; + +class DFA; +struct OneState; + +// Compiled form of regexp program. +class Prog { + public: + Prog(); + ~Prog(); + + // Single instruction in regexp program. + class Inst { + public: + Inst() : out_opcode_(0), out1_(0) { } + + // Constructors per opcode + void InitAlt(uint32 out, uint32 out1); + void InitByteRange(int lo, int hi, int foldcase, uint32 out); + void InitCapture(int cap, uint32 out); + void InitEmptyWidth(EmptyOp empty, uint32 out); + void InitMatch(int id); + void InitNop(uint32 out); + void InitFail(); + + // Getters + int id(Prog* p) { return this - p->inst_; } + InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); } + int out() { return out_opcode_>>3; } + int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; } + int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; } + int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; } + int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; } + int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; } + int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; } + EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; } + bool greedy(Prog *p) { + DCHECK_EQ(opcode(), kInstAltMatch); + return p->inst(out())->opcode() == kInstByteRange; + } + + // Does this inst (an kInstByteRange) match c? + inline bool Matches(int c) { + DCHECK_EQ(opcode(), kInstByteRange); + if (foldcase_ && 'A' <= c && c <= 'Z') + c += 'a' - 'A'; + return lo_ <= c && c <= hi_; + } + + // Returns string representation for debugging. + string Dump(); + + // Maximum instruction id. + // (Must fit in out_opcode_, and PatchList steals another bit.) + static const int kMaxInst = (1<<28) - 1; + + private: + void set_opcode(InstOp opcode) { + out_opcode_ = (out()<<3) | opcode; + } + + void set_out(int out) { + out_opcode_ = (out<<3) | opcode(); + } + + void set_out_opcode(int out, InstOp opcode) { + out_opcode_ = (out<<3) | opcode; + } + + uint32 out_opcode_; // 29 bits of out, 3 (low) bits opcode + union { // additional instruction arguments: + uint32 out1_; // opcode == kInstAlt + // alternate next instruction + + int32 cap_; // opcode == kInstCapture + // Index of capture register (holds text + // position recorded by capturing parentheses). + // For \n (the submatch for the nth parentheses), + // the left parenthesis captures into register 2*n + // and the right one captures into register 2*n+1. + + int32 match_id_; // opcode == kInstMatch + // Match ID to identify this match (for re2::Set). + + struct { // opcode == kInstByteRange + uint8 lo_; // byte range is lo_-hi_ inclusive + uint8 hi_; // + uint8 foldcase_; // convert A-Z to a-z before checking range. + }; + + EmptyOp empty_; // opcode == kInstEmptyWidth + // empty_ is bitwise OR of kEmpty* flags above. + }; + + friend class Compiler; + friend struct PatchList; + friend class Prog; + + DISALLOW_EVIL_CONSTRUCTORS(Inst); + }; + + // Whether to anchor the search. + enum Anchor { + kUnanchored, // match anywhere + kAnchored, // match only starting at beginning of text + }; + + // Kind of match to look for (for anchor != kFullMatch) + // + // kLongestMatch mode finds the overall longest + // match but still makes its submatch choices the way + // Perl would, not in the way prescribed by POSIX. + // The POSIX rules are much more expensive to implement, + // and no one has needed them. + // + // kFullMatch is not strictly necessary -- we could use + // kLongestMatch and then check the length of the match -- but + // the matching code can run faster if it knows to consider only + // full matches. + enum MatchKind { + kFirstMatch, // like Perl, PCRE + kLongestMatch, // like egrep or POSIX + kFullMatch, // match only entire text; implies anchor==kAnchored + kManyMatch // for SearchDFA, records set of matches + }; + + Inst *inst(int id) { return &inst_[id]; } + int start() { return start_; } + int start_unanchored() { return start_unanchored_; } + void set_start(int start) { start_ = start; } + void set_start_unanchored(int start) { start_unanchored_ = start; } + int64 size() { return size_; } + bool reversed() { return reversed_; } + void set_reversed(bool reversed) { reversed_ = reversed; } + int64 byte_inst_count() { return byte_inst_count_; } + const Bitmap<256>& byterange() { return byterange_; } + void set_dfa_mem(int64 dfa_mem) { dfa_mem_ = dfa_mem; } + int64 dfa_mem() { return dfa_mem_; } + int flags() { return flags_; } + void set_flags(int flags) { flags_ = flags; } + bool anchor_start() { return anchor_start_; } + void set_anchor_start(bool b) { anchor_start_ = b; } + bool anchor_end() { return anchor_end_; } + void set_anchor_end(bool b) { anchor_end_ = b; } + int bytemap_range() { return bytemap_range_; } + const uint8* bytemap() { return bytemap_; } + + // Returns string representation of program for debugging. + string Dump(); + string DumpUnanchored(); + + // Record that at some point in the prog, the bytes in the range + // lo-hi (inclusive) are treated as different from bytes outside the range. + // Tracking this lets the DFA collapse commonly-treated byte ranges + // when recording state pointers, greatly reducing its memory footprint. + void MarkByteRange(int lo, int hi); + + // Returns the set of kEmpty flags that are in effect at + // position p within context. + static uint32 EmptyFlags(const StringPiece& context, const char* p); + + // Returns whether byte c is a word character: ASCII only. + // Used by the implementation of \b and \B. + // This is not right for Unicode, but: + // - it's hard to get right in a byte-at-a-time matching world + // (the DFA has only one-byte lookahead). + // - even if the lookahead were possible, the Progs would be huge. + // This crude approximation is the same one PCRE uses. + static bool IsWordChar(uint8 c) { + return ('A' <= c && c <= 'Z') || + ('a' <= c && c <= 'z') || + ('0' <= c && c <= '9') || + c == '_'; + } + + // Execution engines. They all search for the regexp (run the prog) + // in text, which is in the larger context (used for ^ $ \b etc). + // Anchor and kind control the kind of search. + // Returns true if match found, false if not. + // If match found, fills match[0..nmatch-1] with submatch info. + // match[0] is overall match, match[1] is first set of parens, etc. + // If a particular submatch is not matched during the regexp match, + // it is set to NULL. + // + // Matching text == StringPiece(NULL, 0) is treated as any other empty + // string, but note that on return, it will not be possible to distinguish + // submatches that matched that empty string from submatches that didn't + // match anything. Either way, match[i] == NULL. + + // Search using NFA: can find submatches but kind of slow. + bool SearchNFA(const StringPiece& text, const StringPiece& context, + Anchor anchor, MatchKind kind, + StringPiece* match, int nmatch); + + // Search using DFA: much faster than NFA but only finds + // end of match and can use a lot more memory. + // Returns whether a match was found. + // If the DFA runs out of memory, sets *failed to true and returns false. + // If matches != NULL and kind == kManyMatch and there is a match, + // SearchDFA fills matches with the match IDs of the final matching state. + bool SearchDFA(const StringPiece& text, const StringPiece& context, + Anchor anchor, MatchKind kind, + StringPiece* match0, bool* failed, + vector<int>* matches); + + // Build the entire DFA for the given match kind. FOR TESTING ONLY. + // Usually the DFA is built out incrementally, as needed, which + // avoids lots of unnecessary work. This function is useful only + // for testing purposes. Returns number of states. + int BuildEntireDFA(MatchKind kind); + + // Compute byte map. + void ComputeByteMap(); + + // Run peep-hole optimizer on program. + void Optimize(); + + // One-pass NFA: only correct if IsOnePass() is true, + // but much faster than NFA (competitive with PCRE) + // for those expressions. + bool IsOnePass(); + bool SearchOnePass(const StringPiece& text, const StringPiece& context, + Anchor anchor, MatchKind kind, + StringPiece* match, int nmatch); + + // Bit-state backtracking. Fast on small cases but uses memory + // proportional to the product of the program size and the text size. + bool SearchBitState(const StringPiece& text, const StringPiece& context, + Anchor anchor, MatchKind kind, + StringPiece* match, int nmatch); + + static const int kMaxOnePassCapture = 5; // $0 through $4 + + // Backtracking search: the gold standard against which the other + // implementations are checked. FOR TESTING ONLY. + // It allocates a ton of memory to avoid running forever. + // It is also recursive, so can't use in production (will overflow stacks). + // The name "Unsafe" here is supposed to be a flag that + // you should not be using this function. + bool UnsafeSearchBacktrack(const StringPiece& text, + const StringPiece& context, + Anchor anchor, MatchKind kind, + StringPiece* match, int nmatch); + + // Computes range for any strings matching regexp. The min and max can in + // some cases be arbitrarily precise, so the caller gets to specify the + // maximum desired length of string returned. + // + // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any + // string s that is an anchored match for this regexp satisfies + // min <= s && s <= max. + // + // Note that PossibleMatchRange() will only consider the first copy of an + // infinitely repeated element (i.e., any regexp element followed by a '*' or + // '+' operator). Regexps with "{N}" constructions are not affected, as those + // do not compile down to infinite repetitions. + // + // Returns true on success, false on error. + bool PossibleMatchRange(string* min, string* max, int maxlen); + + // Compiles a collection of regexps to Prog. Each regexp will have + // its own Match instruction recording the index in the vector. + static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor, + Regexp* re); + + private: + friend class Compiler; + + DFA* GetDFA(MatchKind kind); + + bool anchor_start_; // regexp has explicit start anchor + bool anchor_end_; // regexp has explicit end anchor + bool reversed_; // whether program runs backward over input + bool did_onepass_; // has IsOnePass been called? + + int start_; // entry point for program + int start_unanchored_; // unanchored entry point for program + int size_; // number of instructions + int byte_inst_count_; // number of kInstByteRange instructions + int bytemap_range_; // bytemap_[x] < bytemap_range_ + int flags_; // regexp parse flags + int onepass_statesize_; // byte size of each OneState* node + + Inst* inst_; // pointer to instruction array + + Mutex dfa_mutex_; // Protects dfa_first_, dfa_longest_ + DFA* volatile dfa_first_; // DFA cached for kFirstMatch + DFA* volatile dfa_longest_; // DFA cached for kLongestMatch and kFullMatch + int64 dfa_mem_; // Maximum memory for DFAs. + void (*delete_dfa_)(DFA* dfa); + + Bitmap<256> byterange_; // byterange.Get(x) true if x ends a + // commonly-treated byte range. + uint8 bytemap_[256]; // map from input bytes to byte classes + uint8 *unbytemap_; // bytemap_[unbytemap_[x]] == x + + uint8* onepass_nodes_; // data for OnePass nodes + OneState* onepass_start_; // start node for OnePass program + + DISALLOW_EVIL_CONSTRUCTORS(Prog); +}; + +} // namespace re2 + +#endif // RE2_PROG_H__ |