aboutsummaryrefslogtreecommitdiff
path: root/re2/prog.h
diff options
context:
space:
mode:
Diffstat (limited to 're2/prog.h')
-rw-r--r--re2/prog.h376
1 files changed, 376 insertions, 0 deletions
diff --git a/re2/prog.h b/re2/prog.h
new file mode 100644
index 0000000..2cf65bc
--- /dev/null
+++ b/re2/prog.h
@@ -0,0 +1,376 @@
+// Copyright 2007 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Compiled representation of regular expressions.
+// See regexp.h for the Regexp class, which represents a regular
+// expression symbolically.
+
+#ifndef RE2_PROG_H__
+#define RE2_PROG_H__
+
+#include "util/util.h"
+#include "re2/re2.h"
+
+namespace re2 {
+
+// Simple fixed-size bitmap.
+template<int Bits>
+class Bitmap {
+ public:
+ Bitmap() { Reset(); }
+ int Size() { return Bits; }
+
+ void Reset() {
+ for (int i = 0; i < Words; i++)
+ w_[i] = 0;
+ }
+ bool Get(int k) const {
+ return w_[k >> WordLog] & (1<<(k & 31));
+ }
+ void Set(int k) {
+ w_[k >> WordLog] |= 1<<(k & 31);
+ }
+ void Clear(int k) {
+ w_[k >> WordLog] &= ~(1<<(k & 31));
+ }
+ uint32 Word(int i) const {
+ return w_[i];
+ }
+
+ private:
+ static const int WordLog = 5;
+ static const int Words = (Bits+31)/32;
+ uint32 w_[Words];
+ DISALLOW_EVIL_CONSTRUCTORS(Bitmap);
+};
+
+
+// Opcodes for Inst
+enum InstOp {
+ kInstAlt = 0, // choose between out_ and out1_
+ kInstAltMatch, // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa.
+ kInstByteRange, // next (possible case-folded) byte must be in [lo_, hi_]
+ kInstCapture, // capturing parenthesis number cap_
+ kInstEmptyWidth, // empty-width special (^ $ ...); bit(s) set in empty_
+ kInstMatch, // found a match!
+ kInstNop, // no-op; occasionally unavoidable
+ kInstFail, // never match; occasionally unavoidable
+};
+
+// Bit flags for empty-width specials
+enum EmptyOp {
+ kEmptyBeginLine = 1<<0, // ^ - beginning of line
+ kEmptyEndLine = 1<<1, // $ - end of line
+ kEmptyBeginText = 1<<2, // \A - beginning of text
+ kEmptyEndText = 1<<3, // \z - end of text
+ kEmptyWordBoundary = 1<<4, // \b - word boundary
+ kEmptyNonWordBoundary = 1<<5, // \B - not \b
+ kEmptyAllFlags = (1<<6)-1,
+};
+
+class Regexp;
+
+class DFA;
+struct OneState;
+
+// Compiled form of regexp program.
+class Prog {
+ public:
+ Prog();
+ ~Prog();
+
+ // Single instruction in regexp program.
+ class Inst {
+ public:
+ Inst() : out_opcode_(0), out1_(0) { }
+
+ // Constructors per opcode
+ void InitAlt(uint32 out, uint32 out1);
+ void InitByteRange(int lo, int hi, int foldcase, uint32 out);
+ void InitCapture(int cap, uint32 out);
+ void InitEmptyWidth(EmptyOp empty, uint32 out);
+ void InitMatch(int id);
+ void InitNop(uint32 out);
+ void InitFail();
+
+ // Getters
+ int id(Prog* p) { return this - p->inst_; }
+ InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); }
+ int out() { return out_opcode_>>3; }
+ int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; }
+ int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; }
+ int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; }
+ int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; }
+ int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; }
+ int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; }
+ EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }
+ bool greedy(Prog *p) {
+ DCHECK_EQ(opcode(), kInstAltMatch);
+ return p->inst(out())->opcode() == kInstByteRange;
+ }
+
+ // Does this inst (an kInstByteRange) match c?
+ inline bool Matches(int c) {
+ DCHECK_EQ(opcode(), kInstByteRange);
+ if (foldcase_ && 'A' <= c && c <= 'Z')
+ c += 'a' - 'A';
+ return lo_ <= c && c <= hi_;
+ }
+
+ // Returns string representation for debugging.
+ string Dump();
+
+ // Maximum instruction id.
+ // (Must fit in out_opcode_, and PatchList steals another bit.)
+ static const int kMaxInst = (1<<28) - 1;
+
+ private:
+ void set_opcode(InstOp opcode) {
+ out_opcode_ = (out()<<3) | opcode;
+ }
+
+ void set_out(int out) {
+ out_opcode_ = (out<<3) | opcode();
+ }
+
+ void set_out_opcode(int out, InstOp opcode) {
+ out_opcode_ = (out<<3) | opcode;
+ }
+
+ uint32 out_opcode_; // 29 bits of out, 3 (low) bits opcode
+ union { // additional instruction arguments:
+ uint32 out1_; // opcode == kInstAlt
+ // alternate next instruction
+
+ int32 cap_; // opcode == kInstCapture
+ // Index of capture register (holds text
+ // position recorded by capturing parentheses).
+ // For \n (the submatch for the nth parentheses),
+ // the left parenthesis captures into register 2*n
+ // and the right one captures into register 2*n+1.
+
+ int32 match_id_; // opcode == kInstMatch
+ // Match ID to identify this match (for re2::Set).
+
+ struct { // opcode == kInstByteRange
+ uint8 lo_; // byte range is lo_-hi_ inclusive
+ uint8 hi_; //
+ uint8 foldcase_; // convert A-Z to a-z before checking range.
+ };
+
+ EmptyOp empty_; // opcode == kInstEmptyWidth
+ // empty_ is bitwise OR of kEmpty* flags above.
+ };
+
+ friend class Compiler;
+ friend struct PatchList;
+ friend class Prog;
+
+ DISALLOW_EVIL_CONSTRUCTORS(Inst);
+ };
+
+ // Whether to anchor the search.
+ enum Anchor {
+ kUnanchored, // match anywhere
+ kAnchored, // match only starting at beginning of text
+ };
+
+ // Kind of match to look for (for anchor != kFullMatch)
+ //
+ // kLongestMatch mode finds the overall longest
+ // match but still makes its submatch choices the way
+ // Perl would, not in the way prescribed by POSIX.
+ // The POSIX rules are much more expensive to implement,
+ // and no one has needed them.
+ //
+ // kFullMatch is not strictly necessary -- we could use
+ // kLongestMatch and then check the length of the match -- but
+ // the matching code can run faster if it knows to consider only
+ // full matches.
+ enum MatchKind {
+ kFirstMatch, // like Perl, PCRE
+ kLongestMatch, // like egrep or POSIX
+ kFullMatch, // match only entire text; implies anchor==kAnchored
+ kManyMatch // for SearchDFA, records set of matches
+ };
+
+ Inst *inst(int id) { return &inst_[id]; }
+ int start() { return start_; }
+ int start_unanchored() { return start_unanchored_; }
+ void set_start(int start) { start_ = start; }
+ void set_start_unanchored(int start) { start_unanchored_ = start; }
+ int64 size() { return size_; }
+ bool reversed() { return reversed_; }
+ void set_reversed(bool reversed) { reversed_ = reversed; }
+ int64 byte_inst_count() { return byte_inst_count_; }
+ const Bitmap<256>& byterange() { return byterange_; }
+ void set_dfa_mem(int64 dfa_mem) { dfa_mem_ = dfa_mem; }
+ int64 dfa_mem() { return dfa_mem_; }
+ int flags() { return flags_; }
+ void set_flags(int flags) { flags_ = flags; }
+ bool anchor_start() { return anchor_start_; }
+ void set_anchor_start(bool b) { anchor_start_ = b; }
+ bool anchor_end() { return anchor_end_; }
+ void set_anchor_end(bool b) { anchor_end_ = b; }
+ int bytemap_range() { return bytemap_range_; }
+ const uint8* bytemap() { return bytemap_; }
+
+ // Returns string representation of program for debugging.
+ string Dump();
+ string DumpUnanchored();
+
+ // Record that at some point in the prog, the bytes in the range
+ // lo-hi (inclusive) are treated as different from bytes outside the range.
+ // Tracking this lets the DFA collapse commonly-treated byte ranges
+ // when recording state pointers, greatly reducing its memory footprint.
+ void MarkByteRange(int lo, int hi);
+
+ // Returns the set of kEmpty flags that are in effect at
+ // position p within context.
+ static uint32 EmptyFlags(const StringPiece& context, const char* p);
+
+ // Returns whether byte c is a word character: ASCII only.
+ // Used by the implementation of \b and \B.
+ // This is not right for Unicode, but:
+ // - it's hard to get right in a byte-at-a-time matching world
+ // (the DFA has only one-byte lookahead).
+ // - even if the lookahead were possible, the Progs would be huge.
+ // This crude approximation is the same one PCRE uses.
+ static bool IsWordChar(uint8 c) {
+ return ('A' <= c && c <= 'Z') ||
+ ('a' <= c && c <= 'z') ||
+ ('0' <= c && c <= '9') ||
+ c == '_';
+ }
+
+ // Execution engines. They all search for the regexp (run the prog)
+ // in text, which is in the larger context (used for ^ $ \b etc).
+ // Anchor and kind control the kind of search.
+ // Returns true if match found, false if not.
+ // If match found, fills match[0..nmatch-1] with submatch info.
+ // match[0] is overall match, match[1] is first set of parens, etc.
+ // If a particular submatch is not matched during the regexp match,
+ // it is set to NULL.
+ //
+ // Matching text == StringPiece(NULL, 0) is treated as any other empty
+ // string, but note that on return, it will not be possible to distinguish
+ // submatches that matched that empty string from submatches that didn't
+ // match anything. Either way, match[i] == NULL.
+
+ // Search using NFA: can find submatches but kind of slow.
+ bool SearchNFA(const StringPiece& text, const StringPiece& context,
+ Anchor anchor, MatchKind kind,
+ StringPiece* match, int nmatch);
+
+ // Search using DFA: much faster than NFA but only finds
+ // end of match and can use a lot more memory.
+ // Returns whether a match was found.
+ // If the DFA runs out of memory, sets *failed to true and returns false.
+ // If matches != NULL and kind == kManyMatch and there is a match,
+ // SearchDFA fills matches with the match IDs of the final matching state.
+ bool SearchDFA(const StringPiece& text, const StringPiece& context,
+ Anchor anchor, MatchKind kind,
+ StringPiece* match0, bool* failed,
+ vector<int>* matches);
+
+ // Build the entire DFA for the given match kind. FOR TESTING ONLY.
+ // Usually the DFA is built out incrementally, as needed, which
+ // avoids lots of unnecessary work. This function is useful only
+ // for testing purposes. Returns number of states.
+ int BuildEntireDFA(MatchKind kind);
+
+ // Compute byte map.
+ void ComputeByteMap();
+
+ // Run peep-hole optimizer on program.
+ void Optimize();
+
+ // One-pass NFA: only correct if IsOnePass() is true,
+ // but much faster than NFA (competitive with PCRE)
+ // for those expressions.
+ bool IsOnePass();
+ bool SearchOnePass(const StringPiece& text, const StringPiece& context,
+ Anchor anchor, MatchKind kind,
+ StringPiece* match, int nmatch);
+
+ // Bit-state backtracking. Fast on small cases but uses memory
+ // proportional to the product of the program size and the text size.
+ bool SearchBitState(const StringPiece& text, const StringPiece& context,
+ Anchor anchor, MatchKind kind,
+ StringPiece* match, int nmatch);
+
+ static const int kMaxOnePassCapture = 5; // $0 through $4
+
+ // Backtracking search: the gold standard against which the other
+ // implementations are checked. FOR TESTING ONLY.
+ // It allocates a ton of memory to avoid running forever.
+ // It is also recursive, so can't use in production (will overflow stacks).
+ // The name "Unsafe" here is supposed to be a flag that
+ // you should not be using this function.
+ bool UnsafeSearchBacktrack(const StringPiece& text,
+ const StringPiece& context,
+ Anchor anchor, MatchKind kind,
+ StringPiece* match, int nmatch);
+
+ // Computes range for any strings matching regexp. The min and max can in
+ // some cases be arbitrarily precise, so the caller gets to specify the
+ // maximum desired length of string returned.
+ //
+ // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
+ // string s that is an anchored match for this regexp satisfies
+ // min <= s && s <= max.
+ //
+ // Note that PossibleMatchRange() will only consider the first copy of an
+ // infinitely repeated element (i.e., any regexp element followed by a '*' or
+ // '+' operator). Regexps with "{N}" constructions are not affected, as those
+ // do not compile down to infinite repetitions.
+ //
+ // Returns true on success, false on error.
+ bool PossibleMatchRange(string* min, string* max, int maxlen);
+
+ // Compiles a collection of regexps to Prog. Each regexp will have
+ // its own Match instruction recording the index in the vector.
+ static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
+ Regexp* re);
+
+ private:
+ friend class Compiler;
+
+ DFA* GetDFA(MatchKind kind);
+
+ bool anchor_start_; // regexp has explicit start anchor
+ bool anchor_end_; // regexp has explicit end anchor
+ bool reversed_; // whether program runs backward over input
+ bool did_onepass_; // has IsOnePass been called?
+
+ int start_; // entry point for program
+ int start_unanchored_; // unanchored entry point for program
+ int size_; // number of instructions
+ int byte_inst_count_; // number of kInstByteRange instructions
+ int bytemap_range_; // bytemap_[x] < bytemap_range_
+ int flags_; // regexp parse flags
+ int onepass_statesize_; // byte size of each OneState* node
+
+ Inst* inst_; // pointer to instruction array
+
+ Mutex dfa_mutex_; // Protects dfa_first_, dfa_longest_
+ DFA* volatile dfa_first_; // DFA cached for kFirstMatch
+ DFA* volatile dfa_longest_; // DFA cached for kLongestMatch and kFullMatch
+ int64 dfa_mem_; // Maximum memory for DFAs.
+ void (*delete_dfa_)(DFA* dfa);
+
+ Bitmap<256> byterange_; // byterange.Get(x) true if x ends a
+ // commonly-treated byte range.
+ uint8 bytemap_[256]; // map from input bytes to byte classes
+ uint8 *unbytemap_; // bytemap_[unbytemap_[x]] == x
+
+ uint8* onepass_nodes_; // data for OnePass nodes
+ OneState* onepass_start_; // start node for OnePass program
+
+ DISALLOW_EVIL_CONSTRUCTORS(Prog);
+};
+
+} // namespace re2
+
+#endif // RE2_PROG_H__