aboutsummaryrefslogtreecommitdiff
path: root/re2/prog.h
blob: 2cf65bc767285f076c7680f31156be47fb7e85cc (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
// Copyright 2007 The RE2 Authors.  All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Compiled representation of regular expressions.
// See regexp.h for the Regexp class, which represents a regular
// expression symbolically.

#ifndef RE2_PROG_H__
#define RE2_PROG_H__

#include "util/util.h"
#include "re2/re2.h"

namespace re2 {

// Simple fixed-size bitmap.
template<int Bits>
class Bitmap {
 public:
  Bitmap() { Reset(); }
  int Size() { return Bits; }

  void Reset() {
    for (int i = 0; i < Words; i++)
      w_[i] = 0;
  }
  bool Get(int k) const {
    return w_[k >> WordLog] & (1<<(k & 31));
  }
  void Set(int k) {
    w_[k >> WordLog] |= 1<<(k & 31);
  }
  void Clear(int k) {
    w_[k >> WordLog] &= ~(1<<(k & 31));
  }
  uint32 Word(int i) const {
    return w_[i];
  }

 private:
  static const int WordLog = 5;
  static const int Words = (Bits+31)/32;
  uint32 w_[Words];
  DISALLOW_EVIL_CONSTRUCTORS(Bitmap);
};


// Opcodes for Inst
enum InstOp {
  kInstAlt = 0,      // choose between out_ and out1_
  kInstAltMatch,     // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa.
  kInstByteRange,    // next (possible case-folded) byte must be in [lo_, hi_]
  kInstCapture,      // capturing parenthesis number cap_
  kInstEmptyWidth,   // empty-width special (^ $ ...); bit(s) set in empty_
  kInstMatch,        // found a match!
  kInstNop,          // no-op; occasionally unavoidable
  kInstFail,         // never match; occasionally unavoidable
};

// Bit flags for empty-width specials
enum EmptyOp {
  kEmptyBeginLine        = 1<<0,      // ^ - beginning of line
  kEmptyEndLine          = 1<<1,      // $ - end of line
  kEmptyBeginText        = 1<<2,      // \A - beginning of text
  kEmptyEndText          = 1<<3,      // \z - end of text
  kEmptyWordBoundary     = 1<<4,      // \b - word boundary
  kEmptyNonWordBoundary  = 1<<5,      // \B - not \b
  kEmptyAllFlags         = (1<<6)-1,
};

class Regexp;

class DFA;
struct OneState;

// Compiled form of regexp program.
class Prog {
 public:
  Prog();
  ~Prog();

  // Single instruction in regexp program.
  class Inst {
   public:
    Inst() : out_opcode_(0), out1_(0) { }

    // Constructors per opcode
    void InitAlt(uint32 out, uint32 out1);
    void InitByteRange(int lo, int hi, int foldcase, uint32 out);
    void InitCapture(int cap, uint32 out);
    void InitEmptyWidth(EmptyOp empty, uint32 out);
    void InitMatch(int id);
    void InitNop(uint32 out);
    void InitFail();

    // Getters
    int id(Prog* p) { return this - p->inst_; }
    InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); }
    int out()     { return out_opcode_>>3; }
    int out1()    { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; }
    int cap()       { DCHECK_EQ(opcode(), kInstCapture); return cap_; }
    int lo()        { DCHECK_EQ(opcode(), kInstByteRange); return lo_; }
    int hi()        { DCHECK_EQ(opcode(), kInstByteRange); return hi_; }
    int foldcase()  { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; }
    int match_id()  { DCHECK_EQ(opcode(), kInstMatch); return match_id_; }
    EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }
    bool greedy(Prog *p) {
      DCHECK_EQ(opcode(), kInstAltMatch);
      return p->inst(out())->opcode() == kInstByteRange;
    }

    // Does this inst (an kInstByteRange) match c?
    inline bool Matches(int c) {
      DCHECK_EQ(opcode(), kInstByteRange);
      if (foldcase_ && 'A' <= c && c <= 'Z')
        c += 'a' - 'A';
      return lo_ <= c && c <= hi_;
    }

    // Returns string representation for debugging.
    string Dump();

    // Maximum instruction id.
    // (Must fit in out_opcode_, and PatchList steals another bit.)
    static const int kMaxInst = (1<<28) - 1;

   private:
    void set_opcode(InstOp opcode) {
      out_opcode_ = (out()<<3) | opcode;
    }

    void set_out(int out) {
      out_opcode_ = (out<<3) | opcode();
    }

    void set_out_opcode(int out, InstOp opcode) {
      out_opcode_ = (out<<3) | opcode;
    }

    uint32 out_opcode_;  // 29 bits of out, 3 (low) bits opcode
    union {              // additional instruction arguments:
      uint32 out1_;      // opcode == kInstAlt
                         //   alternate next instruction

      int32 cap_;        // opcode == kInstCapture
                         //   Index of capture register (holds text
                         //   position recorded by capturing parentheses).
                         //   For \n (the submatch for the nth parentheses),
                         //   the left parenthesis captures into register 2*n
                         //   and the right one captures into register 2*n+1.

      int32 match_id_;   // opcode == kInstMatch
                         //   Match ID to identify this match (for re2::Set).

      struct {           // opcode == kInstByteRange
        uint8 lo_;       //   byte range is lo_-hi_ inclusive
        uint8 hi_;       //
        uint8 foldcase_; //   convert A-Z to a-z before checking range.
      };

      EmptyOp empty_;    // opcode == kInstEmptyWidth
                         //   empty_ is bitwise OR of kEmpty* flags above.
    };

    friend class Compiler;
    friend struct PatchList;
    friend class Prog;

    DISALLOW_EVIL_CONSTRUCTORS(Inst);
  };

  // Whether to anchor the search.
  enum Anchor {
    kUnanchored,  // match anywhere
    kAnchored,    // match only starting at beginning of text
  };

  // Kind of match to look for (for anchor != kFullMatch)
  //
  // kLongestMatch mode finds the overall longest
  // match but still makes its submatch choices the way
  // Perl would, not in the way prescribed by POSIX.
  // The POSIX rules are much more expensive to implement,
  // and no one has needed them.
  //
  // kFullMatch is not strictly necessary -- we could use
  // kLongestMatch and then check the length of the match -- but
  // the matching code can run faster if it knows to consider only
  // full matches.
  enum MatchKind {
    kFirstMatch,     // like Perl, PCRE
    kLongestMatch,   // like egrep or POSIX
    kFullMatch,      // match only entire text; implies anchor==kAnchored
    kManyMatch       // for SearchDFA, records set of matches
  };

  Inst *inst(int id) { return &inst_[id]; }
  int start() { return start_; }
  int start_unanchored() { return start_unanchored_; }
  void set_start(int start) { start_ = start; }
  void set_start_unanchored(int start) { start_unanchored_ = start; }
  int64 size() { return size_; }
  bool reversed() { return reversed_; }
  void set_reversed(bool reversed) { reversed_ = reversed; }
  int64 byte_inst_count() { return byte_inst_count_; }
  const Bitmap<256>& byterange() { return byterange_; }
  void set_dfa_mem(int64 dfa_mem) { dfa_mem_ = dfa_mem; }
  int64 dfa_mem() { return dfa_mem_; }
  int flags() { return flags_; }
  void set_flags(int flags) { flags_ = flags; }
  bool anchor_start() { return anchor_start_; }
  void set_anchor_start(bool b) { anchor_start_ = b; }
  bool anchor_end() { return anchor_end_; }
  void set_anchor_end(bool b) { anchor_end_ = b; }
  int bytemap_range() { return bytemap_range_; }
  const uint8* bytemap() { return bytemap_; }

  // Returns string representation of program for debugging.
  string Dump();
  string DumpUnanchored();

  // Record that at some point in the prog, the bytes in the range
  // lo-hi (inclusive) are treated as different from bytes outside the range.
  // Tracking this lets the DFA collapse commonly-treated byte ranges
  // when recording state pointers, greatly reducing its memory footprint.
  void MarkByteRange(int lo, int hi);

  // Returns the set of kEmpty flags that are in effect at
  // position p within context.
  static uint32 EmptyFlags(const StringPiece& context, const char* p);

  // Returns whether byte c is a word character: ASCII only.
  // Used by the implementation of \b and \B.
  // This is not right for Unicode, but:
  //   - it's hard to get right in a byte-at-a-time matching world
  //     (the DFA has only one-byte lookahead).
  //   - even if the lookahead were possible, the Progs would be huge.
  // This crude approximation is the same one PCRE uses.
  static bool IsWordChar(uint8 c) {
    return ('A' <= c && c <= 'Z') ||
           ('a' <= c && c <= 'z') ||
           ('0' <= c && c <= '9') ||
           c == '_';
  }

  // Execution engines.  They all search for the regexp (run the prog)
  // in text, which is in the larger context (used for ^ $ \b etc).
  // Anchor and kind control the kind of search.
  // Returns true if match found, false if not.
  // If match found, fills match[0..nmatch-1] with submatch info.
  // match[0] is overall match, match[1] is first set of parens, etc.
  // If a particular submatch is not matched during the regexp match,
  // it is set to NULL.
  //
  // Matching text == StringPiece(NULL, 0) is treated as any other empty
  // string, but note that on return, it will not be possible to distinguish
  // submatches that matched that empty string from submatches that didn't
  // match anything.  Either way, match[i] == NULL.

  // Search using NFA: can find submatches but kind of slow.
  bool SearchNFA(const StringPiece& text, const StringPiece& context,
                 Anchor anchor, MatchKind kind,
                 StringPiece* match, int nmatch);

  // Search using DFA: much faster than NFA but only finds
  // end of match and can use a lot more memory.
  // Returns whether a match was found.
  // If the DFA runs out of memory, sets *failed to true and returns false.
  // If matches != NULL and kind == kManyMatch and there is a match,
  // SearchDFA fills matches with the match IDs of the final matching state.
  bool SearchDFA(const StringPiece& text, const StringPiece& context,
                 Anchor anchor, MatchKind kind,
                 StringPiece* match0, bool* failed,
                 vector<int>* matches);

  // Build the entire DFA for the given match kind.  FOR TESTING ONLY.
  // Usually the DFA is built out incrementally, as needed, which
  // avoids lots of unnecessary work.  This function is useful only
  // for testing purposes.  Returns number of states.
  int BuildEntireDFA(MatchKind kind);

  // Compute byte map.
  void ComputeByteMap();

  // Run peep-hole optimizer on program.
  void Optimize();

  // One-pass NFA: only correct if IsOnePass() is true,
  // but much faster than NFA (competitive with PCRE)
  // for those expressions.
  bool IsOnePass();
  bool SearchOnePass(const StringPiece& text, const StringPiece& context,
                     Anchor anchor, MatchKind kind,
                     StringPiece* match, int nmatch);

  // Bit-state backtracking.  Fast on small cases but uses memory
  // proportional to the product of the program size and the text size.
  bool SearchBitState(const StringPiece& text, const StringPiece& context,
                      Anchor anchor, MatchKind kind,
                      StringPiece* match, int nmatch);

  static const int kMaxOnePassCapture = 5;  // $0 through $4

  // Backtracking search: the gold standard against which the other
  // implementations are checked.  FOR TESTING ONLY.
  // It allocates a ton of memory to avoid running forever.
  // It is also recursive, so can't use in production (will overflow stacks).
  // The name "Unsafe" here is supposed to be a flag that
  // you should not be using this function.
  bool UnsafeSearchBacktrack(const StringPiece& text,
                             const StringPiece& context,
                             Anchor anchor, MatchKind kind,
                             StringPiece* match, int nmatch);

  // Computes range for any strings matching regexp. The min and max can in
  // some cases be arbitrarily precise, so the caller gets to specify the
  // maximum desired length of string returned.
  //
  // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
  // string s that is an anchored match for this regexp satisfies
  //   min <= s && s <= max.
  //
  // Note that PossibleMatchRange() will only consider the first copy of an
  // infinitely repeated element (i.e., any regexp element followed by a '*' or
  // '+' operator). Regexps with "{N}" constructions are not affected, as those
  // do not compile down to infinite repetitions.
  //
  // Returns true on success, false on error.
  bool PossibleMatchRange(string* min, string* max, int maxlen);

  // Compiles a collection of regexps to Prog.  Each regexp will have
  // its own Match instruction recording the index in the vector.
  static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
                          Regexp* re);

 private:
  friend class Compiler;

  DFA* GetDFA(MatchKind kind);

  bool anchor_start_;       // regexp has explicit start anchor
  bool anchor_end_;         // regexp has explicit end anchor
  bool reversed_;           // whether program runs backward over input
  bool did_onepass_;        // has IsOnePass been called?

  int start_;               // entry point for program
  int start_unanchored_;    // unanchored entry point for program
  int size_;                // number of instructions
  int byte_inst_count_;     // number of kInstByteRange instructions
  int bytemap_range_;       // bytemap_[x] < bytemap_range_
  int flags_;               // regexp parse flags
  int onepass_statesize_;   // byte size of each OneState* node

  Inst* inst_;              // pointer to instruction array

  Mutex dfa_mutex_;    // Protects dfa_first_, dfa_longest_
  DFA* volatile dfa_first_;     // DFA cached for kFirstMatch
  DFA* volatile dfa_longest_;   // DFA cached for kLongestMatch and kFullMatch
  int64 dfa_mem_;      // Maximum memory for DFAs.
  void (*delete_dfa_)(DFA* dfa);

  Bitmap<256> byterange_;    // byterange.Get(x) true if x ends a
                             // commonly-treated byte range.
  uint8 bytemap_[256];       // map from input bytes to byte classes
  uint8 *unbytemap_;         // bytemap_[unbytemap_[x]] == x

  uint8* onepass_nodes_;     // data for OnePass nodes
  OneState* onepass_start_;  // start node for OnePass program

  DISALLOW_EVIL_CONSTRUCTORS(Prog);
};

}  // namespace re2

#endif  // RE2_PROG_H__