aboutsummaryrefslogtreecommitdiff
path: root/re2/prefilter_tree.h
blob: 596b734ff6a5f05f84780ce693089351bcc7e92b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
// Copyright 2009 The RE2 Authors.  All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// The PrefilterTree class is used to form an AND-OR tree of strings
// that would trigger each regexp. The 'prefilter' of each regexp is
// added tp PrefilterTree, and then PrefilterTree is used to find all
// the unique strings across the prefilters. During search, by using
// matches from a string matching engine, PrefilterTree deduces the
// set of regexps that are to be triggered. The 'string matching
// engine' itself is outside of this class, and the caller can use any
// favorite engine. PrefilterTree provides a set of strings (called
// atoms) that the user of this class should use to do the string
// matching.
//
#ifndef RE2_PREFILTER_TREE_H_
#define RE2_PREFILTER_TREE_H_

#include "util/util.h"
#include "util/sparse_array.h"

namespace re2 {

typedef SparseArray<int> IntMap;

class Prefilter;

class PrefilterTree {
 public:
  PrefilterTree();
  ~PrefilterTree();

  // Adds the prefilter for the next regexp. Note that we assume that
  // Add called sequentially for all regexps. All Add calls
  // must precede Compile.
  void Add(Prefilter* prefilter);

  // The Compile returns a vector of string in atom_vec.
  // Call this after all the prefilters are added through Add.
  // No calls to Add after Compile are allowed.
  // The caller should use the returned set of strings to do string matching.
  // Each time a string matches, the corresponding index then has to be
  // and passed to RegexpsGivenStrings below.
  void Compile(vector<string>* atom_vec);

  // Given the indices of the atoms that matched, returns the indexes
  // of regexps that should be searched.  The matched_atoms should
  // contain all the ids of string atoms that were found to match the
  // content. The caller can use any string match engine to perform
  // this function. This function is thread safe.
  void RegexpsGivenStrings(const vector<int>& matched_atoms,
                           vector<int>* regexps) const;

  // Print debug prefilter. Also prints unique ids associated with
  // nodes of the prefilter of the regexp.
  void PrintPrefilter(int regexpid);


  // Each unique node has a corresponding Entry that helps in
  // passing the matching trigger information along the tree.
  struct Entry {
   public:
    // How many children should match before this node triggers the
    // parent. For an atom and an OR node, this is 1 and for an AND
    // node, it is the number of unique children.
    int propagate_up_at_count;

    // When this node is ready to trigger the parent, what are the indices
    // of the parent nodes to trigger. The reason there may be more than
    // one is because of sharing. For example (abc | def) and (xyz | def)
    // are two different nodes, but they share the atom 'def'. So when
    // 'def' matches, it triggers two parents, corresponding to the two
    // different OR nodes.
    IntMap* parents;

    // When this node is ready to trigger the parent, what are the
    // regexps that are triggered.
    vector<int> regexps;
  };

 private:
  // This function assigns unique ids to various parts of the
  // prefilter, by looking at if these nodes are already in the
  // PrefilterTree.
  void AssignUniqueIds(vector<string>* atom_vec);

  // Given the matching atoms, find the regexps to be triggered.
  void PropagateMatch(const vector<int>& atom_ids,
                      IntMap* regexps) const;

  // Returns the prefilter node that has the same NodeString as this
  // node. For the canonical node, returns node.
  Prefilter* CanonicalNode(Prefilter* node);

  // A string that uniquely identifies the node. Assumes that the
  // children of node has already been assigned unique ids.
  string NodeString(Prefilter* node) const;

  // Recursively constructs a readable prefilter string.
  string DebugNodeString(Prefilter* node) const;

  // Used for debugging.
  void PrintDebugInfo();

  // These are all the nodes formed by Compile. Essentially, there is
  // one node for each unique atom and each unique AND/OR node.
  vector<Entry> entries_;

  // Map node string to canonical Prefilter node.
  map<string, Prefilter*> node_map_;

  // indices of regexps that always pass through the filter (since we
  // found no required literals in these regexps).
  vector<int> unfiltered_;

  // vector of Prefilter for all regexps.
  vector<Prefilter*> prefilter_vec_;

  // Atom index in returned strings to entry id mapping.
  vector<int> atom_index_to_id_;

  // Has the prefilter tree been compiled.
  bool compiled_;

  DISALLOW_EVIL_CONSTRUCTORS(PrefilterTree);
};

}  // namespace

#endif  // RE2_PREFILTER_TREE_H_