// Copyright 2009 The RE2 Authors. All Rights Reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. #include "util/util.h" #include "util/flags.h" #include "re2/prefilter.h" #include "re2/prefilter_tree.h" #include "re2/re2.h" DEFINE_int32(filtered_re2_min_atom_len, 3, "Strings less than this length are not stored as atoms"); namespace re2 { PrefilterTree::PrefilterTree() : compiled_(false) { } PrefilterTree::~PrefilterTree() { for (int i = 0; i < prefilter_vec_.size(); i++) delete prefilter_vec_[i]; for (int i = 0; i < entries_.size(); i++) delete entries_[i].parents; } // Functions used for adding and Compiling prefilters to the // PrefilterTree. static bool KeepPart(Prefilter* prefilter, int level) { if (prefilter == NULL) return false; switch (prefilter->op()) { default: LOG(DFATAL) << "Unexpected op in KeepPart: " << prefilter->op(); return false; case Prefilter::ALL: return false; case Prefilter::ATOM: return prefilter->atom().size() >= FLAGS_filtered_re2_min_atom_len; case Prefilter::AND: { int j = 0; vector* subs = prefilter->subs(); for (int i = 0; i < subs->size(); i++) if (KeepPart((*subs)[i], level + 1)) (*subs)[j++] = (*subs)[i]; else delete (*subs)[i]; subs->resize(j); return j > 0; } case Prefilter::OR: for (int i = 0; i < prefilter->subs()->size(); i++) if (!KeepPart((*prefilter->subs())[i], level + 1)) return false; return true; } } void PrefilterTree::Add(Prefilter *f) { if (compiled_) { LOG(DFATAL) << "Add after Compile."; return; } if (f != NULL && !KeepPart(f, 0)) { delete f; f = NULL; } prefilter_vec_.push_back(f); } void PrefilterTree::Compile(vector* atom_vec) { if (compiled_) { LOG(DFATAL) << "Compile after Compile."; return; } // We do this check to support some legacy uses of // PrefilterTree that call Compile before adding any regexps, // and expect Compile not to have effect. if (prefilter_vec_.empty()) return; compiled_ = true; AssignUniqueIds(atom_vec); // Identify nodes that are too common among prefilters and are // triggering too many parents. Then get rid of them if possible. // Note that getting rid of a prefilter node simply means they are // no longer necessary for their parent to trigger; that is, we do // not miss out on any regexps triggering by getting rid of a // prefilter node. for (int i = 0; i < entries_.size(); i++) { IntMap* parents = entries_[i].parents; if (parents->size() > 8) { // This one triggers too many things. If all the parents are AND // nodes and have other things guarding them, then get rid of // this trigger. TODO(vsri): Adjust the threshold appropriately, // make it a function of total number of nodes? bool have_other_guard = true; for (IntMap::iterator it = parents->begin(); it != parents->end(); ++it) have_other_guard = have_other_guard && (entries_[it->index()].propagate_up_at_count > 1); if (have_other_guard) { for (IntMap::iterator it = parents->begin(); it != parents->end(); ++it) entries_[it->index()].propagate_up_at_count -= 1; parents->clear(); // Forget the parents } } } PrintDebugInfo(); } Prefilter* PrefilterTree::CanonicalNode(Prefilter* node) { string node_string = NodeString(node); map::iterator iter = node_map_.find(node_string); if (iter == node_map_.end()) return NULL; return (*iter).second; } static string Itoa(int n) { char buf[100]; snprintf(buf, sizeof buf, "%d", n); return string(buf); } string PrefilterTree::NodeString(Prefilter* node) const { // Adding the operation disambiguates AND/OR/atom nodes. string s = Itoa(node->op()) + ":"; if (node->op() == Prefilter::ATOM) { s += node->atom(); } else { for (int i = 0; i < node->subs()->size() ; i++) { if (i > 0) s += ','; s += Itoa((*node->subs())[i]->unique_id()); } } return s; } void PrefilterTree::AssignUniqueIds(vector* atom_vec) { atom_vec->clear(); // Build vector of all filter nodes, sorted topologically // from top to bottom in v. vector v; // Add the top level nodes of each regexp prefilter. for (int i = 0; i < prefilter_vec_.size(); i++) { Prefilter* f = prefilter_vec_[i]; if (f == NULL) unfiltered_.push_back(i); // We push NULL also on to v, so that we maintain the // mapping of index==regexpid for level=0 prefilter nodes. v.push_back(f); } // Now add all the descendant nodes. for (int i = 0; i < v.size(); i++) { Prefilter* f = v[i]; if (f == NULL) continue; if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) { const vector& subs = *f->subs(); for (int j = 0; j < subs.size(); j++) v.push_back(subs[j]); } } // Identify unique nodes. int unique_id = 0; for (int i = v.size() - 1; i >= 0; i--) { Prefilter *node = v[i]; if (node == NULL) continue; node->set_unique_id(-1); Prefilter* canonical = CanonicalNode(node); if (canonical == NULL) { // Any further nodes that have the same node string // will find this node as the canonical node. node_map_[NodeString(node)] = node; if (node->op() == Prefilter::ATOM) { atom_vec->push_back(node->atom()); atom_index_to_id_.push_back(unique_id); } node->set_unique_id(unique_id++); } else { node->set_unique_id(canonical->unique_id()); } } entries_.resize(node_map_.size()); // Create parent IntMap for the entries. for (int i = v.size() - 1; i >= 0; i--) { Prefilter* prefilter = v[i]; if (prefilter == NULL) continue; if (CanonicalNode(prefilter) != prefilter) continue; Entry* entry = &entries_[prefilter->unique_id()]; entry->parents = new IntMap(node_map_.size()); } // Fill the entries. for (int i = v.size() - 1; i >= 0; i--) { Prefilter* prefilter = v[i]; if (prefilter == NULL) continue; if (CanonicalNode(prefilter) != prefilter) continue; Entry* entry = &entries_[prefilter->unique_id()]; switch (prefilter->op()) { default: case Prefilter::ALL: LOG(DFATAL) << "Unexpected op: " << prefilter->op(); return; case Prefilter::ATOM: entry->propagate_up_at_count = 1; break; case Prefilter::OR: case Prefilter::AND: { IntMap uniq_child(node_map_.size()); for (int j = 0; j < prefilter->subs()->size() ; j++) { Prefilter* child = (*prefilter->subs())[j]; Prefilter* canonical = CanonicalNode(child); if (canonical == NULL) { LOG(DFATAL) << "Null canonical node"; return; } int child_id = canonical->unique_id(); if (!uniq_child.has_index(child_id)) uniq_child.set_new(child_id, 1); // To the child, we want to add to parent indices. Entry* child_entry = &entries_[child_id]; if (!child_entry->parents->has_index(prefilter->unique_id())) child_entry->parents->set_new(prefilter->unique_id(), 1); } entry->propagate_up_at_count = prefilter->op() == Prefilter::AND ? uniq_child.size() : 1; break; } } } // For top level nodes, populate regexp id. for (int i = 0; i < prefilter_vec_.size(); i++) { if (prefilter_vec_[i] == NULL) continue; int id = CanonicalNode(prefilter_vec_[i])->unique_id(); DCHECK_LE(0, id); Entry* entry = &entries_[id]; entry->regexps.push_back(i); } } // Functions for triggering during search. void PrefilterTree::RegexpsGivenStrings( const vector& matched_atoms, vector* regexps) const { regexps->clear(); if (!compiled_) { LOG(WARNING) << "Compile() not called"; for (int i = 0; i < prefilter_vec_.size(); ++i) regexps->push_back(i); } else { if (!prefilter_vec_.empty()) { IntMap regexps_map(prefilter_vec_.size()); vector matched_atom_ids; for (int j = 0; j < matched_atoms.size(); j++) { matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]); VLOG(10) << "Atom id:" << atom_index_to_id_[matched_atoms[j]]; } PropagateMatch(matched_atom_ids, ®exps_map); for (IntMap::iterator it = regexps_map.begin(); it != regexps_map.end(); ++it) regexps->push_back(it->index()); regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end()); } } sort(regexps->begin(), regexps->end()); } void PrefilterTree::PropagateMatch(const vector& atom_ids, IntMap* regexps) const { IntMap count(entries_.size()); IntMap work(entries_.size()); for (int i = 0; i < atom_ids.size(); i++) work.set(atom_ids[i], 1); for (IntMap::iterator it = work.begin(); it != work.end(); ++it) { const Entry& entry = entries_[it->index()]; VLOG(10) << "Processing: " << it->index(); // Record regexps triggered. for (int i = 0; i < entry.regexps.size(); i++) { VLOG(10) << "Regexp triggered: " << entry.regexps[i]; regexps->set(entry.regexps[i], 1); } int c; // Pass trigger up to parents. for (IntMap::iterator it = entry.parents->begin(); it != entry.parents->end(); ++it) { int j = it->index(); const Entry& parent = entries_[j]; VLOG(10) << " parent= " << j << " trig= " << parent.propagate_up_at_count; // Delay until all the children have succeeded. if (parent.propagate_up_at_count > 1) { if (count.has_index(j)) { c = count.get_existing(j) + 1; count.set_existing(j, c); } else { c = 1; count.set_new(j, c); } if (c < parent.propagate_up_at_count) continue; } VLOG(10) << "Triggering: " << j; // Trigger the parent. work.set(j, 1); } } } // Debugging help. void PrefilterTree::PrintPrefilter(int regexpid) { LOG(INFO) << DebugNodeString(prefilter_vec_[regexpid]); } void PrefilterTree::PrintDebugInfo() { VLOG(10) << "#Unique Atoms: " << atom_index_to_id_.size(); VLOG(10) << "#Unique Nodes: " << entries_.size(); for (int i = 0; i < entries_.size(); ++i) { IntMap* parents = entries_[i].parents; const vector& regexps = entries_[i].regexps; VLOG(10) << "EntryId: " << i << " N: " << parents->size() << " R: " << regexps.size(); for (IntMap::iterator it = parents->begin(); it != parents->end(); ++it) VLOG(10) << it->index(); } VLOG(10) << "Map:"; for (map::const_iterator iter = node_map_.begin(); iter != node_map_.end(); ++iter) VLOG(10) << "NodeId: " << (*iter).second->unique_id() << " Str: " << (*iter).first; } string PrefilterTree::DebugNodeString(Prefilter* node) const { string node_string = ""; if (node->op() == Prefilter::ATOM) { DCHECK(!node->atom().empty()); node_string += node->atom(); } else { // Adding the operation disambiguates AND and OR nodes. node_string += node->op() == Prefilter::AND ? "AND" : "OR"; node_string += "("; for (int i = 0; i < node->subs()->size() ; i++) { if (i > 0) node_string += ','; node_string += Itoa((*node->subs())[i]->unique_id()); node_string += ":"; node_string += DebugNodeString((*node->subs())[i]); } node_string += ")"; } return node_string; } } // namespace re2