aboutsummaryrefslogtreecommitdiff
path: root/re2/walker-inl.h
diff options
context:
space:
mode:
Diffstat (limited to 're2/walker-inl.h')
-rw-r--r--re2/walker-inl.h244
1 files changed, 244 insertions, 0 deletions
diff --git a/re2/walker-inl.h b/re2/walker-inl.h
new file mode 100644
index 0000000..4d2045f
--- /dev/null
+++ b/re2/walker-inl.h
@@ -0,0 +1,244 @@
+// Copyright 2006 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Helper class for traversing Regexps without recursion.
+// Clients should declare their own subclasses that override
+// the PreVisit and PostVisit methods, which are called before
+// and after visiting the subexpressions.
+
+// Not quite the Visitor pattern, because (among other things)
+// the Visitor pattern is recursive.
+
+#ifndef RE2_WALKER_INL_H__
+#define RE2_WALKER_INL_H__
+
+#include "re2/regexp.h"
+
+namespace re2 {
+
+template<typename T> struct WalkState;
+
+template<typename T> class Regexp::Walker {
+ public:
+ Walker();
+ virtual ~Walker();
+
+ // Virtual method called before visiting re's children.
+ // PreVisit passes ownership of its return value to its caller.
+ // The Arg* that PreVisit returns will be passed to PostVisit as pre_arg
+ // and passed to the child PreVisits and PostVisits as parent_arg.
+ // At the top-most Regexp, parent_arg is arg passed to walk.
+ // If PreVisit sets *stop to true, the walk does not recurse
+ // into the children. Instead it behaves as though the return
+ // value from PreVisit is the return value from PostVisit.
+ // The default PreVisit returns parent_arg.
+ virtual T PreVisit(Regexp* re, T parent_arg, bool* stop);
+
+ // Virtual method called after visiting re's children.
+ // The pre_arg is the T that PreVisit returned.
+ // The child_args is a vector of the T that the child PostVisits returned.
+ // PostVisit takes ownership of pre_arg.
+ // PostVisit takes ownership of the Ts
+ // in *child_args, but not the vector itself.
+ // PostVisit passes ownership of its return value
+ // to its caller.
+ // The default PostVisit simply returns pre_arg.
+ virtual T PostVisit(Regexp* re, T parent_arg, T pre_arg,
+ T* child_args, int nchild_args);
+
+ // Virtual method called to copy a T,
+ // when Walk notices that more than one child is the same re.
+ virtual T Copy(T arg);
+
+ // Virtual method called to do a "quick visit" of the re,
+ // but not its children. Only called once the visit budget
+ // has been used up and we're trying to abort the walk
+ // as quickly as possible. Should return a value that
+ // makes sense for the parent PostVisits still to be run.
+ // This function is (hopefully) only called by
+ // WalkExponential, but must be implemented by all clients,
+ // just in case.
+ virtual T ShortVisit(Regexp* re, T parent_arg) = 0;
+
+ // Walks over a regular expression.
+ // Top_arg is passed as parent_arg to PreVisit and PostVisit of re.
+ // Returns the T returned by PostVisit on re.
+ T Walk(Regexp* re, T top_arg);
+
+ // Like Walk, but doesn't use Copy. This can lead to
+ // exponential runtimes on cross-linked Regexps like the
+ // ones generated by Simplify. To help limit this,
+ // at most max_visits nodes will be visited and then
+ // the walk will be cut off early.
+ // If the walk *is* cut off early, ShortVisit(re)
+ // will be called on regexps that cannot be fully
+ // visited rather than calling PreVisit/PostVisit.
+ T WalkExponential(Regexp* re, T top_arg, int max_visits);
+
+ // Clears the stack. Should never be necessary, since
+ // Walk always enters and exits with an empty stack.
+ // Logs DFATAL if stack is not already clear.
+ void Reset();
+
+ // Returns whether walk was cut off.
+ bool stopped_early() { return stopped_early_; }
+
+ private:
+ // Walk state for the entire traversal.
+ stack<WalkState<T> >* stack_;
+ bool stopped_early_;
+ int max_visits_;
+
+ T WalkInternal(Regexp* re, T top_arg, bool use_copy);
+
+ DISALLOW_EVIL_CONSTRUCTORS(Walker);
+};
+
+template<typename T> T Regexp::Walker<T>::PreVisit(Regexp* re,
+ T parent_arg,
+ bool* stop) {
+ return parent_arg;
+}
+
+template<typename T> T Regexp::Walker<T>::PostVisit(Regexp* re,
+ T parent_arg,
+ T pre_arg,
+ T* child_args,
+ int nchild_args) {
+ return pre_arg;
+}
+
+template<typename T> T Regexp::Walker<T>::Copy(T arg) {
+ return arg;
+}
+
+// State about a single level in the traversal.
+template<typename T> struct WalkState {
+ WalkState<T>(Regexp* re, T parent)
+ : re(re),
+ n(-1),
+ parent_arg(parent),
+ child_args(NULL) { }
+
+ Regexp* re; // The regexp
+ int n; // The index of the next child to process; -1 means need to PreVisit
+ T parent_arg; // Accumulated arguments.
+ T pre_arg;
+ T child_arg; // One-element buffer for child_args.
+ T* child_args;
+};
+
+template<typename T> Regexp::Walker<T>::Walker() {
+ stack_ = new stack<WalkState<T> >;
+ stopped_early_ = false;
+}
+
+template<typename T> Regexp::Walker<T>::~Walker() {
+ Reset();
+ delete stack_;
+}
+
+// Clears the stack. Should never be necessary, since
+// Walk always enters and exits with an empty stack.
+// Logs DFATAL if stack is not already clear.
+template<typename T> void Regexp::Walker<T>::Reset() {
+ if (stack_ && stack_->size() > 0) {
+ LOG(DFATAL) << "Stack not empty.";
+ while (stack_->size() > 0) {
+ delete stack_->top().child_args;
+ stack_->pop();
+ }
+ }
+}
+
+template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg,
+ bool use_copy) {
+ Reset();
+
+ if (re == NULL) {
+ LOG(DFATAL) << "Walk NULL";
+ return top_arg;
+ }
+
+ stack_->push(WalkState<T>(re, top_arg));
+
+ WalkState<T>* s;
+ for (;;) {
+ T t;
+ s = &stack_->top();
+ Regexp* re = s->re;
+ switch (s->n) {
+ case -1: {
+ if (--max_visits_ < 0) {
+ stopped_early_ = true;
+ t = ShortVisit(re, s->parent_arg);
+ break;
+ }
+ bool stop = false;
+ s->pre_arg = PreVisit(re, s->parent_arg, &stop);
+ if (stop) {
+ t = s->pre_arg;
+ break;
+ }
+ s->n = 0;
+ s->child_args = NULL;
+ if (re->nsub_ == 1)
+ s->child_args = &s->child_arg;
+ else if (re->nsub_ > 1)
+ s->child_args = new T[re->nsub_];
+ // Fall through.
+ }
+ default: {
+ if (re->nsub_ > 0) {
+ Regexp** sub = re->sub();
+ if (s->n < re->nsub_) {
+ if (use_copy && s->n > 0 && sub[s->n - 1] == sub[s->n]) {
+ s->child_args[s->n] = Copy(s->child_args[s->n - 1]);
+ s->n++;
+ } else {
+ stack_->push(WalkState<T>(sub[s->n], s->pre_arg));
+ }
+ continue;
+ }
+ }
+
+ t = PostVisit(re, s->parent_arg, s->pre_arg, s->child_args, s->n);
+ if (re->nsub_ > 1)
+ delete[] s->child_args;
+ break;
+ }
+ }
+
+ // We've finished stack_->top().
+ // Update next guy down.
+ stack_->pop();
+ if (stack_->size() == 0)
+ return t;
+ s = &stack_->top();
+ if (s->child_args != NULL)
+ s->child_args[s->n] = t;
+ else
+ s->child_arg = t;
+ s->n++;
+ }
+}
+
+template<typename T> T Regexp::Walker<T>::Walk(Regexp* re, T top_arg) {
+ // Without the exponential walking behavior,
+ // this budget should be more than enough for any
+ // regexp, and yet not enough to get us in trouble
+ // as far as CPU time.
+ max_visits_ = 1000000;
+ return WalkInternal(re, top_arg, true);
+}
+
+template<typename T> T Regexp::Walker<T>::WalkExponential(Regexp* re, T top_arg,
+ int max_visits) {
+ max_visits_ = max_visits;
+ return WalkInternal(re, top_arg, false);
+}
+
+} // namespace re2
+
+#endif // RE2_WALKER_INL_H__