diff options
Diffstat (limited to 're2/walker-inl.h')
-rw-r--r-- | re2/walker-inl.h | 244 |
1 files changed, 244 insertions, 0 deletions
diff --git a/re2/walker-inl.h b/re2/walker-inl.h new file mode 100644 index 0000000..4d2045f --- /dev/null +++ b/re2/walker-inl.h @@ -0,0 +1,244 @@ +// Copyright 2006 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Helper class for traversing Regexps without recursion. +// Clients should declare their own subclasses that override +// the PreVisit and PostVisit methods, which are called before +// and after visiting the subexpressions. + +// Not quite the Visitor pattern, because (among other things) +// the Visitor pattern is recursive. + +#ifndef RE2_WALKER_INL_H__ +#define RE2_WALKER_INL_H__ + +#include "re2/regexp.h" + +namespace re2 { + +template<typename T> struct WalkState; + +template<typename T> class Regexp::Walker { + public: + Walker(); + virtual ~Walker(); + + // Virtual method called before visiting re's children. + // PreVisit passes ownership of its return value to its caller. + // The Arg* that PreVisit returns will be passed to PostVisit as pre_arg + // and passed to the child PreVisits and PostVisits as parent_arg. + // At the top-most Regexp, parent_arg is arg passed to walk. + // If PreVisit sets *stop to true, the walk does not recurse + // into the children. Instead it behaves as though the return + // value from PreVisit is the return value from PostVisit. + // The default PreVisit returns parent_arg. + virtual T PreVisit(Regexp* re, T parent_arg, bool* stop); + + // Virtual method called after visiting re's children. + // The pre_arg is the T that PreVisit returned. + // The child_args is a vector of the T that the child PostVisits returned. + // PostVisit takes ownership of pre_arg. + // PostVisit takes ownership of the Ts + // in *child_args, but not the vector itself. + // PostVisit passes ownership of its return value + // to its caller. + // The default PostVisit simply returns pre_arg. + virtual T PostVisit(Regexp* re, T parent_arg, T pre_arg, + T* child_args, int nchild_args); + + // Virtual method called to copy a T, + // when Walk notices that more than one child is the same re. + virtual T Copy(T arg); + + // Virtual method called to do a "quick visit" of the re, + // but not its children. Only called once the visit budget + // has been used up and we're trying to abort the walk + // as quickly as possible. Should return a value that + // makes sense for the parent PostVisits still to be run. + // This function is (hopefully) only called by + // WalkExponential, but must be implemented by all clients, + // just in case. + virtual T ShortVisit(Regexp* re, T parent_arg) = 0; + + // Walks over a regular expression. + // Top_arg is passed as parent_arg to PreVisit and PostVisit of re. + // Returns the T returned by PostVisit on re. + T Walk(Regexp* re, T top_arg); + + // Like Walk, but doesn't use Copy. This can lead to + // exponential runtimes on cross-linked Regexps like the + // ones generated by Simplify. To help limit this, + // at most max_visits nodes will be visited and then + // the walk will be cut off early. + // If the walk *is* cut off early, ShortVisit(re) + // will be called on regexps that cannot be fully + // visited rather than calling PreVisit/PostVisit. + T WalkExponential(Regexp* re, T top_arg, int max_visits); + + // Clears the stack. Should never be necessary, since + // Walk always enters and exits with an empty stack. + // Logs DFATAL if stack is not already clear. + void Reset(); + + // Returns whether walk was cut off. + bool stopped_early() { return stopped_early_; } + + private: + // Walk state for the entire traversal. + stack<WalkState<T> >* stack_; + bool stopped_early_; + int max_visits_; + + T WalkInternal(Regexp* re, T top_arg, bool use_copy); + + DISALLOW_EVIL_CONSTRUCTORS(Walker); +}; + +template<typename T> T Regexp::Walker<T>::PreVisit(Regexp* re, + T parent_arg, + bool* stop) { + return parent_arg; +} + +template<typename T> T Regexp::Walker<T>::PostVisit(Regexp* re, + T parent_arg, + T pre_arg, + T* child_args, + int nchild_args) { + return pre_arg; +} + +template<typename T> T Regexp::Walker<T>::Copy(T arg) { + return arg; +} + +// State about a single level in the traversal. +template<typename T> struct WalkState { + WalkState<T>(Regexp* re, T parent) + : re(re), + n(-1), + parent_arg(parent), + child_args(NULL) { } + + Regexp* re; // The regexp + int n; // The index of the next child to process; -1 means need to PreVisit + T parent_arg; // Accumulated arguments. + T pre_arg; + T child_arg; // One-element buffer for child_args. + T* child_args; +}; + +template<typename T> Regexp::Walker<T>::Walker() { + stack_ = new stack<WalkState<T> >; + stopped_early_ = false; +} + +template<typename T> Regexp::Walker<T>::~Walker() { + Reset(); + delete stack_; +} + +// Clears the stack. Should never be necessary, since +// Walk always enters and exits with an empty stack. +// Logs DFATAL if stack is not already clear. +template<typename T> void Regexp::Walker<T>::Reset() { + if (stack_ && stack_->size() > 0) { + LOG(DFATAL) << "Stack not empty."; + while (stack_->size() > 0) { + delete stack_->top().child_args; + stack_->pop(); + } + } +} + +template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg, + bool use_copy) { + Reset(); + + if (re == NULL) { + LOG(DFATAL) << "Walk NULL"; + return top_arg; + } + + stack_->push(WalkState<T>(re, top_arg)); + + WalkState<T>* s; + for (;;) { + T t; + s = &stack_->top(); + Regexp* re = s->re; + switch (s->n) { + case -1: { + if (--max_visits_ < 0) { + stopped_early_ = true; + t = ShortVisit(re, s->parent_arg); + break; + } + bool stop = false; + s->pre_arg = PreVisit(re, s->parent_arg, &stop); + if (stop) { + t = s->pre_arg; + break; + } + s->n = 0; + s->child_args = NULL; + if (re->nsub_ == 1) + s->child_args = &s->child_arg; + else if (re->nsub_ > 1) + s->child_args = new T[re->nsub_]; + // Fall through. + } + default: { + if (re->nsub_ > 0) { + Regexp** sub = re->sub(); + if (s->n < re->nsub_) { + if (use_copy && s->n > 0 && sub[s->n - 1] == sub[s->n]) { + s->child_args[s->n] = Copy(s->child_args[s->n - 1]); + s->n++; + } else { + stack_->push(WalkState<T>(sub[s->n], s->pre_arg)); + } + continue; + } + } + + t = PostVisit(re, s->parent_arg, s->pre_arg, s->child_args, s->n); + if (re->nsub_ > 1) + delete[] s->child_args; + break; + } + } + + // We've finished stack_->top(). + // Update next guy down. + stack_->pop(); + if (stack_->size() == 0) + return t; + s = &stack_->top(); + if (s->child_args != NULL) + s->child_args[s->n] = t; + else + s->child_arg = t; + s->n++; + } +} + +template<typename T> T Regexp::Walker<T>::Walk(Regexp* re, T top_arg) { + // Without the exponential walking behavior, + // this budget should be more than enough for any + // regexp, and yet not enough to get us in trouble + // as far as CPU time. + max_visits_ = 1000000; + return WalkInternal(re, top_arg, true); +} + +template<typename T> T Regexp::Walker<T>::WalkExponential(Regexp* re, T top_arg, + int max_visits) { + max_visits_ = max_visits; + return WalkInternal(re, top_arg, false); +} + +} // namespace re2 + +#endif // RE2_WALKER_INL_H__ |