aboutsummaryrefslogtreecommitdiff
path: root/re2/tostring.cc
diff options
context:
space:
mode:
authorIan Hodson <idh@google.com>2012-05-14 12:29:36 +0100
committerIan Hodson <idh@google.com>2012-05-14 16:55:54 +0100
commit2ee91b4af4353b9e6a9d591c32fedfc58fd4ef35 (patch)
tree2df7e491809d1d56f94f9db0a23a517c6e99dd63 /re2/tostring.cc
parent6849b47ff6a92539a62165fd1306f9b6ac52c9b3 (diff)
downloadregex-re2-2ee91b4af4353b9e6a9d591c32fedfc58fd4ef35.tar.gz
Change-Id: I54e60045f0fee91b64e4d7291e5bb6c6cffb4562
Diffstat (limited to 're2/tostring.cc')
-rw-r--r--re2/tostring.cc341
1 files changed, 341 insertions, 0 deletions
diff --git a/re2/tostring.cc b/re2/tostring.cc
new file mode 100644
index 0000000..555524f
--- /dev/null
+++ b/re2/tostring.cc
@@ -0,0 +1,341 @@
+// Copyright 2006 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Format a regular expression structure as a string.
+// Tested by parse_test.cc
+
+#include "util/util.h"
+#include "re2/regexp.h"
+#include "re2/walker-inl.h"
+
+namespace re2 {
+
+enum {
+ PrecAtom,
+ PrecUnary,
+ PrecConcat,
+ PrecAlternate,
+ PrecEmpty,
+ PrecParen,
+ PrecToplevel,
+};
+
+// Helper function. See description below.
+static void AppendCCRange(string* t, Rune lo, Rune hi);
+
+// Walker to generate string in s_.
+// The arg pointers are actually integers giving the
+// context precedence.
+// The child_args are always NULL.
+class ToStringWalker : public Regexp::Walker<int> {
+ public:
+ explicit ToStringWalker(string* t) : t_(t) {}
+
+ virtual int PreVisit(Regexp* re, int parent_arg, bool* stop);
+ virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg,
+ int* child_args, int nchild_args);
+ virtual int ShortVisit(Regexp* re, int parent_arg) {
+ return 0;
+ }
+
+ private:
+ string* t_; // The string the walker appends to.
+
+ DISALLOW_EVIL_CONSTRUCTORS(ToStringWalker);
+};
+
+string Regexp::ToString() {
+ string t;
+ ToStringWalker w(&t);
+ w.WalkExponential(this, PrecToplevel, 100000);
+ if (w.stopped_early())
+ t += " [truncated]";
+ return t;
+}
+
+#define ToString DontCallToString // Avoid accidental recursion.
+
+// Visits re before children are processed.
+// Appends ( if needed and passes new precedence to children.
+int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
+ int prec = parent_arg;
+ int nprec = PrecAtom;
+
+ switch (re->op()) {
+ case kRegexpNoMatch:
+ case kRegexpEmptyMatch:
+ case kRegexpLiteral:
+ case kRegexpAnyChar:
+ case kRegexpAnyByte:
+ case kRegexpBeginLine:
+ case kRegexpEndLine:
+ case kRegexpBeginText:
+ case kRegexpEndText:
+ case kRegexpWordBoundary:
+ case kRegexpNoWordBoundary:
+ case kRegexpCharClass:
+ case kRegexpHaveMatch:
+ nprec = PrecAtom;
+ break;
+
+ case kRegexpConcat:
+ case kRegexpLiteralString:
+ if (prec < PrecConcat)
+ t_->append("(?:");
+ nprec = PrecConcat;
+ break;
+
+ case kRegexpAlternate:
+ if (prec < PrecAlternate)
+ t_->append("(?:");
+ nprec = PrecAlternate;
+ break;
+
+ case kRegexpCapture:
+ t_->append("(");
+ if (re->name()) {
+ t_->append("?P<");
+ t_->append(*re->name());
+ t_->append(">");
+ }
+ nprec = PrecParen;
+ break;
+
+ case kRegexpStar:
+ case kRegexpPlus:
+ case kRegexpQuest:
+ case kRegexpRepeat:
+ if (prec < PrecUnary)
+ t_->append("(?:");
+ // The subprecedence here is PrecAtom instead of PrecUnary
+ // because PCRE treats two unary ops in a row as a parse error.
+ nprec = PrecAtom;
+ break;
+ }
+
+ return nprec;
+}
+
+static void AppendLiteral(string *t, Rune r, bool foldcase) {
+ if (r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r)) {
+ t->append(1, '\\');
+ t->append(1, r);
+ } else if (foldcase && 'a' <= r && r <= 'z') {
+ if ('a' <= r && r <= 'z')
+ r += 'A' - 'a';
+ t->append(1, '[');
+ t->append(1, r);
+ t->append(1, r + 'a' - 'A');
+ t->append(1, ']');
+ } else {
+ AppendCCRange(t, r, r);
+ }
+}
+
+// Visits re after children are processed.
+// For childless regexps, all the work is done here.
+// For regexps with children, append any unary suffixes or ).
+int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
+ int* child_args, int nchild_args) {
+ int prec = parent_arg;
+ switch (re->op()) {
+ case kRegexpNoMatch:
+ // There's no simple symbol for "no match", but
+ // [^0-Runemax] excludes everything.
+ t_->append("[^\\x00-\\x{10ffff}]");
+ break;
+
+ case kRegexpEmptyMatch:
+ // Append (?:) to make empty string visible,
+ // unless this is already being parenthesized.
+ if (prec < PrecEmpty)
+ t_->append("(?:)");
+ break;
+
+ case kRegexpLiteral:
+ AppendLiteral(t_, re->rune(), re->parse_flags() & Regexp::FoldCase);
+ break;
+
+ case kRegexpLiteralString:
+ for (int i = 0; i < re->nrunes(); i++)
+ AppendLiteral(t_, re->runes()[i], re->parse_flags() & Regexp::FoldCase);
+ if (prec < PrecConcat)
+ t_->append(")");
+ break;
+
+ case kRegexpConcat:
+ if (prec < PrecConcat)
+ t_->append(")");
+ break;
+
+ case kRegexpAlternate:
+ // Clumsy but workable: the children all appended |
+ // at the end of their strings, so just remove the last one.
+ if ((*t_)[t_->size()-1] == '|')
+ t_->erase(t_->size()-1);
+ else
+ LOG(DFATAL) << "Bad final char: " << t_;
+ if (prec < PrecAlternate)
+ t_->append(")");
+ break;
+
+ case kRegexpStar:
+ t_->append("*");
+ if (re->parse_flags() & Regexp::NonGreedy)
+ t_->append("?");
+ if (prec < PrecUnary)
+ t_->append(")");
+ break;
+
+ case kRegexpPlus:
+ t_->append("+");
+ if (re->parse_flags() & Regexp::NonGreedy)
+ t_->append("?");
+ if (prec < PrecUnary)
+ t_->append(")");
+ break;
+
+ case kRegexpQuest:
+ t_->append("?");
+ if (re->parse_flags() & Regexp::NonGreedy)
+ t_->append("?");
+ if (prec < PrecUnary)
+ t_->append(")");
+ break;
+
+ case kRegexpRepeat:
+ if (re->max() == -1)
+ t_->append(StringPrintf("{%d,}", re->min()));
+ else if (re->min() == re->max())
+ t_->append(StringPrintf("{%d}", re->min()));
+ else
+ t_->append(StringPrintf("{%d,%d}", re->min(), re->max()));
+ if (re->parse_flags() & Regexp::NonGreedy)
+ t_->append("?");
+ if (prec < PrecUnary)
+ t_->append(")");
+ break;
+
+ case kRegexpAnyChar:
+ t_->append(".");
+ break;
+
+ case kRegexpAnyByte:
+ t_->append("\\C");
+ break;
+
+ case kRegexpBeginLine:
+ t_->append("^");
+ break;
+
+ case kRegexpEndLine:
+ t_->append("$");
+ break;
+
+ case kRegexpBeginText:
+ t_->append("(?-m:^)");
+ break;
+
+ case kRegexpEndText:
+ if (re->parse_flags() & Regexp::WasDollar)
+ t_->append("(?-m:$)");
+ else
+ t_->append("\\z");
+ break;
+
+ case kRegexpWordBoundary:
+ t_->append("\\b");
+ break;
+
+ case kRegexpNoWordBoundary:
+ t_->append("\\B");
+ break;
+
+ case kRegexpCharClass: {
+ if (re->cc()->size() == 0) {
+ t_->append("[^\\x00-\\x{10ffff}]");
+ break;
+ }
+ t_->append("[");
+ // Heuristic: show class as negated if it contains the
+ // non-character 0xFFFE.
+ CharClass* cc = re->cc();
+ if (cc->Contains(0xFFFE)) {
+ cc = cc->Negate();
+ t_->append("^");
+ }
+ for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i)
+ AppendCCRange(t_, i->lo, i->hi);
+ if (cc != re->cc())
+ cc->Delete();
+ t_->append("]");
+ break;
+ }
+
+ case kRegexpCapture:
+ t_->append(")");
+ break;
+
+ case kRegexpHaveMatch:
+ // There's no syntax accepted by the parser to generate
+ // this node (it is generated by RE2::Set) so make something
+ // up that is readable but won't compile.
+ t_->append("(?HaveMatch:%d)", re->match_id());
+ break;
+ }
+
+ // If the parent is an alternation, append the | for it.
+ if (prec == PrecAlternate)
+ t_->append("|");
+
+ return 0;
+}
+
+// Appends a rune for use in a character class to the string t.
+static void AppendCCChar(string* t, Rune r) {
+ if (0x20 <= r && r <= 0x7E) {
+ if (strchr("[]^-\\", r))
+ t->append("\\");
+ t->append(1, r);
+ return;
+ }
+ switch (r) {
+ default:
+ break;
+
+ case '\r':
+ t->append("\\r");
+ return;
+
+ case '\t':
+ t->append("\\t");
+ return;
+
+ case '\n':
+ t->append("\\n");
+ return;
+
+ case '\f':
+ t->append("\\f");
+ return;
+ }
+
+ if (r < 0x100) {
+ StringAppendF(t, "\\x%02x", static_cast<int>(r));
+ return;
+ }
+ StringAppendF(t, "\\x{%x}", static_cast<int>(r));
+}
+
+static void AppendCCRange(string* t, Rune lo, Rune hi) {
+ if (lo > hi)
+ return;
+ AppendCCChar(t, lo);
+ if (lo < hi) {
+ t->append("-");
+ AppendCCChar(t, hi);
+ }
+}
+
+} // namespace re2