aboutsummaryrefslogtreecommitdiff
path: root/re2/regexp.cc
diff options
context:
space:
mode:
Diffstat (limited to 're2/regexp.cc')
-rw-r--r--re2/regexp.cc920
1 files changed, 920 insertions, 0 deletions
diff --git a/re2/regexp.cc b/re2/regexp.cc
new file mode 100644
index 0000000..9486b3c
--- /dev/null
+++ b/re2/regexp.cc
@@ -0,0 +1,920 @@
+// Copyright 2006 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Regular expression representation.
+// Tested by parse_test.cc
+
+#include "util/util.h"
+#include "re2/regexp.h"
+#include "re2/stringpiece.h"
+#include "re2/walker-inl.h"
+
+namespace re2 {
+
+// Constructor. Allocates vectors as appropriate for operator.
+Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
+ : op_(op),
+ simple_(false),
+ parse_flags_(static_cast<uint16>(parse_flags)),
+ ref_(1),
+ nsub_(0),
+ down_(NULL) {
+ subone_ = NULL;
+ memset(the_union_, 0, sizeof the_union_);
+}
+
+// Destructor. Assumes already cleaned up children.
+// Private: use Decref() instead of delete to destroy Regexps.
+// Can't call Decref on the sub-Regexps here because
+// that could cause arbitrarily deep recursion, so
+// required Decref() to have handled them for us.
+Regexp::~Regexp() {
+ if (nsub_ > 0)
+ LOG(DFATAL) << "Regexp not destroyed.";
+
+ switch (op_) {
+ default:
+ break;
+ case kRegexpCapture:
+ delete name_;
+ break;
+ case kRegexpLiteralString:
+ delete[] runes_;
+ break;
+ case kRegexpCharClass:
+ cc_->Delete();
+ delete ccb_;
+ break;
+ }
+}
+
+// If it's possible to destroy this regexp without recurring,
+// do so and return true. Else return false.
+bool Regexp::QuickDestroy() {
+ if (nsub_ == 0) {
+ delete this;
+ return true;
+ }
+ return false;
+}
+
+static map<Regexp*, int> ref_map;
+static Mutex ref_mutex;
+
+int Regexp::Ref() {
+ if (ref_ < kMaxRef)
+ return ref_;
+
+ MutexLock l(&ref_mutex);
+ return ref_map[this];
+}
+
+// Increments reference count, returns object as convenience.
+Regexp* Regexp::Incref() {
+ if (ref_ >= kMaxRef-1) {
+ // Store ref count in overflow map.
+ MutexLock l(&ref_mutex);
+ if (ref_ == kMaxRef) { // already overflowed
+ ref_map[this]++;
+ return this;
+ }
+ // overflowing now
+ ref_map[this] = kMaxRef;
+ ref_ = kMaxRef;
+ return this;
+ }
+
+ ref_++;
+ return this;
+}
+
+// Decrements reference count and deletes this object if count reaches 0.
+void Regexp::Decref() {
+ if (ref_ == kMaxRef) {
+ // Ref count is stored in overflow map.
+ MutexLock l(&ref_mutex);
+ int r = ref_map[this] - 1;
+ if (r < kMaxRef) {
+ ref_ = r;
+ ref_map.erase(this);
+ } else {
+ ref_map[this] = r;
+ }
+ return;
+ }
+ ref_--;
+ if (ref_ == 0)
+ Destroy();
+}
+
+// Deletes this object; ref count has count reached 0.
+void Regexp::Destroy() {
+ if (QuickDestroy())
+ return;
+
+ // Handle recursive Destroy with explicit stack
+ // to avoid arbitrarily deep recursion on process stack [sigh].
+ down_ = NULL;
+ Regexp* stack = this;
+ while (stack != NULL) {
+ Regexp* re = stack;
+ stack = re->down_;
+ if (re->ref_ != 0)
+ LOG(DFATAL) << "Bad reference count " << re->ref_;
+ if (re->nsub_ > 0) {
+ Regexp** subs = re->sub();
+ for (int i = 0; i < re->nsub_; i++) {
+ Regexp* sub = subs[i];
+ if (sub == NULL)
+ continue;
+ if (sub->ref_ == kMaxRef)
+ sub->Decref();
+ else
+ --sub->ref_;
+ if (sub->ref_ == 0 && !sub->QuickDestroy()) {
+ sub->down_ = stack;
+ stack = sub;
+ }
+ }
+ if (re->nsub_ > 1)
+ delete[] subs;
+ re->nsub_ = 0;
+ }
+ delete re;
+ }
+}
+
+void Regexp::AddRuneToString(Rune r) {
+ DCHECK(op_ == kRegexpLiteralString);
+ if (nrunes_ == 0) {
+ // start with 8
+ runes_ = new Rune[8];
+ } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {
+ // double on powers of two
+ Rune *old = runes_;
+ runes_ = new Rune[nrunes_ * 2];
+ for (int i = 0; i < nrunes_; i++)
+ runes_[i] = old[i];
+ delete[] old;
+ }
+
+ runes_[nrunes_++] = r;
+}
+
+Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
+ Regexp* re = new Regexp(kRegexpHaveMatch, flags);
+ re->match_id_ = match_id;
+ return re;
+}
+
+Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
+ if (sub->op() == kRegexpPlus && sub->parse_flags() == flags)
+ return sub;
+ Regexp* re = new Regexp(kRegexpPlus, flags);
+ re->AllocSub(1);
+ re->sub()[0] = sub;
+ return re;
+}
+
+Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
+ if (sub->op() == kRegexpStar && sub->parse_flags() == flags)
+ return sub;
+ Regexp* re = new Regexp(kRegexpStar, flags);
+ re->AllocSub(1);
+ re->sub()[0] = sub;
+ return re;
+}
+
+Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
+ if (sub->op() == kRegexpQuest && sub->parse_flags() == flags)
+ return sub;
+ Regexp* re = new Regexp(kRegexpQuest, flags);
+ re->AllocSub(1);
+ re->sub()[0] = sub;
+ return re;
+}
+
+Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
+ ParseFlags flags, bool can_factor) {
+ if (nsub == 1)
+ return sub[0];
+
+ Regexp** subcopy = NULL;
+ if (op == kRegexpAlternate && can_factor) {
+ // Going to edit sub; make a copy so we don't step on caller.
+ subcopy = new Regexp*[nsub];
+ memmove(subcopy, sub, nsub * sizeof sub[0]);
+ sub = subcopy;
+ nsub = FactorAlternation(sub, nsub, flags);
+ if (nsub == 1) {
+ Regexp* re = sub[0];
+ delete[] subcopy;
+ return re;
+ }
+ }
+
+ if (nsub > kMaxNsub) {
+ // Too many subexpressions to fit in a single Regexp.
+ // Make a two-level tree. Two levels gets us to 65535^2.
+ int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub;
+ Regexp* re = new Regexp(op, flags);
+ re->AllocSub(nbigsub);
+ Regexp** subs = re->sub();
+ for (int i = 0; i < nbigsub - 1; i++)
+ subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);
+ subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
+ nsub - (nbigsub-1)*kMaxNsub, flags,
+ false);
+ delete[] subcopy;
+ return re;
+ }
+
+ Regexp* re = new Regexp(op, flags);
+ re->AllocSub(nsub);
+ Regexp** subs = re->sub();
+ for (int i = 0; i < nsub; i++)
+ subs[i] = sub[i];
+
+ delete[] subcopy;
+ return re;
+}
+
+Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
+ return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
+}
+
+Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
+ return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
+}
+
+Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
+ return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
+}
+
+Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
+ Regexp* re = new Regexp(kRegexpCapture, flags);
+ re->AllocSub(1);
+ re->sub()[0] = sub;
+ re->cap_ = cap;
+ return re;
+}
+
+Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
+ Regexp* re = new Regexp(kRegexpRepeat, flags);
+ re->AllocSub(1);
+ re->sub()[0] = sub;
+ re->min_ = min;
+ re->max_ = max;
+ return re;
+}
+
+Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
+ Regexp* re = new Regexp(kRegexpLiteral, flags);
+ re->rune_ = rune;
+ return re;
+}
+
+Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
+ if (nrunes <= 0)
+ return new Regexp(kRegexpEmptyMatch, flags);
+ if (nrunes == 1)
+ return NewLiteral(runes[0], flags);
+ Regexp* re = new Regexp(kRegexpLiteralString, flags);
+ for (int i = 0; i < nrunes; i++)
+ re->AddRuneToString(runes[i]);
+ return re;
+}
+
+Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
+ Regexp* re = new Regexp(kRegexpCharClass, flags);
+ re->cc_ = cc;
+ return re;
+}
+
+// Swaps this and that in place.
+void Regexp::Swap(Regexp* that) {
+ // Can use memmove because Regexp is just a struct (no vtable).
+ char tmp[sizeof *this];
+ memmove(tmp, this, sizeof tmp);
+ memmove(this, that, sizeof tmp);
+ memmove(that, tmp, sizeof tmp);
+}
+
+// Tests equality of all top-level structure but not subregexps.
+static bool TopEqual(Regexp* a, Regexp* b) {
+ if (a->op() != b->op())
+ return false;
+
+ switch (a->op()) {
+ case kRegexpNoMatch:
+ case kRegexpEmptyMatch:
+ case kRegexpAnyChar:
+ case kRegexpAnyByte:
+ case kRegexpBeginLine:
+ case kRegexpEndLine:
+ case kRegexpWordBoundary:
+ case kRegexpNoWordBoundary:
+ case kRegexpBeginText:
+ return true;
+
+ case kRegexpEndText:
+ // The parse flags remember whether it's \z or (?-m:$),
+ // which matters when testing against PCRE.
+ return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
+
+ case kRegexpLiteral:
+ return a->rune() == b->rune() &&
+ ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
+
+ case kRegexpLiteralString:
+ return a->nrunes() == b->nrunes() &&
+ ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
+ memcmp(a->runes(), b->runes(),
+ a->nrunes() * sizeof a->runes()[0]) == 0;
+
+ case kRegexpAlternate:
+ case kRegexpConcat:
+ return a->nsub() == b->nsub();
+
+ case kRegexpStar:
+ case kRegexpPlus:
+ case kRegexpQuest:
+ return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
+
+ case kRegexpRepeat:
+ return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
+ a->min() == b->min() &&
+ a->max() == b->max();
+
+ case kRegexpCapture:
+ return a->cap() == b->cap() && a->name() == b->name();
+
+ case kRegexpHaveMatch:
+ return a->match_id() == b->match_id();
+
+ case kRegexpCharClass: {
+ CharClass* acc = a->cc();
+ CharClass* bcc = b->cc();
+ return acc->size() == bcc->size() &&
+ acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
+ memcmp(acc->begin(), bcc->begin(),
+ (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
+ }
+ }
+
+ LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
+ return 0;
+}
+
+bool Regexp::Equal(Regexp* a, Regexp* b) {
+ if (a == NULL || b == NULL)
+ return a == b;
+
+ if (!TopEqual(a, b))
+ return false;
+
+ // Fast path:
+ // return without allocating vector if there are no subregexps.
+ switch (a->op()) {
+ case kRegexpAlternate:
+ case kRegexpConcat:
+ case kRegexpStar:
+ case kRegexpPlus:
+ case kRegexpQuest:
+ case kRegexpRepeat:
+ case kRegexpCapture:
+ break;
+
+ default:
+ return true;
+ }
+
+ // Committed to doing real work.
+ // The stack (vector) has pairs of regexps waiting to
+ // be compared. The regexps are only equal if
+ // all the pairs end up being equal.
+ vector<Regexp*> stk;
+
+ for (;;) {
+ // Invariant: TopEqual(a, b) == true.
+ Regexp* a2;
+ Regexp* b2;
+ switch (a->op()) {
+ default:
+ break;
+ case kRegexpAlternate:
+ case kRegexpConcat:
+ for (int i = 0; i < a->nsub(); i++) {
+ a2 = a->sub()[i];
+ b2 = b->sub()[i];
+ if (!TopEqual(a2, b2))
+ return false;
+ stk.push_back(a2);
+ stk.push_back(b2);
+ }
+ break;
+
+ case kRegexpStar:
+ case kRegexpPlus:
+ case kRegexpQuest:
+ case kRegexpRepeat:
+ case kRegexpCapture:
+ a2 = a->sub()[0];
+ b2 = b->sub()[0];
+ if (!TopEqual(a2, b2))
+ return false;
+ // Really:
+ // stk.push_back(a2);
+ // stk.push_back(b2);
+ // break;
+ // but faster to assign directly and loop.
+ a = a2;
+ b = b2;
+ continue;
+ }
+
+ int n = stk.size();
+ if (n == 0)
+ break;
+
+ a = stk[n-2];
+ b = stk[n-1];
+ stk.resize(n-2);
+ }
+
+ return true;
+}
+
+// Keep in sync with enum RegexpStatusCode in regexp.h
+static const string kErrorStrings[] = {
+ "no error",
+ "unexpected error",
+ "invalid escape sequence",
+ "invalid character class",
+ "invalid character class range",
+ "missing ]",
+ "missing )",
+ "trailing \\",
+ "no argument for repetition operator",
+ "invalid repetition size",
+ "bad repetition operator",
+ "invalid perl operator",
+ "invalid UTF-8",
+ "invalid named capture group",
+};
+
+const string& RegexpStatus::CodeText(enum RegexpStatusCode code) {
+ if (code < 0 || code >= arraysize(kErrorStrings))
+ code = kRegexpInternalError;
+ return kErrorStrings[code];
+}
+
+string RegexpStatus::Text() const {
+ if (error_arg_.empty())
+ return CodeText(code_);
+ string s;
+ s.append(CodeText(code_));
+ s.append(": ");
+ s.append(error_arg_.data(), error_arg_.size());
+ return s;
+}
+
+void RegexpStatus::Copy(const RegexpStatus& status) {
+ code_ = status.code_;
+ error_arg_ = status.error_arg_;
+}
+
+typedef int Ignored; // Walker<void> doesn't exist
+
+// Walker subclass to count capturing parens in regexp.
+class NumCapturesWalker : public Regexp::Walker<Ignored> {
+ public:
+ NumCapturesWalker() : ncapture_(0) {}
+ int ncapture() { return ncapture_; }
+
+ virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
+ if (re->op() == kRegexpCapture)
+ ncapture_++;
+ return ignored;
+ }
+ virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
+ // Should never be called: we use Walk not WalkExponential.
+ LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
+ return ignored;
+ }
+
+ private:
+ int ncapture_;
+ DISALLOW_EVIL_CONSTRUCTORS(NumCapturesWalker);
+};
+
+int Regexp::NumCaptures() {
+ NumCapturesWalker w;
+ w.Walk(this, 0);
+ return w.ncapture();
+}
+
+// Walker class to build map of named capture groups and their indices.
+class NamedCapturesWalker : public Regexp::Walker<Ignored> {
+ public:
+ NamedCapturesWalker() : map_(NULL) {}
+ ~NamedCapturesWalker() { delete map_; }
+
+ map<string, int>* TakeMap() {
+ map<string, int>* m = map_;
+ map_ = NULL;
+ return m;
+ }
+
+ Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
+ if (re->op() == kRegexpCapture && re->name() != NULL) {
+ // Allocate map once we find a name.
+ if (map_ == NULL)
+ map_ = new map<string, int>;
+
+ // Record first occurrence of each name.
+ // (The rule is that if you have the same name
+ // multiple times, only the leftmost one counts.)
+ if (map_->find(*re->name()) == map_->end())
+ (*map_)[*re->name()] = re->cap();
+ }
+ return ignored;
+ }
+
+ virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
+ // Should never be called: we use Walk not WalkExponential.
+ LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
+ return ignored;
+ }
+
+ private:
+ map<string, int>* map_;
+ DISALLOW_EVIL_CONSTRUCTORS(NamedCapturesWalker);
+};
+
+map<string, int>* Regexp::NamedCaptures() {
+ NamedCapturesWalker w;
+ w.Walk(this, 0);
+ return w.TakeMap();
+}
+
+// Walker class to build map from capture group indices to their names.
+class CaptureNamesWalker : public Regexp::Walker<Ignored> {
+ public:
+ CaptureNamesWalker() : map_(NULL) {}
+ ~CaptureNamesWalker() { delete map_; }
+
+ map<int, string>* TakeMap() {
+ map<int, string>* m = map_;
+ map_ = NULL;
+ return m;
+ }
+
+ Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
+ if (re->op() == kRegexpCapture && re->name() != NULL) {
+ // Allocate map once we find a name.
+ if (map_ == NULL)
+ map_ = new map<int, string>;
+
+ (*map_)[re->cap()] = *re->name();
+ }
+ return ignored;
+ }
+
+ virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
+ // Should never be called: we use Walk not WalkExponential.
+ LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
+ return ignored;
+ }
+
+ private:
+ map<int, string>* map_;
+ DISALLOW_EVIL_CONSTRUCTORS(CaptureNamesWalker);
+};
+
+map<int, string>* Regexp::CaptureNames() {
+ CaptureNamesWalker w;
+ w.Walk(this, 0);
+ return w.TakeMap();
+}
+
+// Determines whether regexp matches must be anchored
+// with a fixed string prefix. If so, returns the prefix and
+// the regexp that remains after the prefix. The prefix might
+// be ASCII case-insensitive.
+bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
+ // No need for a walker: the regexp must be of the form
+ // 1. some number of ^ anchors
+ // 2. a literal char or string
+ // 3. the rest
+ prefix->clear();
+ *foldcase = false;
+ *suffix = NULL;
+ if (op_ != kRegexpConcat)
+ return false;
+
+ // Some number of anchors, then a literal or concatenation.
+ int i = 0;
+ Regexp** sub = this->sub();
+ while (i < nsub_ && sub[i]->op_ == kRegexpBeginText)
+ i++;
+ if (i == 0 || i >= nsub_)
+ return false;
+
+ Regexp* re = sub[i];
+ switch (re->op_) {
+ default:
+ return false;
+
+ case kRegexpLiteralString:
+ // Convert to string in proper encoding.
+ if (re->parse_flags() & Latin1) {
+ prefix->resize(re->nrunes_);
+ for (int j = 0; j < re->nrunes_; j++)
+ (*prefix)[j] = re->runes_[j];
+ } else {
+ // Convert to UTF-8 in place.
+ // Assume worst-case space and then trim.
+ prefix->resize(re->nrunes_ * UTFmax);
+ char *p = &(*prefix)[0];
+ for (int j = 0; j < re->nrunes_; j++) {
+ Rune r = re->runes_[j];
+ if (r < Runeself)
+ *p++ = r;
+ else
+ p += runetochar(p, &r);
+ }
+ prefix->resize(p - &(*prefix)[0]);
+ }
+ break;
+
+ case kRegexpLiteral:
+ if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) {
+ prefix->append(1, re->rune_);
+ } else {
+ char buf[UTFmax];
+ prefix->append(buf, runetochar(buf, &re->rune_));
+ }
+ break;
+ }
+ *foldcase = (sub[i]->parse_flags() & FoldCase);
+ i++;
+
+ // The rest.
+ if (i < nsub_) {
+ for (int j = i; j < nsub_; j++)
+ sub[j]->Incref();
+ re = Concat(sub + i, nsub_ - i, parse_flags());
+ } else {
+ re = new Regexp(kRegexpEmptyMatch, parse_flags());
+ }
+ *suffix = re;
+ return true;
+}
+
+// Character class builder is a balanced binary tree (STL set)
+// containing non-overlapping, non-abutting RuneRanges.
+// The less-than operator used in the tree treats two
+// ranges as equal if they overlap at all, so that
+// lookups for a particular Rune are possible.
+
+CharClassBuilder::CharClassBuilder() {
+ nrunes_ = 0;
+ upper_ = 0;
+ lower_ = 0;
+}
+
+// Add lo-hi to the class; return whether class got bigger.
+bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
+ if (hi < lo)
+ return false;
+
+ if (lo <= 'z' && hi >= 'A') {
+ // Overlaps some alpha, maybe not all.
+ // Update bitmaps telling which ASCII letters are in the set.
+ Rune lo1 = max<Rune>(lo, 'A');
+ Rune hi1 = min<Rune>(hi, 'Z');
+ if (lo1 <= hi1)
+ upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
+
+ lo1 = max<Rune>(lo, 'a');
+ hi1 = min<Rune>(hi, 'z');
+ if (lo1 <= hi1)
+ lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
+ }
+
+ { // Check whether lo, hi is already in the class.
+ iterator it = ranges_.find(RuneRange(lo, lo));
+ if (it != end() && it->lo <= lo && hi <= it->hi)
+ return false;
+ }
+
+ // Look for a range abutting lo on the left.
+ // If it exists, take it out and increase our range.
+ if (lo > 0) {
+ iterator it = ranges_.find(RuneRange(lo-1, lo-1));
+ if (it != end()) {
+ lo = it->lo;
+ if (it->hi > hi)
+ hi = it->hi;
+ nrunes_ -= it->hi - it->lo + 1;
+ ranges_.erase(it);
+ }
+ }
+
+ // Look for a range abutting hi on the right.
+ // If it exists, take it out and increase our range.
+ if (hi < Runemax) {
+ iterator it = ranges_.find(RuneRange(hi+1, hi+1));
+ if (it != end()) {
+ hi = it->hi;
+ nrunes_ -= it->hi - it->lo + 1;
+ ranges_.erase(it);
+ }
+ }
+
+ // Look for ranges between lo and hi. Take them out.
+ // This is only safe because the set has no overlapping ranges.
+ // We've already removed any ranges abutting lo and hi, so
+ // any that overlap [lo, hi] must be contained within it.
+ for (;;) {
+ iterator it = ranges_.find(RuneRange(lo, hi));
+ if (it == end())
+ break;
+ nrunes_ -= it->hi - it->lo + 1;
+ ranges_.erase(it);
+ }
+
+ // Finally, add [lo, hi].
+ nrunes_ += hi - lo + 1;
+ ranges_.insert(RuneRange(lo, hi));
+ return true;
+}
+
+void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
+ for (iterator it = cc->begin(); it != cc->end(); ++it)
+ AddRange(it->lo, it->hi);
+}
+
+bool CharClassBuilder::Contains(Rune r) {
+ return ranges_.find(RuneRange(r, r)) != end();
+}
+
+// Does the character class behave the same on A-Z as on a-z?
+bool CharClassBuilder::FoldsASCII() {
+ return ((upper_ ^ lower_) & AlphaMask) == 0;
+}
+
+CharClassBuilder* CharClassBuilder::Copy() {
+ CharClassBuilder* cc = new CharClassBuilder;
+ for (iterator it = begin(); it != end(); ++it)
+ cc->ranges_.insert(RuneRange(it->lo, it->hi));
+ cc->upper_ = upper_;
+ cc->lower_ = lower_;
+ cc->nrunes_ = nrunes_;
+ return cc;
+}
+
+
+
+void CharClassBuilder::RemoveAbove(Rune r) {
+ if (r >= Runemax)
+ return;
+
+ if (r < 'z') {
+ if (r < 'a')
+ lower_ = 0;
+ else
+ lower_ &= AlphaMask >> ('z' - r);
+ }
+
+ if (r < 'Z') {
+ if (r < 'A')
+ upper_ = 0;
+ else
+ upper_ &= AlphaMask >> ('Z' - r);
+ }
+
+ for (;;) {
+
+ iterator it = ranges_.find(RuneRange(r + 1, Runemax));
+ if (it == end())
+ break;
+ RuneRange rr = *it;
+ ranges_.erase(it);
+ nrunes_ -= rr.hi - rr.lo + 1;
+ if (rr.lo <= r) {
+ rr.hi = r;
+ ranges_.insert(rr);
+ nrunes_ += rr.hi - rr.lo + 1;
+ }
+ }
+}
+
+void CharClassBuilder::Negate() {
+ // Build up negation and then copy in.
+ // Could edit ranges in place, but C++ won't let me.
+ vector<RuneRange> v;
+ v.reserve(ranges_.size() + 1);
+
+ // In negation, first range begins at 0, unless
+ // the current class begins at 0.
+ iterator it = begin();
+ if (it == end()) {
+ v.push_back(RuneRange(0, Runemax));
+ } else {
+ int nextlo = 0;
+ if (it->lo == 0) {
+ nextlo = it->hi + 1;
+ ++it;
+ }
+ for (; it != end(); ++it) {
+ v.push_back(RuneRange(nextlo, it->lo - 1));
+ nextlo = it->hi + 1;
+ }
+ if (nextlo <= Runemax)
+ v.push_back(RuneRange(nextlo, Runemax));
+ }
+
+ ranges_.clear();
+ for (int i = 0; i < v.size(); i++)
+ ranges_.insert(v[i]);
+
+ upper_ = AlphaMask & ~upper_;
+ lower_ = AlphaMask & ~lower_;
+ nrunes_ = Runemax+1 - nrunes_;
+}
+
+// Character class is a sorted list of ranges.
+// The ranges are allocated in the same block as the header,
+// necessitating a special allocator and Delete method.
+
+CharClass* CharClass::New(int maxranges) {
+ CharClass* cc;
+ uint8* data = new uint8[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
+ cc = reinterpret_cast<CharClass*>(data);
+ cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
+ cc->nranges_ = 0;
+ cc->folds_ascii_ = false;
+ cc->nrunes_ = 0;
+ return cc;
+}
+
+void CharClass::Delete() {
+ if (this == NULL)
+ return;
+ uint8 *data = reinterpret_cast<uint8*>(this);
+ delete[] data;
+}
+
+CharClass* CharClass::Negate() {
+ CharClass* cc = CharClass::New(nranges_+1);
+ cc->folds_ascii_ = folds_ascii_;
+ cc->nrunes_ = Runemax + 1 - nrunes_;
+ int n = 0;
+ int nextlo = 0;
+ for (CharClass::iterator it = begin(); it != end(); ++it) {
+ if (it->lo == nextlo) {
+ nextlo = it->hi + 1;
+ } else {
+ cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1);
+ nextlo = it->hi + 1;
+ }
+ }
+ if (nextlo <= Runemax)
+ cc->ranges_[n++] = RuneRange(nextlo, Runemax);
+ cc->nranges_ = n;
+ return cc;
+}
+
+bool CharClass::Contains(Rune r) {
+ RuneRange* rr = ranges_;
+ int n = nranges_;
+ while (n > 0) {
+ int m = n/2;
+ if (rr[m].hi < r) {
+ rr += m+1;
+ n -= m+1;
+ } else if (r < rr[m].lo) {
+ n = m;
+ } else { // rr[m].lo <= r && r <= rr[m].hi
+ return true;
+ }
+ }
+ return false;
+}
+
+CharClass* CharClassBuilder::GetCharClass() {
+ CharClass* cc = CharClass::New(ranges_.size());
+ int n = 0;
+ for (iterator it = begin(); it != end(); ++it)
+ cc->ranges_[n++] = *it;
+ cc->nranges_ = n;
+ DCHECK_LE(n, ranges_.size());
+ cc->nrunes_ = nrunes_;
+ cc->folds_ascii_ = FoldsASCII();
+ return cc;
+}
+
+} // namespace re2