diff options
author | David Neto <dneto@google.com> | 2019-02-20 17:18:28 -0500 |
---|---|---|
committer | David Neto <dneto@google.com> | 2019-02-21 19:52:57 -0500 |
commit | 83d14e6912c72138f77617d225a2f4a8786a9d82 (patch) | |
tree | 9ff24faf1e0271a77fbb0019edf793cfc0518a93 /re2/regexp.cc | |
parent | 03e00e139ba1ef1ac728bb9b78181cf74a6b93be (diff) | |
parent | 79ef3b2d31f06493f687ef9e947d9632bad54b9b (diff) | |
download | regex-re2-83d14e6912c72138f77617d225a2f4a8786a9d82.tar.gz |
Merge remote-tracking branch 'aosp/upstream-master' into up-shaderc2
Refresh with content from github.com/google/re2
commit 79ef3b2d31f06493f687ef9e947d9632bad54b9b
date 2019-02-13
Change-Id: I4139f29d632e41722992309e96aca4f29c799c87
Diffstat (limited to 're2/regexp.cc')
-rw-r--r-- | re2/regexp.cc | 185 |
1 files changed, 112 insertions, 73 deletions
diff --git a/re2/regexp.cc b/re2/regexp.cc index a74ceec..7cfbbcb 100644 --- a/re2/regexp.cc +++ b/re2/regexp.cc @@ -5,8 +5,21 @@ // Regular expression representation. // Tested by parse_test.cc -#include "util/util.h" #include "re2/regexp.h" + +#include <stddef.h> +#include <stdint.h> +#include <string.h> +#include <algorithm> +#include <map> +#include <mutex> +#include <string> +#include <vector> + +#include "util/util.h" +#include "util/logging.h" +#include "util/mutex.h" +#include "util/utf.h" #include "re2/stringpiece.h" #include "re2/walker-inl.h" @@ -14,9 +27,9 @@ namespace re2 { // Constructor. Allocates vectors as appropriate for operator. Regexp::Regexp(RegexpOp op, ParseFlags parse_flags) - : op_(op), + : op_(static_cast<uint8_t>(op)), simple_(false), - parse_flags_(static_cast<uint16>(parse_flags)), + parse_flags_(static_cast<uint16_t>(parse_flags)), ref_(1), nsub_(0), down_(NULL) { @@ -43,7 +56,8 @@ Regexp::~Regexp() { delete[] runes_; break; case kRegexpCharClass: - cc_->Delete(); + if (cc_) + cc_->Delete(); delete ccb_; break; } @@ -59,30 +73,29 @@ bool Regexp::QuickDestroy() { return false; } -static map<Regexp*, int> *ref_map; -GLOBAL_MUTEX(ref_mutex); +// Lazily allocated. +static Mutex* ref_mutex; +static std::map<Regexp*, int>* ref_map; int Regexp::Ref() { if (ref_ < kMaxRef) return ref_; - GLOBAL_MUTEX_LOCK(ref_mutex); - int r = 0; - if (ref_map != NULL) { - r = (*ref_map)[this]; - } - GLOBAL_MUTEX_UNLOCK(ref_mutex); - return r; + MutexLock l(ref_mutex); + return (*ref_map)[this]; } // Increments reference count, returns object as convenience. Regexp* Regexp::Incref() { if (ref_ >= kMaxRef-1) { + static std::once_flag ref_once; + std::call_once(ref_once, []() { + ref_mutex = new Mutex; + ref_map = new std::map<Regexp*, int>; + }); + // Store ref count in overflow map. - GLOBAL_MUTEX_LOCK(ref_mutex); - if (ref_map == NULL) { - ref_map = new map<Regexp*, int>; - } + MutexLock l(ref_mutex); if (ref_ == kMaxRef) { // already overflowed (*ref_map)[this]++; @@ -91,7 +104,6 @@ Regexp* Regexp::Incref() { (*ref_map)[this] = kMaxRef; ref_ = kMaxRef; } - GLOBAL_MUTEX_UNLOCK(ref_mutex); return this; } @@ -103,15 +115,14 @@ Regexp* Regexp::Incref() { void Regexp::Decref() { if (ref_ == kMaxRef) { // Ref count is stored in overflow map. - GLOBAL_MUTEX_LOCK(ref_mutex); + MutexLock l(ref_mutex); int r = (*ref_map)[this] - 1; if (r < kMaxRef) { - ref_ = r; + ref_ = static_cast<uint16_t>(r); ref_map->erase(this); } else { (*ref_map)[this] = r; } - GLOBAL_MUTEX_UNLOCK(ref_mutex); return; } ref_--; @@ -179,31 +190,45 @@ Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) { return re; } -Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) { - if (sub->op() == kRegexpPlus && sub->parse_flags() == flags) +Regexp* Regexp::StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags) { + // Squash **, ++ and ??. + if (op == sub->op() && flags == sub->parse_flags()) return sub; - Regexp* re = new Regexp(kRegexpPlus, flags); + + // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because + // op is Star/Plus/Quest, we just have to check that sub->op() is too. + if ((sub->op() == kRegexpStar || + sub->op() == kRegexpPlus || + sub->op() == kRegexpQuest) && + flags == sub->parse_flags()) { + // If sub is Star, no need to rewrite it. + if (sub->op() == kRegexpStar) + return sub; + + // Rewrite sub to Star. + Regexp* re = new Regexp(kRegexpStar, flags); + re->AllocSub(1); + re->sub()[0] = sub->sub()[0]->Incref(); + sub->Decref(); // We didn't consume the reference after all. + return re; + } + + Regexp* re = new Regexp(op, flags); re->AllocSub(1); re->sub()[0] = sub; return re; } +Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) { + return StarPlusOrQuest(kRegexpPlus, sub, flags); +} + Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) { - if (sub->op() == kRegexpStar && sub->parse_flags() == flags) - return sub; - Regexp* re = new Regexp(kRegexpStar, flags); - re->AllocSub(1); - re->sub()[0] = sub; - return re; + return StarPlusOrQuest(kRegexpStar, sub, flags); } Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) { - if (sub->op() == kRegexpQuest && sub->parse_flags() == flags) - return sub; - Regexp* re = new Regexp(kRegexpQuest, flags); - re->AllocSub(1); - re->sub()[0] = sub; - return re; + return StarPlusOrQuest(kRegexpQuest, sub, flags); } Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub, @@ -211,6 +236,13 @@ Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub, if (nsub == 1) return sub[0]; + if (nsub == 0) { + if (op == kRegexpAlternate) + return new Regexp(kRegexpNoMatch, flags); + else + return new Regexp(kRegexpEmptyMatch, flags); + } + Regexp** subcopy = NULL; if (op == kRegexpAlternate && can_factor) { // Going to edit sub; make a copy so we don't step on caller. @@ -303,13 +335,15 @@ Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) { return re; } -// Swaps this and that in place. void Regexp::Swap(Regexp* that) { - // Can use memmove because Regexp is just a struct (no vtable). + // Regexp is not trivially copyable, so we cannot freely copy it with + // memmove(3), but swapping objects like so is safe for our purposes. char tmp[sizeof *this]; - memmove(tmp, this, sizeof tmp); - memmove(this, that, sizeof tmp); - memmove(that, tmp, sizeof tmp); + void* vthis = reinterpret_cast<void*>(this); + void* vthat = reinterpret_cast<void*>(that); + memmove(tmp, vthis, sizeof *this); + memmove(vthis, vthat, sizeof *this); + memmove(vthat, tmp, sizeof *this); } // Tests equality of all top-level structure but not subregexps. @@ -405,7 +439,7 @@ bool Regexp::Equal(Regexp* a, Regexp* b) { // The stack (vector) has pairs of regexps waiting to // be compared. The regexps are only equal if // all the pairs end up being equal. - vector<Regexp*> stk; + std::vector<Regexp*> stk; for (;;) { // Invariant: TopEqual(a, b) == true. @@ -445,10 +479,11 @@ bool Regexp::Equal(Regexp* a, Regexp* b) { continue; } - int n = stk.size(); + size_t n = stk.size(); if (n == 0) break; + DCHECK_GE(n, 2); a = stk[n-2]; b = stk[n-1]; stk.resize(n-2); @@ -517,7 +552,9 @@ class NumCapturesWalker : public Regexp::Walker<Ignored> { private: int ncapture_; - DISALLOW_EVIL_CONSTRUCTORS(NumCapturesWalker); + + NumCapturesWalker(const NumCapturesWalker&) = delete; + NumCapturesWalker& operator=(const NumCapturesWalker&) = delete; }; int Regexp::NumCaptures() { @@ -532,8 +569,8 @@ class NamedCapturesWalker : public Regexp::Walker<Ignored> { NamedCapturesWalker() : map_(NULL) {} ~NamedCapturesWalker() { delete map_; } - map<string, int>* TakeMap() { - map<string, int>* m = map_; + std::map<string, int>* TakeMap() { + std::map<string, int>* m = map_; map_ = NULL; return m; } @@ -542,7 +579,7 @@ class NamedCapturesWalker : public Regexp::Walker<Ignored> { if (re->op() == kRegexpCapture && re->name() != NULL) { // Allocate map once we find a name. if (map_ == NULL) - map_ = new map<string, int>; + map_ = new std::map<string, int>; // Record first occurrence of each name. // (The rule is that if you have the same name @@ -560,11 +597,13 @@ class NamedCapturesWalker : public Regexp::Walker<Ignored> { } private: - map<string, int>* map_; - DISALLOW_EVIL_CONSTRUCTORS(NamedCapturesWalker); + std::map<string, int>* map_; + + NamedCapturesWalker(const NamedCapturesWalker&) = delete; + NamedCapturesWalker& operator=(const NamedCapturesWalker&) = delete; }; -map<string, int>* Regexp::NamedCaptures() { +std::map<string, int>* Regexp::NamedCaptures() { NamedCapturesWalker w; w.Walk(this, 0); return w.TakeMap(); @@ -576,8 +615,8 @@ class CaptureNamesWalker : public Regexp::Walker<Ignored> { CaptureNamesWalker() : map_(NULL) {} ~CaptureNamesWalker() { delete map_; } - map<int, string>* TakeMap() { - map<int, string>* m = map_; + std::map<int, string>* TakeMap() { + std::map<int, string>* m = map_; map_ = NULL; return m; } @@ -586,7 +625,7 @@ class CaptureNamesWalker : public Regexp::Walker<Ignored> { if (re->op() == kRegexpCapture && re->name() != NULL) { // Allocate map once we find a name. if (map_ == NULL) - map_ = new map<int, string>; + map_ = new std::map<int, string>; (*map_)[re->cap()] = *re->name(); } @@ -600,11 +639,13 @@ class CaptureNamesWalker : public Regexp::Walker<Ignored> { } private: - map<int, string>* map_; - DISALLOW_EVIL_CONSTRUCTORS(CaptureNamesWalker); + std::map<int, string>* map_; + + CaptureNamesWalker(const CaptureNamesWalker&) = delete; + CaptureNamesWalker& operator=(const CaptureNamesWalker&) = delete; }; -map<int, string>* Regexp::CaptureNames() { +std::map<int, string>* Regexp::CaptureNames() { CaptureNamesWalker w; w.Walk(this, 0); return w.TakeMap(); @@ -614,7 +655,7 @@ map<int, string>* Regexp::CaptureNames() { // with a fixed string prefix. If so, returns the prefix and // the regexp that remains after the prefix. The prefix might // be ASCII case-insensitive. -bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) { +bool Regexp::RequiredPrefix(string* prefix, bool* foldcase, Regexp** suffix) { // No need for a walker: the regexp must be of the form // 1. some number of ^ anchors // 2. a literal char or string @@ -643,7 +684,7 @@ bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) { if (re->parse_flags() & Latin1) { prefix->resize(re->nrunes_); for (int j = 0; j < re->nrunes_; j++) - (*prefix)[j] = re->runes_[j]; + (*prefix)[j] = static_cast<char>(re->runes_[j]); } else { // Convert to UTF-8 in place. // Assume worst-case space and then trim. @@ -652,7 +693,7 @@ bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) { for (int j = 0; j < re->nrunes_; j++) { Rune r = re->runes_[j]; if (r < Runeself) - *p++ = r; + *p++ = static_cast<char>(r); else p += runetochar(p, &r); } @@ -662,14 +703,14 @@ bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) { case kRegexpLiteral: if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) { - prefix->append(1, re->rune_); + prefix->append(1, static_cast<char>(re->rune_)); } else { char buf[UTFmax]; prefix->append(buf, runetochar(buf, &re->rune_)); } break; } - *foldcase = (sub[i]->parse_flags() & FoldCase); + *foldcase = (sub[i]->parse_flags() & FoldCase) != 0; i++; // The rest. @@ -704,13 +745,13 @@ bool CharClassBuilder::AddRange(Rune lo, Rune hi) { if (lo <= 'z' && hi >= 'A') { // Overlaps some alpha, maybe not all. // Update bitmaps telling which ASCII letters are in the set. - Rune lo1 = max<Rune>(lo, 'A'); - Rune hi1 = min<Rune>(hi, 'Z'); + Rune lo1 = std::max<Rune>(lo, 'A'); + Rune hi1 = std::min<Rune>(hi, 'Z'); if (lo1 <= hi1) upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A'); - lo1 = max<Rune>(lo, 'a'); - hi1 = min<Rune>(hi, 'z'); + lo1 = std::max<Rune>(lo, 'a'); + hi1 = std::min<Rune>(hi, 'z'); if (lo1 <= hi1) lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a'); } @@ -826,7 +867,7 @@ void CharClassBuilder::RemoveAbove(Rune r) { void CharClassBuilder::Negate() { // Build up negation and then copy in. // Could edit ranges in place, but C++ won't let me. - vector<RuneRange> v; + std::vector<RuneRange> v; v.reserve(ranges_.size() + 1); // In negation, first range begins at 0, unless @@ -849,7 +890,7 @@ void CharClassBuilder::Negate() { } ranges_.clear(); - for (int i = 0; i < v.size(); i++) + for (size_t i = 0; i < v.size(); i++) ranges_.insert(v[i]); upper_ = AlphaMask & ~upper_; @@ -863,7 +904,7 @@ void CharClassBuilder::Negate() { CharClass* CharClass::New(int maxranges) { CharClass* cc; - uint8* data = new uint8[sizeof *cc + maxranges*sizeof cc->ranges_[0]]; + uint8_t* data = new uint8_t[sizeof *cc + maxranges*sizeof cc->ranges_[0]]; cc = reinterpret_cast<CharClass*>(data); cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc); cc->nranges_ = 0; @@ -873,9 +914,7 @@ CharClass* CharClass::New(int maxranges) { } void CharClass::Delete() { - if (this == NULL) - return; - uint8 *data = reinterpret_cast<uint8*>(this); + uint8_t* data = reinterpret_cast<uint8_t*>(this); delete[] data; } @@ -917,12 +956,12 @@ bool CharClass::Contains(Rune r) { } CharClass* CharClassBuilder::GetCharClass() { - CharClass* cc = CharClass::New(ranges_.size()); + CharClass* cc = CharClass::New(static_cast<int>(ranges_.size())); int n = 0; for (iterator it = begin(); it != end(); ++it) cc->ranges_[n++] = *it; cc->nranges_ = n; - DCHECK_LE(n, ranges_.size()); + DCHECK_LE(n, static_cast<int>(ranges_.size())); cc->nrunes_ = nrunes_; cc->folds_ascii_ = FoldsASCII(); return cc; |