aboutsummaryrefslogtreecommitdiff
path: root/re2/regexp.cc
diff options
context:
space:
mode:
authorDavid Neto <dneto@google.com>2019-02-20 17:18:28 -0500
committerDavid Neto <dneto@google.com>2019-02-21 19:52:57 -0500
commit83d14e6912c72138f77617d225a2f4a8786a9d82 (patch)
tree9ff24faf1e0271a77fbb0019edf793cfc0518a93 /re2/regexp.cc
parent03e00e139ba1ef1ac728bb9b78181cf74a6b93be (diff)
parent79ef3b2d31f06493f687ef9e947d9632bad54b9b (diff)
downloadregex-re2-83d14e6912c72138f77617d225a2f4a8786a9d82.tar.gz
Merge remote-tracking branch 'aosp/upstream-master' into up-shaderc2
Refresh with content from github.com/google/re2 commit 79ef3b2d31f06493f687ef9e947d9632bad54b9b date 2019-02-13 Change-Id: I4139f29d632e41722992309e96aca4f29c799c87
Diffstat (limited to 're2/regexp.cc')
-rw-r--r--re2/regexp.cc185
1 files changed, 112 insertions, 73 deletions
diff --git a/re2/regexp.cc b/re2/regexp.cc
index a74ceec..7cfbbcb 100644
--- a/re2/regexp.cc
+++ b/re2/regexp.cc
@@ -5,8 +5,21 @@
// Regular expression representation.
// Tested by parse_test.cc
-#include "util/util.h"
#include "re2/regexp.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <algorithm>
+#include <map>
+#include <mutex>
+#include <string>
+#include <vector>
+
+#include "util/util.h"
+#include "util/logging.h"
+#include "util/mutex.h"
+#include "util/utf.h"
#include "re2/stringpiece.h"
#include "re2/walker-inl.h"
@@ -14,9 +27,9 @@ namespace re2 {
// Constructor. Allocates vectors as appropriate for operator.
Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
- : op_(op),
+ : op_(static_cast<uint8_t>(op)),
simple_(false),
- parse_flags_(static_cast<uint16>(parse_flags)),
+ parse_flags_(static_cast<uint16_t>(parse_flags)),
ref_(1),
nsub_(0),
down_(NULL) {
@@ -43,7 +56,8 @@ Regexp::~Regexp() {
delete[] runes_;
break;
case kRegexpCharClass:
- cc_->Delete();
+ if (cc_)
+ cc_->Delete();
delete ccb_;
break;
}
@@ -59,30 +73,29 @@ bool Regexp::QuickDestroy() {
return false;
}
-static map<Regexp*, int> *ref_map;
-GLOBAL_MUTEX(ref_mutex);
+// Lazily allocated.
+static Mutex* ref_mutex;
+static std::map<Regexp*, int>* ref_map;
int Regexp::Ref() {
if (ref_ < kMaxRef)
return ref_;
- GLOBAL_MUTEX_LOCK(ref_mutex);
- int r = 0;
- if (ref_map != NULL) {
- r = (*ref_map)[this];
- }
- GLOBAL_MUTEX_UNLOCK(ref_mutex);
- return r;
+ MutexLock l(ref_mutex);
+ return (*ref_map)[this];
}
// Increments reference count, returns object as convenience.
Regexp* Regexp::Incref() {
if (ref_ >= kMaxRef-1) {
+ static std::once_flag ref_once;
+ std::call_once(ref_once, []() {
+ ref_mutex = new Mutex;
+ ref_map = new std::map<Regexp*, int>;
+ });
+
// Store ref count in overflow map.
- GLOBAL_MUTEX_LOCK(ref_mutex);
- if (ref_map == NULL) {
- ref_map = new map<Regexp*, int>;
- }
+ MutexLock l(ref_mutex);
if (ref_ == kMaxRef) {
// already overflowed
(*ref_map)[this]++;
@@ -91,7 +104,6 @@ Regexp* Regexp::Incref() {
(*ref_map)[this] = kMaxRef;
ref_ = kMaxRef;
}
- GLOBAL_MUTEX_UNLOCK(ref_mutex);
return this;
}
@@ -103,15 +115,14 @@ Regexp* Regexp::Incref() {
void Regexp::Decref() {
if (ref_ == kMaxRef) {
// Ref count is stored in overflow map.
- GLOBAL_MUTEX_LOCK(ref_mutex);
+ MutexLock l(ref_mutex);
int r = (*ref_map)[this] - 1;
if (r < kMaxRef) {
- ref_ = r;
+ ref_ = static_cast<uint16_t>(r);
ref_map->erase(this);
} else {
(*ref_map)[this] = r;
}
- GLOBAL_MUTEX_UNLOCK(ref_mutex);
return;
}
ref_--;
@@ -179,31 +190,45 @@ Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
return re;
}
-Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
- if (sub->op() == kRegexpPlus && sub->parse_flags() == flags)
+Regexp* Regexp::StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags) {
+ // Squash **, ++ and ??.
+ if (op == sub->op() && flags == sub->parse_flags())
return sub;
- Regexp* re = new Regexp(kRegexpPlus, flags);
+
+ // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because
+ // op is Star/Plus/Quest, we just have to check that sub->op() is too.
+ if ((sub->op() == kRegexpStar ||
+ sub->op() == kRegexpPlus ||
+ sub->op() == kRegexpQuest) &&
+ flags == sub->parse_flags()) {
+ // If sub is Star, no need to rewrite it.
+ if (sub->op() == kRegexpStar)
+ return sub;
+
+ // Rewrite sub to Star.
+ Regexp* re = new Regexp(kRegexpStar, flags);
+ re->AllocSub(1);
+ re->sub()[0] = sub->sub()[0]->Incref();
+ sub->Decref(); // We didn't consume the reference after all.
+ return re;
+ }
+
+ Regexp* re = new Regexp(op, flags);
re->AllocSub(1);
re->sub()[0] = sub;
return re;
}
+Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
+ return StarPlusOrQuest(kRegexpPlus, sub, flags);
+}
+
Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
- if (sub->op() == kRegexpStar && sub->parse_flags() == flags)
- return sub;
- Regexp* re = new Regexp(kRegexpStar, flags);
- re->AllocSub(1);
- re->sub()[0] = sub;
- return re;
+ return StarPlusOrQuest(kRegexpStar, sub, flags);
}
Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
- if (sub->op() == kRegexpQuest && sub->parse_flags() == flags)
- return sub;
- Regexp* re = new Regexp(kRegexpQuest, flags);
- re->AllocSub(1);
- re->sub()[0] = sub;
- return re;
+ return StarPlusOrQuest(kRegexpQuest, sub, flags);
}
Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
@@ -211,6 +236,13 @@ Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
if (nsub == 1)
return sub[0];
+ if (nsub == 0) {
+ if (op == kRegexpAlternate)
+ return new Regexp(kRegexpNoMatch, flags);
+ else
+ return new Regexp(kRegexpEmptyMatch, flags);
+ }
+
Regexp** subcopy = NULL;
if (op == kRegexpAlternate && can_factor) {
// Going to edit sub; make a copy so we don't step on caller.
@@ -303,13 +335,15 @@ Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
return re;
}
-// Swaps this and that in place.
void Regexp::Swap(Regexp* that) {
- // Can use memmove because Regexp is just a struct (no vtable).
+ // Regexp is not trivially copyable, so we cannot freely copy it with
+ // memmove(3), but swapping objects like so is safe for our purposes.
char tmp[sizeof *this];
- memmove(tmp, this, sizeof tmp);
- memmove(this, that, sizeof tmp);
- memmove(that, tmp, sizeof tmp);
+ void* vthis = reinterpret_cast<void*>(this);
+ void* vthat = reinterpret_cast<void*>(that);
+ memmove(tmp, vthis, sizeof *this);
+ memmove(vthis, vthat, sizeof *this);
+ memmove(vthat, tmp, sizeof *this);
}
// Tests equality of all top-level structure but not subregexps.
@@ -405,7 +439,7 @@ bool Regexp::Equal(Regexp* a, Regexp* b) {
// The stack (vector) has pairs of regexps waiting to
// be compared. The regexps are only equal if
// all the pairs end up being equal.
- vector<Regexp*> stk;
+ std::vector<Regexp*> stk;
for (;;) {
// Invariant: TopEqual(a, b) == true.
@@ -445,10 +479,11 @@ bool Regexp::Equal(Regexp* a, Regexp* b) {
continue;
}
- int n = stk.size();
+ size_t n = stk.size();
if (n == 0)
break;
+ DCHECK_GE(n, 2);
a = stk[n-2];
b = stk[n-1];
stk.resize(n-2);
@@ -517,7 +552,9 @@ class NumCapturesWalker : public Regexp::Walker<Ignored> {
private:
int ncapture_;
- DISALLOW_EVIL_CONSTRUCTORS(NumCapturesWalker);
+
+ NumCapturesWalker(const NumCapturesWalker&) = delete;
+ NumCapturesWalker& operator=(const NumCapturesWalker&) = delete;
};
int Regexp::NumCaptures() {
@@ -532,8 +569,8 @@ class NamedCapturesWalker : public Regexp::Walker<Ignored> {
NamedCapturesWalker() : map_(NULL) {}
~NamedCapturesWalker() { delete map_; }
- map<string, int>* TakeMap() {
- map<string, int>* m = map_;
+ std::map<string, int>* TakeMap() {
+ std::map<string, int>* m = map_;
map_ = NULL;
return m;
}
@@ -542,7 +579,7 @@ class NamedCapturesWalker : public Regexp::Walker<Ignored> {
if (re->op() == kRegexpCapture && re->name() != NULL) {
// Allocate map once we find a name.
if (map_ == NULL)
- map_ = new map<string, int>;
+ map_ = new std::map<string, int>;
// Record first occurrence of each name.
// (The rule is that if you have the same name
@@ -560,11 +597,13 @@ class NamedCapturesWalker : public Regexp::Walker<Ignored> {
}
private:
- map<string, int>* map_;
- DISALLOW_EVIL_CONSTRUCTORS(NamedCapturesWalker);
+ std::map<string, int>* map_;
+
+ NamedCapturesWalker(const NamedCapturesWalker&) = delete;
+ NamedCapturesWalker& operator=(const NamedCapturesWalker&) = delete;
};
-map<string, int>* Regexp::NamedCaptures() {
+std::map<string, int>* Regexp::NamedCaptures() {
NamedCapturesWalker w;
w.Walk(this, 0);
return w.TakeMap();
@@ -576,8 +615,8 @@ class CaptureNamesWalker : public Regexp::Walker<Ignored> {
CaptureNamesWalker() : map_(NULL) {}
~CaptureNamesWalker() { delete map_; }
- map<int, string>* TakeMap() {
- map<int, string>* m = map_;
+ std::map<int, string>* TakeMap() {
+ std::map<int, string>* m = map_;
map_ = NULL;
return m;
}
@@ -586,7 +625,7 @@ class CaptureNamesWalker : public Regexp::Walker<Ignored> {
if (re->op() == kRegexpCapture && re->name() != NULL) {
// Allocate map once we find a name.
if (map_ == NULL)
- map_ = new map<int, string>;
+ map_ = new std::map<int, string>;
(*map_)[re->cap()] = *re->name();
}
@@ -600,11 +639,13 @@ class CaptureNamesWalker : public Regexp::Walker<Ignored> {
}
private:
- map<int, string>* map_;
- DISALLOW_EVIL_CONSTRUCTORS(CaptureNamesWalker);
+ std::map<int, string>* map_;
+
+ CaptureNamesWalker(const CaptureNamesWalker&) = delete;
+ CaptureNamesWalker& operator=(const CaptureNamesWalker&) = delete;
};
-map<int, string>* Regexp::CaptureNames() {
+std::map<int, string>* Regexp::CaptureNames() {
CaptureNamesWalker w;
w.Walk(this, 0);
return w.TakeMap();
@@ -614,7 +655,7 @@ map<int, string>* Regexp::CaptureNames() {
// with a fixed string prefix. If so, returns the prefix and
// the regexp that remains after the prefix. The prefix might
// be ASCII case-insensitive.
-bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
+bool Regexp::RequiredPrefix(string* prefix, bool* foldcase, Regexp** suffix) {
// No need for a walker: the regexp must be of the form
// 1. some number of ^ anchors
// 2. a literal char or string
@@ -643,7 +684,7 @@ bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
if (re->parse_flags() & Latin1) {
prefix->resize(re->nrunes_);
for (int j = 0; j < re->nrunes_; j++)
- (*prefix)[j] = re->runes_[j];
+ (*prefix)[j] = static_cast<char>(re->runes_[j]);
} else {
// Convert to UTF-8 in place.
// Assume worst-case space and then trim.
@@ -652,7 +693,7 @@ bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
for (int j = 0; j < re->nrunes_; j++) {
Rune r = re->runes_[j];
if (r < Runeself)
- *p++ = r;
+ *p++ = static_cast<char>(r);
else
p += runetochar(p, &r);
}
@@ -662,14 +703,14 @@ bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
case kRegexpLiteral:
if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) {
- prefix->append(1, re->rune_);
+ prefix->append(1, static_cast<char>(re->rune_));
} else {
char buf[UTFmax];
prefix->append(buf, runetochar(buf, &re->rune_));
}
break;
}
- *foldcase = (sub[i]->parse_flags() & FoldCase);
+ *foldcase = (sub[i]->parse_flags() & FoldCase) != 0;
i++;
// The rest.
@@ -704,13 +745,13 @@ bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
if (lo <= 'z' && hi >= 'A') {
// Overlaps some alpha, maybe not all.
// Update bitmaps telling which ASCII letters are in the set.
- Rune lo1 = max<Rune>(lo, 'A');
- Rune hi1 = min<Rune>(hi, 'Z');
+ Rune lo1 = std::max<Rune>(lo, 'A');
+ Rune hi1 = std::min<Rune>(hi, 'Z');
if (lo1 <= hi1)
upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
- lo1 = max<Rune>(lo, 'a');
- hi1 = min<Rune>(hi, 'z');
+ lo1 = std::max<Rune>(lo, 'a');
+ hi1 = std::min<Rune>(hi, 'z');
if (lo1 <= hi1)
lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
}
@@ -826,7 +867,7 @@ void CharClassBuilder::RemoveAbove(Rune r) {
void CharClassBuilder::Negate() {
// Build up negation and then copy in.
// Could edit ranges in place, but C++ won't let me.
- vector<RuneRange> v;
+ std::vector<RuneRange> v;
v.reserve(ranges_.size() + 1);
// In negation, first range begins at 0, unless
@@ -849,7 +890,7 @@ void CharClassBuilder::Negate() {
}
ranges_.clear();
- for (int i = 0; i < v.size(); i++)
+ for (size_t i = 0; i < v.size(); i++)
ranges_.insert(v[i]);
upper_ = AlphaMask & ~upper_;
@@ -863,7 +904,7 @@ void CharClassBuilder::Negate() {
CharClass* CharClass::New(int maxranges) {
CharClass* cc;
- uint8* data = new uint8[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
+ uint8_t* data = new uint8_t[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
cc = reinterpret_cast<CharClass*>(data);
cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
cc->nranges_ = 0;
@@ -873,9 +914,7 @@ CharClass* CharClass::New(int maxranges) {
}
void CharClass::Delete() {
- if (this == NULL)
- return;
- uint8 *data = reinterpret_cast<uint8*>(this);
+ uint8_t* data = reinterpret_cast<uint8_t*>(this);
delete[] data;
}
@@ -917,12 +956,12 @@ bool CharClass::Contains(Rune r) {
}
CharClass* CharClassBuilder::GetCharClass() {
- CharClass* cc = CharClass::New(ranges_.size());
+ CharClass* cc = CharClass::New(static_cast<int>(ranges_.size()));
int n = 0;
for (iterator it = begin(); it != end(); ++it)
cc->ranges_[n++] = *it;
cc->nranges_ = n;
- DCHECK_LE(n, ranges_.size());
+ DCHECK_LE(n, static_cast<int>(ranges_.size()));
cc->nrunes_ = nrunes_;
cc->folds_ascii_ = FoldsASCII();
return cc;