From 0d4c52358a1af421705c54bd8a9fdd8a30558a2e Mon Sep 17 00:00:00 2001 From: Alexander Gutkin Date: Thu, 28 Feb 2013 13:47:27 +0000 Subject: Updating re2 to the re2-20130115. Updating RE2 to latest from: https://re2.googlecode.com/files/re2-20130115.tgz Change-Id: I0b2527af4443bf8815db2c78ef14f6edfe3ecb37 --- CONTRIBUTORS | 2 + NOTICE | 27 - README.android | 31 +- doc/mksyntaxgo | 41 + doc/syntax.txt | 4 +- lib/codereview/codereview.py | 1267 ++++++++++++++++------------- re2/compile.cc | 11 +- re2/dfa.cc | 100 ++- re2/filtered_re2.cc | 6 +- re2/nfa.cc | 6 +- re2/parse.cc | 17 +- re2/prefilter.cc | 66 +- re2/re2.cc | 102 ++- re2/re2.h | 78 +- re2/regexp.cc | 45 +- re2/regexp.h | 3 +- re2/testing/backtrack.cc | 254 ++++++ re2/testing/charclass_test.cc | 223 ++++++ re2/testing/compile_test.cc | 171 ++++ re2/testing/dfa_test.cc | 344 ++++++++ re2/testing/dump.cc | 164 ++++ re2/testing/exhaustive1_test.cc | 42 + re2/testing/exhaustive2_test.cc | 70 ++ re2/testing/exhaustive3_test.cc | 94 +++ re2/testing/exhaustive_test.cc | 38 + re2/testing/exhaustive_tester.cc | 188 +++++ re2/testing/exhaustive_tester.h | 85 ++ re2/testing/filtered_re2_test.cc | 275 +++++++ re2/testing/mimics_pcre_test.cc | 76 ++ re2/testing/null_walker.cc | 44 + re2/testing/parse_test.cc | 433 ++++++++++ re2/testing/possible_match_test.cc | 240 ++++++ re2/testing/random_test.cc | 95 +++ re2/testing/re2_arg_test.cc | 133 ++++ re2/testing/re2_test.cc | 1371 +++++++++++++++++++++++++++++++ re2/testing/regexp_benchmark.cc | 1461 ++++++++++++++++++++++++++++++++++ re2/testing/regexp_generator.cc | 264 ++++++ re2/testing/regexp_generator.h | 70 ++ re2/testing/regexp_test.cc | 81 ++ re2/testing/required_prefix_test.cc | 67 ++ re2/testing/search_test.cc | 325 ++++++++ re2/testing/set_test.cc | 114 +++ re2/testing/simplify_test.cc | 167 ++++ re2/testing/string_generator.cc | 113 +++ re2/testing/string_generator.h | 58 ++ re2/testing/string_generator_test.cc | 109 +++ re2/testing/tester.cc | 640 
+++++++++++++++ re2/testing/tester.h | 121 +++ re2/testing/unicode_test.py | 207 +++++ re2/unicode.py | 0 testinstall.cc | 4 + util/logging.h | 16 +- util/mutex.h | 23 +- util/sparse_array.h | 8 +- util/sparse_set.h | 10 +- util/util.h | 9 +- util/valgrind.cc | 10 +- 57 files changed, 9286 insertions(+), 737 deletions(-) delete mode 100644 NOTICE create mode 100755 doc/mksyntaxgo create mode 100644 re2/testing/backtrack.cc create mode 100644 re2/testing/charclass_test.cc create mode 100644 re2/testing/compile_test.cc create mode 100644 re2/testing/dfa_test.cc create mode 100644 re2/testing/dump.cc create mode 100644 re2/testing/exhaustive1_test.cc create mode 100644 re2/testing/exhaustive2_test.cc create mode 100644 re2/testing/exhaustive3_test.cc create mode 100644 re2/testing/exhaustive_test.cc create mode 100644 re2/testing/exhaustive_tester.cc create mode 100644 re2/testing/exhaustive_tester.h create mode 100644 re2/testing/filtered_re2_test.cc create mode 100644 re2/testing/mimics_pcre_test.cc create mode 100644 re2/testing/null_walker.cc create mode 100644 re2/testing/parse_test.cc create mode 100644 re2/testing/possible_match_test.cc create mode 100644 re2/testing/random_test.cc create mode 100644 re2/testing/re2_arg_test.cc create mode 100644 re2/testing/re2_test.cc create mode 100644 re2/testing/regexp_benchmark.cc create mode 100644 re2/testing/regexp_generator.cc create mode 100644 re2/testing/regexp_generator.h create mode 100644 re2/testing/regexp_test.cc create mode 100644 re2/testing/required_prefix_test.cc create mode 100644 re2/testing/search_test.cc create mode 100644 re2/testing/set_test.cc create mode 100644 re2/testing/simplify_test.cc create mode 100644 re2/testing/string_generator.cc create mode 100644 re2/testing/string_generator.h create mode 100644 re2/testing/string_generator_test.cc create mode 100644 re2/testing/tester.cc create mode 100644 re2/testing/tester.h create mode 100755 re2/testing/unicode_test.py mode change 100755 => 100644 
re2/unicode.py diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 981ce02..7b44e04 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -26,6 +26,8 @@ # Please keep the list sorted. +Dominic Battré +John Millikin Rob Pike Russ Cox Sanjay Ghemawat diff --git a/NOTICE b/NOTICE deleted file mode 100644 index 09e5ec1..0000000 --- a/NOTICE +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright (c) 2009 The RE2 Authors. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/README.android b/README.android index 453cd80..4911c3c 100644 --- a/README.android +++ b/README.android @@ -1,29 +1,28 @@ -Code obtained via ------------------ -hg clone https://re2.googlecode.com/hg re2 +Code obtained from +------------------ + +https://re2.googlecode.com/files/re2-20130115.tgz Version ------- -hg identify -2d252384c5e8 tip +re2-20130115.tgz Changes required to build using stlport on Android as follows (full diff) ------------------------------------------------------------------------- -diff -r ./re2/parse.cc /home/idh/temp9/re2/re2/parse.cc -19d18 -< #include -diff -r ./re2/re2.cc /home/idh/temp9/re2/re2/re2.cc -13d12 -< #include -Only in /home/idh/temp9/re2/re2: testing -diff -r ./util/util.h /home/idh/temp9/re2/util/util.h -43,48c43 +util/util.h: + +44,53c44 < #if defined(ANDROID) < -< #include +< #if defined(_STLPORT_VERSION) +< #include // using stlport +< #else +< #include // using gnustl +< #endif < using std::tr1::unordered_set; -< +< < #elif defined(__GNUC__) && !defined(USE_CXX0X) --- > #if defined(__GNUC__) && !defined(USE_CXX0X) + diff --git a/doc/mksyntaxgo b/doc/mksyntaxgo new file mode 100755 index 0000000..42e87d6 --- /dev/null +++ b/doc/mksyntaxgo @@ -0,0 +1,41 @@ +#!/bin/sh + +set -e +out=$GOROOT/src/pkg/regexp/syntax/doc.go +cp syntax.txt $out +sam -d $out <<'!' +,x g/NOT SUPPORTED/d +/^Unicode character class/,$d +,s/[«»]//g +,x g/^Possessive repetitions:/d +,x g/\\C/d +,x g/Flag syntax/d +,s/.=(true|false)/flag &/g +,s/^Flags:/ Flag syntax is xyz (set) or -xyz (clear) or xy-z (set xy, clear z). The flags are:\n/ +,s/\n\n\n+/\n\n/g +,x/(^.* .*\n)+/ | awk -F' ' '{printf(" %-14s %s\n", $1, $2)}' +1,2c +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// DO NOT EDIT. This file is generated by mksyntaxgo from the RE2 distribution. 
+ +/* +Package syntax parses regular expressions into parse trees and compiles +parse trees into programs. Most clients of regular expressions will use the +facilities of package regexp (such as Compile and Match) instead of this package. + +Syntax + +The regular expression syntax understood by this package when parsing with the Perl flag is as follows. +Parts of the syntax can be disabled by passing alternate flags to Parse. + +. +$a +*/ +package syntax +. +w +q +! diff --git a/doc/syntax.txt b/doc/syntax.txt index 740e5ce..f940750 100644 --- a/doc/syntax.txt +++ b/doc/syntax.txt @@ -2,7 +2,7 @@ RE2 regular expression syntax reference -------------------------­-------­----- Single characters: -. any character, including newline (s=true) +. any character, possibly including newline (s=true) [xyz] character class [^xyz] negated character class \d Perl character class @@ -60,7 +60,7 @@ re@> possessive match of «re» NOT SUPPORTED vim Flags: i case-insensitive (default false) -m multi-line mode: ^ and $ match begin/end line in addition to begin/end text (default false) +m multi-line mode: «^» and «$» match begin/end line in addition to begin/end text (default false) s let «.» match «\n» (default false) U ungreedy: swap meaning of «x*» and «x*?», «x+» and «x+?», etc (default false) Flag syntax is «xyz» (set) or «-xyz» (clear) or «xy-z» (set «xy», clear «z»). diff --git a/lib/codereview/codereview.py b/lib/codereview/codereview.py index b980929..d26df2a 100644 --- a/lib/codereview/codereview.py +++ b/lib/codereview/codereview.py @@ -22,7 +22,7 @@ To configure, set the following options in your repository's .hg/hgrc file. [extensions] - codereview = path/to/codereview.py + codereview = /path/to/codereview.py [codereview] server = codereview.appspot.com @@ -38,110 +38,60 @@ For example, if change 123456 contains the files x.go and y.go, "hg diff @123456" is equivalent to"hg diff x.go y.go". 
''' -from mercurial import cmdutil, commands, hg, util, error, match, discovery -from mercurial.node import nullrev, hex, nullid, short -import os, re, time -import stat -import subprocess -import threading -from HTMLParser import HTMLParser - -# The standard 'json' package is new in Python 2.6. -# Before that it was an external package named simplejson. -try: - # Standard location in 2.6 and beyond. - import json -except Exception, e: - try: - # Conventional name for earlier package. - import simplejson as json - except: - try: - # Was also bundled with django, which is commonly installed. - from django.utils import simplejson as json - except: - # We give up. - raise e - -try: - hgversion = util.version() -except: - from mercurial.version import version as v - hgversion = v.get_version() - -# in Mercurial 1.9 the cmdutil.match and cmdutil.revpair moved to scmutil -if hgversion >= '1.9': - from mercurial import scmutil -else: - scmutil = cmdutil - -oldMessage = """ -The code review extension requires Mercurial 1.3 or newer. - -To install a new Mercurial, - - sudo easy_install mercurial - -works on most systems. -""" - -linuxMessage = """ -You may need to clear your current Mercurial installation by running: - - sudo apt-get remove mercurial mercurial-common - sudo rm -rf /etc/mercurial -""" +import sys -if hgversion < '1.3': - msg = oldMessage - if os.access("/etc/mercurial", 0): - msg += linuxMessage - raise util.Abort(msg) +if __name__ == "__main__": + print >>sys.stderr, "This is a Mercurial extension and should not be invoked directly." + sys.exit(2) -def promptyesno(ui, msg): - # Arguments to ui.prompt changed between 1.3 and 1.3.1. - # Even so, some 1.3.1 distributions seem to have the old prompt!?!? - # What a terrible way to maintain software. - try: - return ui.promptchoice(msg, ["&yes", "&no"], 0) == 0 - except AttributeError: - return ui.prompt(msg, ["&yes", "&no"], "y") != "n" +# We require Python 2.6 for the json package. 
+if sys.version < '2.6': + print >>sys.stderr, "The codereview extension requires Python 2.6 or newer." + print >>sys.stderr, "You are running Python " + sys.version + sys.exit(2) -def incoming(repo, other): - fui = FakeMercurialUI() - ret = commands.incoming(fui, repo, *[other.path], **{'bundle': '', 'force': False}) - if ret and ret != 1: - raise util.Abort(ret) - out = fui.output - return out +import json +import os +import re +import stat +import subprocess +import threading +import time -def outgoing(repo): - fui = FakeMercurialUI() - ret = commands.outgoing(fui, repo, *[], **{}) - if ret and ret != 1: - raise util.Abort(ret) - out = fui.output - return out +from mercurial import commands as hg_commands +from mercurial import util as hg_util -# To experiment with Mercurial in the python interpreter: -# >>> repo = hg.repository(ui.ui(), path = ".") +defaultcc = None +codereview_disabled = None +real_rollback = None +releaseBranch = None +server = "codereview.appspot.com" +server_url_base = None ####################################################################### # Normally I would split this into multiple files, but it simplifies # import path headaches to keep it all in one file. Sorry. +# The different parts of the file are separated by banners like this one. -import sys -if __name__ == "__main__": - print >>sys.stderr, "This is a Mercurial extension and should not be invoked directly." 
- sys.exit(2) +####################################################################### +# Helpers -server = "codereview.appspot.com" -server_url_base = None -defaultcc = None -contributors = {} -missing_codereview = None -real_rollback = None -releaseBranch = None +def RelativePath(path, cwd): + n = len(cwd) + if path.startswith(cwd) and path[n] == '/': + return path[n+1:] + return path + +def Sub(l1, l2): + return [l for l in l1 if l not in l2] + +def Add(l1, l2): + l = l1 + Sub(l2, l1) + l.sort() + return l + +def Intersect(l1, l2): + return [l for l in l1 if l in l2] ####################################################################### # RE: UNICODE STRING HANDLING @@ -168,7 +118,7 @@ releaseBranch = None def typecheck(s, t): if type(s) != t: - raise util.Abort("type check failed: %s has type %s != %s" % (repr(s), type(s), t)) + raise hg_util.Abort("type check failed: %s has type %s != %s" % (repr(s), type(s), t)) # If we have to pass unicode instead of str, ustr does that conversion clearly. def ustr(s): @@ -199,6 +149,40 @@ def default_to_utf8(): default_to_utf8() +####################################################################### +# Status printer for long-running commands + +global_status = None + +def set_status(s): + # print >>sys.stderr, "\t", time.asctime(), s + global global_status + global_status = s + +class StatusThread(threading.Thread): + def __init__(self): + threading.Thread.__init__(self) + def run(self): + # pause a reasonable amount of time before + # starting to display status messages, so that + # most hg commands won't ever see them. 
+ time.sleep(30) + + # now show status every 15 seconds + while True: + time.sleep(15 - time.time() % 15) + s = global_status + if s is None: + continue + if s == "": + s = "(unknown status)" + print >>sys.stderr, time.asctime(), s + +def start_status_thread(): + t = StatusThread() + t.setDaemon(True) # allowed to exit if t is still running + t.start() + ####################################################################### # Change list parsing. # @@ -275,17 +259,18 @@ class CL(object): typecheck(s, str) return s - def PendingText(self): + def PendingText(self, quick=False): cl = self s = cl.name + ":" + "\n" s += Indent(cl.desc, "\t") s += "\n" if cl.copied_from: s += "\tAuthor: " + cl.copied_from + "\n" - s += "\tReviewer: " + JoinComma(cl.reviewer) + "\n" - for (who, line) in cl.lgtm: - s += "\t\t" + who + ": " + line + "\n" - s += "\tCC: " + JoinComma(cl.cc) + "\n" + if not quick: + s += "\tReviewer: " + JoinComma(cl.reviewer) + "\n" + for (who, line) in cl.lgtm: + s += "\t\t" + who + ": " + line + "\n" + s += "\tCC: " + JoinComma(cl.cc) + "\n" s += "\tFiles:\n" for f in cl.files: s += "\t\t" + f + "\n" @@ -360,7 +345,7 @@ class CL(object): uploaded_diff_file = [("data", "data.diff", emptydiff)] if vcs and self.name != "new": - form_fields.append(("subject", "diff -r " + vcs.base_rev + " " + getremote(ui, repo, {}).path)) + form_fields.append(("subject", "diff -r " + vcs.base_rev + " " + ui.expandpath("default"))) else: # First upload sets the subject for the CL itself. 
form_fields.append(("subject", self.Subject())) @@ -379,7 +364,7 @@ class CL(object): ui.status(msg + "\n") set_status("uploaded CL metadata + diffs") if not response_body.startswith("Issue created.") and not response_body.startswith("Issue updated."): - raise util.Abort("failed to update issue: " + response_body) + raise hg_util.Abort("failed to update issue: " + response_body) issue = msg[msg.rfind("/")+1:] self.name = issue if not self.url: @@ -404,7 +389,7 @@ class CL(object): pmsg += " (cc: %s)" % (', '.join(self.cc),) pmsg += ",\n" pmsg += "\n" - repourl = getremote(ui, repo, {}).path + repourl = ui.expandpath("default") if not self.mailed: pmsg += "I'd like you to review this change to\n" + repourl + "\n" else: @@ -567,37 +552,6 @@ def LoadCL(ui, repo, name, web=True): set_status("loaded CL " + name) return cl, '' -global_status = None - -def set_status(s): - # print >>sys.stderr, "\t", time.asctime(), s - global global_status - global_status = s - -class StatusThread(threading.Thread): - def __init__(self): - threading.Thread.__init__(self) - def run(self): - # pause a reasonable amount of time before - # starting to display status messages, so that - # most hg commands won't ever see them. - time.sleep(30) - - # now show status every 15 seconds - while True: - time.sleep(15 - time.time() % 15) - s = global_status - if s is None: - continue - if s == "": - s = "(unknown status)" - print >>sys.stderr, time.asctime(), s - -def start_status_thread(): - t = StatusThread() - t.setDaemon(True) # allowed to exit if t is still running - t.start() - class LoadCLThread(threading.Thread): def __init__(self, ui, repo, dir, f, web): threading.Thread.__init__(self) @@ -735,101 +689,6 @@ _change_prolog = """# Change list. # Multi-line values should be indented. 
""" -####################################################################### -# Mercurial helper functions - -# Get effective change nodes taking into account applied MQ patches -def effective_revpair(repo): - try: - return scmutil.revpair(repo, ['qparent']) - except: - return scmutil.revpair(repo, None) - -# Return list of changed files in repository that match pats. -# Warn about patterns that did not match. -def matchpats(ui, repo, pats, opts): - matcher = scmutil.match(repo, pats, opts) - node1, node2 = effective_revpair(repo) - modified, added, removed, deleted, unknown, ignored, clean = repo.status(node1, node2, matcher, ignored=True, clean=True, unknown=True) - return (modified, added, removed, deleted, unknown, ignored, clean) - -# Return list of changed files in repository that match pats. -# The patterns came from the command line, so we warn -# if they have no effect or cannot be understood. -def ChangedFiles(ui, repo, pats, opts, taken=None): - taken = taken or {} - # Run each pattern separately so that we can warn about - # patterns that didn't do anything useful. - for p in pats: - modified, added, removed, deleted, unknown, ignored, clean = matchpats(ui, repo, [p], opts) - redo = False - for f in unknown: - promptadd(ui, repo, f) - redo = True - for f in deleted: - promptremove(ui, repo, f) - redo = True - if redo: - modified, added, removed, deleted, unknown, ignored, clean = matchpats(ui, repo, [p], opts) - for f in modified + added + removed: - if f in taken: - ui.warn("warning: %s already in CL %s\n" % (f, taken[f].name)) - if not modified and not added and not removed: - ui.warn("warning: %s did not match any modified files\n" % (p,)) - - # Again, all at once (eliminates duplicates) - modified, added, removed = matchpats(ui, repo, pats, opts)[:3] - l = modified + added + removed - l.sort() - if taken: - l = Sub(l, taken.keys()) - return l - -# Return list of changed files in repository that match pats and still exist. 
-def ChangedExistingFiles(ui, repo, pats, opts): - modified, added = matchpats(ui, repo, pats, opts)[:2] - l = modified + added - l.sort() - return l - -# Return list of files claimed by existing CLs -def Taken(ui, repo): - all = LoadAllCL(ui, repo, web=False) - taken = {} - for _, cl in all.items(): - for f in cl.files: - taken[f] = cl - return taken - -# Return list of changed files that are not claimed by other CLs -def DefaultFiles(ui, repo, pats, opts): - return ChangedFiles(ui, repo, pats, opts, taken=Taken(ui, repo)) - -def Sub(l1, l2): - return [l for l in l1 if l not in l2] - -def Add(l1, l2): - l = l1 + Sub(l2, l1) - l.sort() - return l - -def Intersect(l1, l2): - return [l for l in l1 if l in l2] - -def getremote(ui, repo, opts): - # save $http_proxy; creating the HTTP repo object will - # delete it in an attempt to "help" - proxy = os.environ.get('http_proxy') - source = hg.parseurl(ui.expandpath("default"), None)[0] - try: - remoteui = hg.remoteui # hg 1.6 - except: - remoteui = cmdutil.remoteui - other = hg.repository(remoteui(repo, opts), source) - if proxy is not None: - os.environ['http_proxy'] = proxy - return other - desc_re = '^(.+: |(tag )?(release|weekly)\.|fix build|undo CL)' desc_msg = '''Your CL description appears not to use the standard form. @@ -851,15 +710,17 @@ Examples: ''' +def promptyesno(ui, msg): + return ui.promptchoice(msg, ["&yes", "&no"], 0) == 0 def promptremove(ui, repo, f): if promptyesno(ui, "hg remove %s (y/n)?" % (f,)): - if commands.remove(ui, repo, 'path:'+f) != 0: + if hg_commands.remove(ui, repo, 'path:'+f) != 0: ui.warn("error removing %s" % (f,)) def promptadd(ui, repo, f): if promptyesno(ui, "hg add %s (y/n)?" % (f,)): - if commands.add(ui, repo, 'path:'+f) != 0: + if hg_commands.add(ui, repo, 'path:'+f) != 0: ui.warn("error adding %s" % (f,)) def EditCL(ui, repo, cl): @@ -899,10 +760,14 @@ def EditCL(ui, repo, cl): # Check file list for files that need to be hg added or hg removed # or simply aren't understood. 
pats = ['path:'+f for f in clx.files] - modified, added, removed, deleted, unknown, ignored, clean = matchpats(ui, repo, pats, {}) + changed = hg_matchPattern(ui, repo, *pats, modified=True, added=True, removed=True) + deleted = hg_matchPattern(ui, repo, *pats, deleted=True) + unknown = hg_matchPattern(ui, repo, *pats, unknown=True) + ignored = hg_matchPattern(ui, repo, *pats, ignored=True) + clean = hg_matchPattern(ui, repo, *pats, clean=True) files = [] for f in clx.files: - if f in modified or f in added or f in removed: + if f in changed: files.append(f) continue if f in deleted: @@ -954,7 +819,7 @@ def CommandLineCL(ui, repo, pats, opts, defaultcc=None): else: cl = CL("new") cl.local = True - cl.files = ChangedFiles(ui, repo, pats, opts, taken=Taken(ui, repo)) + cl.files = ChangedFiles(ui, repo, pats, taken=Taken(ui, repo)) if not cl.files: return None, "no files changed" if opts.get('reviewer'): @@ -972,42 +837,56 @@ def CommandLineCL(ui, repo, pats, opts, defaultcc=None): return None, err return cl, "" -# reposetup replaces cmdutil.match with this wrapper, -# which expands the syntax @clnumber to mean the files -# in that CL. -original_match = None -global_repo = None -global_ui = None -def ReplacementForCmdutilMatch(ctx, pats=None, opts=None, globbed=False, default='relpath'): - taken = [] - files = [] - pats = pats or [] - opts = opts or {} - +####################################################################### +# Change list file management + +# Return list of changed files in repository that match pats. +# The patterns came from the command line, so we warn +# if they have no effect or cannot be understood. +def ChangedFiles(ui, repo, pats, taken=None): + taken = taken or {} + # Run each pattern separately so that we can warn about + # patterns that didn't do anything useful. 
for p in pats: - if p.startswith('@'): - taken.append(p) - clname = p[1:] - if not GoodCLName(clname): - raise util.Abort("invalid CL name " + clname) - cl, err = LoadCL(global_repo.ui, global_repo, clname, web=False) - if err != '': - raise util.Abort("loading CL " + clname + ": " + err) - if not cl.files: - raise util.Abort("no files in CL " + clname) - files = Add(files, cl.files) - pats = Sub(pats, taken) + ['path:'+f for f in files] + for f in hg_matchPattern(ui, repo, p, unknown=True): + promptadd(ui, repo, f) + for f in hg_matchPattern(ui, repo, p, removed=True): + promptremove(ui, repo, f) + files = hg_matchPattern(ui, repo, p, modified=True, added=True, removed=True) + for f in files: + if f in taken: + ui.warn("warning: %s already in CL %s\n" % (f, taken[f].name)) + if not files: + ui.warn("warning: %s did not match any modified files\n" % (p,)) - # work-around for http://selenic.com/hg/rev/785bbc8634f8 - if hgversion >= '1.9' and not hasattr(ctx, 'match'): - ctx = ctx[None] - return original_match(ctx, pats=pats, opts=opts, globbed=globbed, default=default) + # Again, all at once (eliminates duplicates) + l = hg_matchPattern(ui, repo, *pats, modified=True, added=True, removed=True) + l.sort() + if taken: + l = Sub(l, taken.keys()) + return l -def RelativePath(path, cwd): - n = len(cwd) - if path.startswith(cwd) and path[n] == '/': - return path[n+1:] - return path +# Return list of changed files in repository that match pats and still exist. 
+def ChangedExistingFiles(ui, repo, pats, opts): + l = hg_matchPattern(ui, repo, *pats, modified=True, added=True) + l.sort() + return l + +# Return list of files claimed by existing CLs +def Taken(ui, repo): + all = LoadAllCL(ui, repo, web=False) + taken = {} + for _, cl in all.items(): + for f in cl.files: + taken[f] = cl + return taken + +# Return list of changed files that are not claimed by other CLs +def DefaultFiles(ui, repo, pats): + return ChangedFiles(ui, repo, pats, taken=Taken(ui, repo)) + +####################################################################### +# File format checking. def CheckFormat(ui, repo, files, just_warn=False): set_status("running gofmt") @@ -1016,7 +895,7 @@ def CheckFormat(ui, repo, files, just_warn=False): # Check that gofmt run on the list of files does not change them def CheckGofmt(ui, repo, files, just_warn): - files = [f for f in files if (f.startswith('src/') or f.startswith('test/bench/')) and f.endswith('.go')] + files = gofmt_required(files) if not files: return cwd = os.getcwd() @@ -1028,7 +907,7 @@ def CheckGofmt(ui, repo, files, just_warn): cmd = subprocess.Popen(["gofmt", "-l"] + files, shell=False, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=sys.platform != "win32") cmd.stdin.close() except: - raise util.Abort("gofmt: " + ExceptionDetail()) + raise hg_util.Abort("gofmt: " + ExceptionDetail()) data = cmd.stdout.read() errors = cmd.stderr.read() cmd.wait() @@ -1041,12 +920,12 @@ def CheckGofmt(ui, repo, files, just_warn): if just_warn: ui.warn("warning: " + msg + "\n") else: - raise util.Abort(msg) + raise hg_util.Abort(msg) return # Check that *.[chys] files indent using tabs. 
def CheckTabfmt(ui, repo, files, just_warn): - files = [f for f in files if f.startswith('src/') and re.search(r"\.[chys]$", f)] + files = [f for f in files if f.startswith('src/') and re.search(r"\.[chys]$", f) and not re.search(r"\.tab\.[ch]$", f)] if not files: return cwd = os.getcwd() @@ -1070,20 +949,327 @@ def CheckTabfmt(ui, repo, files, just_warn): if just_warn: ui.warn("warning: " + msg + "\n") else: - raise util.Abort(msg) + raise hg_util.Abort(msg) return ####################################################################### -# Mercurial commands +# CONTRIBUTORS file parsing -# every command must take a ui and and repo as arguments. -# opts is a dict where you can find other command line flags -# -# Other parameters are taken in order from items on the command line that -# don't start with a dash. If no default value is given in the parameter list, -# they are required. -# +contributorsCache = None +contributorsURL = None + +def ReadContributors(ui, repo): + global contributorsCache + if contributorsCache is not None: + return contributorsCache + + try: + if contributorsURL is not None: + opening = contributorsURL + f = urllib2.urlopen(contributorsURL) + else: + opening = repo.root + '/CONTRIBUTORS' + f = open(repo.root + '/CONTRIBUTORS', 'r') + except: + ui.write("warning: cannot open %s: %s\n" % (opening, ExceptionDetail())) + return + + contributors = {} + for line in f: + # CONTRIBUTORS is a list of lines like: + # Person + # Person + # The first email address is the one used in commit logs. 
+ if line.startswith('#'): + continue + m = re.match(r"([^<>]+\S)\s+(<[^<>\s]+>)((\s+<[^<>\s]+>)*)\s*$", line) + if m: + name = m.group(1) + email = m.group(2)[1:-1] + contributors[email.lower()] = (name, email) + for extra in m.group(3).split(): + contributors[extra[1:-1].lower()] = (name, email) + + contributorsCache = contributors + return contributors + +def CheckContributor(ui, repo, user=None): + set_status("checking CONTRIBUTORS file") + user, userline = FindContributor(ui, repo, user, warn=False) + if not userline: + raise hg_util.Abort("cannot find %s in CONTRIBUTORS" % (user,)) + return userline + +def FindContributor(ui, repo, user=None, warn=True): + if not user: + user = ui.config("ui", "username") + if not user: + raise hg_util.Abort("[ui] username is not configured in .hgrc") + user = user.lower() + m = re.match(r".*<(.*)>", user) + if m: + user = m.group(1) + + contributors = ReadContributors(ui, repo) + if user not in contributors: + if warn: + ui.warn("warning: cannot find %s in CONTRIBUTORS\n" % (user,)) + return user, None + + user, email = contributors[user] + return email, "%s <%s>" % (user, email) + +####################################################################### +# Mercurial helper functions. +# Read http://mercurial.selenic.com/wiki/MercurialApi before writing any of these. +# We use the ui.pushbuffer/ui.popbuffer + hg_commands.xxx tricks for all interaction +# with Mercurial. It has proved the most stable as they make changes. + +hgversion = hg_util.version() + +# We require Mercurial 1.9 and suggest Mercurial 2.0. +# The details of the scmutil package changed then, +# so allowing earlier versions would require extra band-aids below. +# Ubuntu 11.10 ships with Mercurial 1.9.1 as the default version. +hg_required = "1.9" +hg_suggested = "2.0" + +old_message = """ + +The code review extension requires Mercurial """+hg_required+""" or newer. +You are using Mercurial """+hgversion+""". 
+ +To install a new Mercurial, use + + sudo easy_install mercurial=="""+hg_suggested+""" + +or visit http://mercurial.selenic.com/downloads/. +""" + +linux_message = """ +You may need to clear your current Mercurial installation by running: + + sudo apt-get remove mercurial mercurial-common + sudo rm -rf /etc/mercurial +""" + +if hgversion < hg_required: + msg = old_message + if os.access("/etc/mercurial", 0): + msg += linux_message + raise hg_util.Abort(msg) + +from mercurial.hg import clean as hg_clean +from mercurial import cmdutil as hg_cmdutil +from mercurial import error as hg_error +from mercurial import match as hg_match +from mercurial import node as hg_node + +class uiwrap(object): + def __init__(self, ui): + self.ui = ui + ui.pushbuffer() + self.oldQuiet = ui.quiet + ui.quiet = True + self.oldVerbose = ui.verbose + ui.verbose = False + def output(self): + ui = self.ui + ui.quiet = self.oldQuiet + ui.verbose = self.oldVerbose + return ui.popbuffer() + +def to_slash(path): + if sys.platform == "win32": + return path.replace('\\', '/') + return path + +def hg_matchPattern(ui, repo, *pats, **opts): + w = uiwrap(ui) + hg_commands.status(ui, repo, *pats, **opts) + text = w.output() + ret = [] + prefix = to_slash(os.path.realpath(repo.root))+'/' + for line in text.split('\n'): + f = line.split() + if len(f) > 1: + if len(pats) > 0: + # Given patterns, Mercurial shows relative to cwd + p = to_slash(os.path.realpath(f[1])) + if not p.startswith(prefix): + print >>sys.stderr, "File %s not in repo root %s.\n" % (p, prefix) + else: + ret.append(p[len(prefix):]) + else: + # Without patterns, Mercurial shows relative to root (what we want) + ret.append(to_slash(f[1])) + return ret + +def hg_heads(ui, repo): + w = uiwrap(ui) + hg_commands.heads(ui, repo) + return w.output() + +noise = [ + "", + "resolving manifests", + "searching for changes", + "couldn't find merge tool hgmerge", + "adding changesets", + "adding manifests", + "adding file changes", + "all local heads 
known remotely", +] + +def isNoise(line): + line = str(line) + for x in noise: + if line == x: + return True + return False + +def hg_incoming(ui, repo): + w = uiwrap(ui) + ret = hg_commands.incoming(ui, repo, force=False, bundle="") + if ret and ret != 1: + raise hg_util.Abort(ret) + return w.output() + +def hg_log(ui, repo, **opts): + for k in ['date', 'keyword', 'rev', 'user']: + if not opts.has_key(k): + opts[k] = "" + w = uiwrap(ui) + ret = hg_commands.log(ui, repo, **opts) + if ret: + raise hg_util.Abort(ret) + return w.output() + +def hg_outgoing(ui, repo, **opts): + w = uiwrap(ui) + ret = hg_commands.outgoing(ui, repo, **opts) + if ret and ret != 1: + raise hg_util.Abort(ret) + return w.output() + +def hg_pull(ui, repo, **opts): + w = uiwrap(ui) + ui.quiet = False + ui.verbose = True # for file list + err = hg_commands.pull(ui, repo, **opts) + for line in w.output().split('\n'): + if isNoise(line): + continue + if line.startswith('moving '): + line = 'mv ' + line[len('moving '):] + if line.startswith('getting ') and line.find(' to ') >= 0: + line = 'mv ' + line[len('getting '):] + if line.startswith('getting '): + line = '+ ' + line[len('getting '):] + if line.startswith('removing '): + line = '- ' + line[len('removing '):] + ui.write(line + '\n') + return err + +def hg_push(ui, repo, **opts): + w = uiwrap(ui) + ui.quiet = False + ui.verbose = True + err = hg_commands.push(ui, repo, **opts) + for line in w.output().split('\n'): + if not isNoise(line): + ui.write(line + '\n') + return err + +def hg_commit(ui, repo, *pats, **opts): + return hg_commands.commit(ui, repo, *pats, **opts) + +####################################################################### +# Mercurial precommit hook to disable commit except through this interface. + +commit_okay = False + +def precommithook(ui, repo, **opts): + if commit_okay: + return False # False means okay. 
+ ui.write("\ncodereview extension enabled; use mail, upload, or submit instead of commit\n\n") + return True + +####################################################################### +# @clnumber file pattern support + +# We replace scmutil.match with the MatchAt wrapper to add the @clnumber pattern. + +match_repo = None +match_ui = None +match_orig = None + +def InstallMatch(ui, repo): + global match_repo + global match_ui + global match_orig + + match_ui = ui + match_repo = repo + + from mercurial import scmutil + match_orig = scmutil.match + scmutil.match = MatchAt + +def MatchAt(ctx, pats=None, opts=None, globbed=False, default='relpath'): + taken = [] + files = [] + pats = pats or [] + opts = opts or {} + + for p in pats: + if p.startswith('@'): + taken.append(p) + clname = p[1:] + if clname == "default": + files = DefaultFiles(match_ui, match_repo, []) + else: + if not GoodCLName(clname): + raise hg_util.Abort("invalid CL name " + clname) + cl, err = LoadCL(match_repo.ui, match_repo, clname, web=False) + if err != '': + raise hg_util.Abort("loading CL " + clname + ": " + err) + if not cl.files: + raise hg_util.Abort("no files in CL " + clname) + files = Add(files, cl.files) + pats = Sub(pats, taken) + ['path:'+f for f in files] + # work-around for http://selenic.com/hg/rev/785bbc8634f8 + if not hasattr(ctx, 'match'): + ctx = ctx[None] + return match_orig(ctx, pats=pats, opts=opts, globbed=globbed, default=default) + +####################################################################### +# Commands added by code review extension. + +# As of Mercurial 2.1 the commands are all required to return integer +# exit codes, whereas earlier versions allowed returning arbitrary strings +# to be printed as errors. We wrap the old functions to make sure we +# always return integer exit codes now. Otherwise Mercurial dies +# with a TypeError traceback (unsupported operand type(s) for &: 'str' and 'int'). 
+# Introduce a Python decorator to convert old functions to the new +# stricter convention. + +def hgcommand(f): + def wrapped(ui, repo, *pats, **opts): + err = f(ui, repo, *pats, **opts) + if type(err) is int: + return err + if not err: + return 0 + raise hg_util.Abort(err) + wrapped.__doc__ = f.__doc__ + return wrapped + +####################################################################### +# hg change + +@hgcommand def change(ui, repo, *pats, **opts): """create, edit or delete a change list @@ -1106,8 +1292,8 @@ def change(ui, repo, *pats, **opts): before running hg change -d 123456. """ - if missing_codereview: - return missing_codereview + if codereview_disabled: + return codereview_disabled dirty = {} if len(pats) > 0 and GoodCLName(pats[0]): @@ -1121,12 +1307,12 @@ def change(ui, repo, *pats, **opts): if not cl.local and (opts["stdin"] or not opts["stdout"]): return "cannot change non-local CL " + name else: - if repo[None].branch() != "default": - return "cannot run hg change outside default branch" name = "new" cl = CL("new") + if repo[None].branch() != "default": + return "cannot create CL outside default branch; switch with 'hg update default'" dirty[cl] = True - files = ChangedFiles(ui, repo, pats, opts, taken=Taken(ui, repo)) + files = ChangedFiles(ui, repo, pats, taken=Taken(ui, repo)) if opts["delete"] or opts["deletelocal"]: if opts["delete"] and opts["deletelocal"]: @@ -1194,17 +1380,26 @@ def change(ui, repo, *pats, **opts): ui.write("CL created: " + cl.url + "\n") return +####################################################################### +# hg code-login (broken?) + +@hgcommand def code_login(ui, repo, **opts): """log in to code review server Logs in to the code review server, saving a cookie in a file in your home directory. 
""" - if missing_codereview: - return missing_codereview + if codereview_disabled: + return codereview_disabled MySend(None) +####################################################################### +# hg clpatch / undo / release-apply / download +# All concerned with applying or unapplying patches to the repository. + +@hgcommand def clpatch(ui, repo, clname, **opts): """import a patch from the code review server @@ -1219,6 +1414,7 @@ def clpatch(ui, repo, clname, **opts): return "cannot run hg clpatch outside default branch" return clpatch_or_undo(ui, repo, clname, opts, mode="clpatch") +@hgcommand def undo(ui, repo, clname, **opts): """undo the effect of a CL @@ -1230,6 +1426,7 @@ def undo(ui, repo, clname, **opts): return "cannot run hg undo outside default branch" return clpatch_or_undo(ui, repo, clname, opts, mode="undo") +@hgcommand def release_apply(ui, repo, clname, **opts): """apply a CL to the release branch @@ -1274,16 +1471,16 @@ def release_apply(ui, repo, clname, **opts): return "no active release branches" if c.branch() != releaseBranch: if c.modified() or c.added() or c.removed(): - raise util.Abort("uncommitted local changes - cannot switch branches") - err = hg.clean(repo, releaseBranch) + raise hg_util.Abort("uncommitted local changes - cannot switch branches") + err = hg_clean(repo, releaseBranch) if err: return err try: err = clpatch_or_undo(ui, repo, clname, opts, mode="backport") if err: - raise util.Abort(err) + raise hg_util.Abort(err) except Exception, e: - hg.clean(repo, "default") + hg_clean(repo, "default") raise e return None @@ -1318,14 +1515,10 @@ backportFooter = """ # Implementation of clpatch/undo. def clpatch_or_undo(ui, repo, clname, opts, mode): - if missing_codereview: - return missing_codereview + if codereview_disabled: + return codereview_disabled if mode == "undo" or mode == "backport": - if hgversion < '1.4': - # Don't have cmdutil.match (see implementation of sync command). 
- return "hg is too old to run hg %s - update to 1.4 or newer" % mode - # Find revision in Mercurial repository. # Assume CL number is 7+ decimal digits. # Otherwise is either change log sequence number (fewer decimal digits), @@ -1334,11 +1527,8 @@ def clpatch_or_undo(ui, repo, clname, opts, mode): # sequence numbers get to be 7 digits long. if re.match('^[0-9]{7,}$', clname): found = False - matchfn = scmutil.match(repo, [], {'rev': None}) - def prep(ctx, fns): - pass - for ctx in cmdutil.walkchangerevs(repo, matchfn, {'rev': None}, prep): - rev = repo[ctx.rev()] + for r in hg_log(ui, repo, keyword="codereview.appspot.com/"+clname, limit=100, template="{node}\n").split(): + rev = repo[r] # Last line with a code review URL is the actual review URL. # Earlier ones might be part of the CL description. n = rev2clname(rev) @@ -1356,7 +1546,7 @@ def clpatch_or_undo(ui, repo, clname, opts, mode): return "cannot find CL name in revision description" # Create fresh CL and start with patch that would reverse the change. - vers = short(rev.node()) + vers = hg_node.short(rev.node()) cl = CL("new") desc = str(rev.description()) if mode == "undo": @@ -1364,7 +1554,7 @@ def clpatch_or_undo(ui, repo, clname, opts, mode): else: cl.desc = (backportHeader % (releaseBranch, line1(desc), clname, vers)) + desc + undoFooter v1 = vers - v0 = short(rev.parents()[0].node()) + v0 = hg_node.short(rev.parents()[0].node()) if mode == "undo": arg = v1 + ":" + v0 else: @@ -1382,7 +1572,7 @@ def clpatch_or_undo(ui, repo, clname, opts, mode): # find current hg version (hg identify) ctx = repo[None] parents = ctx.parents() - id = '+'.join([short(p.node()) for p in parents]) + id = '+'.join([hg_node.short(p.node()) for p in parents]) # if version does not match the patch version, # try to update the patch line numbers. 
@@ -1406,7 +1596,7 @@ def clpatch_or_undo(ui, repo, clname, opts, mode): try: cmd = subprocess.Popen(argv, shell=False, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=None, close_fds=sys.platform != "win32") except: - return "hgpatch: " + ExceptionDetail() + return "hgpatch: " + ExceptionDetail() + "\nInstall hgpatch with:\n$ go get code.google.com/p/go.codereview/cmd/hgpatch\n" out, err = cmd.communicate(patch) if cmd.returncode != 0 and not opts["ignore_hgpatch_failure"]: @@ -1415,7 +1605,7 @@ def clpatch_or_undo(ui, repo, clname, opts, mode): cl.files = out.strip().split() if not cl.files and not opts["ignore_hgpatch_failure"]: return "codereview issue %s has no changed files" % clname - files = ChangedFiles(ui, repo, [], opts) + files = ChangedFiles(ui, repo, []) extra = Sub(cl.files, files) if extra: ui.warn("warning: these files were listed in the patch but not changed:\n\t" + "\n\t".join(extra) + "\n") @@ -1489,14 +1679,15 @@ def lineDelta(deltas, n, len): d = newdelta return d, "" +@hgcommand def download(ui, repo, clname, **opts): """download a change from the code review server Download prints a description of the given change list followed by its diff, downloaded from the code review server. """ - if missing_codereview: - return missing_codereview + if codereview_disabled: + return codereview_disabled cl, vers, patch, err = DownloadCL(ui, repo, clname) if err != "": @@ -1505,6 +1696,10 @@ def download(ui, repo, clname, **opts): ui.write(patch + "\n") return +####################################################################### +# hg file + +@hgcommand def file(ui, repo, clname, pat, *pats, **opts): """assign files to or remove files from a change list @@ -1513,8 +1708,8 @@ def file(ui, repo, clname, pat, *pats, **opts): The -d option only removes files from the change list. It does not edit them or remove them from the repository. 
""" - if missing_codereview: - return missing_codereview + if codereview_disabled: + return codereview_disabled pats = tuple([pat] + list(pats)) if not GoodCLName(clname): @@ -1527,7 +1722,7 @@ def file(ui, repo, clname, pat, *pats, **opts): if not cl.local: return "cannot change non-local CL " + clname - files = ChangedFiles(ui, repo, pats, opts) + files = ChangedFiles(ui, repo, pats) if opts["delete"]: oldfiles = Intersect(files, cl.files) @@ -1567,17 +1762,21 @@ def file(ui, repo, clname, pat, *pats, **opts): d.Flush(ui, repo) return +####################################################################### +# hg gofmt + +@hgcommand def gofmt(ui, repo, *pats, **opts): """apply gofmt to modified files Applies gofmt to the modified files in the repository that match the given patterns. """ - if missing_codereview: - return missing_codereview + if codereview_disabled: + return codereview_disabled files = ChangedExistingFiles(ui, repo, pats, opts) - files = [f for f in files if f.endswith(".go")] + files = gofmt_required(files) if not files: return "no modified go files" cwd = os.getcwd() @@ -1587,21 +1786,28 @@ def gofmt(ui, repo, *pats, **opts): if not opts["list"]: cmd += ["-w"] if os.spawnvp(os.P_WAIT, "gofmt", cmd + files) != 0: - raise util.Abort("gofmt did not exit cleanly") - except error.Abort, e: + raise hg_util.Abort("gofmt did not exit cleanly") + except hg_error.Abort, e: raise except: - raise util.Abort("gofmt: " + ExceptionDetail()) + raise hg_util.Abort("gofmt: " + ExceptionDetail()) return +def gofmt_required(files): + return [f for f in files if (not f.startswith('test/') or f.startswith('test/bench/')) and f.endswith('.go')] + +####################################################################### +# hg mail + +@hgcommand def mail(ui, repo, *pats, **opts): """mail a change for review Uploads a patch to the code review server and then sends mail to the reviewer and CC list asking for a review. 
""" - if missing_codereview: - return missing_codereview + if codereview_disabled: + return codereview_disabled cl, err = CommandLineCL(ui, repo, pats, opts, defaultcc=defaultcc) if err != "": @@ -1623,80 +1829,74 @@ def mail(ui, repo, *pats, **opts): cl.Mail(ui, repo) +####################################################################### +# hg p / hg pq / hg ps / hg pending + +@hgcommand +def ps(ui, repo, *pats, **opts): + """alias for hg p --short + """ + opts['short'] = True + return pending(ui, repo, *pats, **opts) + +@hgcommand +def pq(ui, repo, *pats, **opts): + """alias for hg p --quick + """ + opts['quick'] = True + return pending(ui, repo, *pats, **opts) + +@hgcommand def pending(ui, repo, *pats, **opts): """show pending changes Lists pending changes followed by a list of unassigned but modified files. """ - if missing_codereview: - return missing_codereview + if codereview_disabled: + return codereview_disabled - m = LoadAllCL(ui, repo, web=True) + quick = opts.get('quick', False) + short = opts.get('short', False) + m = LoadAllCL(ui, repo, web=not quick and not short) names = m.keys() names.sort() for name in names: cl = m[name] - ui.write(cl.PendingText() + "\n") + if short: + ui.write(name + "\t" + line1(cl.desc) + "\n") + else: + ui.write(cl.PendingText(quick=quick) + "\n") - files = DefaultFiles(ui, repo, [], opts) + if short: + return + files = DefaultFiles(ui, repo, []) if len(files) > 0: s = "Changed files not in any CL:\n" for f in files: s += "\t" + f + "\n" ui.write(s) -def reposetup(ui, repo): - global original_match - if original_match is None: - global global_repo, global_ui - global_repo = repo - global_ui = ui - start_status_thread() - original_match = scmutil.match - scmutil.match = ReplacementForCmdutilMatch - RietveldSetup(ui, repo) - -def CheckContributor(ui, repo, user=None): - set_status("checking CONTRIBUTORS file") - user, userline = FindContributor(ui, repo, user, warn=False) - if not userline: - raise util.Abort("cannot find %s 
in CONTRIBUTORS" % (user,)) - return userline - -def FindContributor(ui, repo, user=None, warn=True): - if not user: - user = ui.config("ui", "username") - if not user: - raise util.Abort("[ui] username is not configured in .hgrc") - user = user.lower() - m = re.match(r".*<(.*)>", user) - if m: - user = m.group(1) +####################################################################### +# hg submit - if user not in contributors: - if warn: - ui.warn("warning: cannot find %s in CONTRIBUTORS\n" % (user,)) - return user, None - - user, email = contributors[user] - return email, "%s <%s>" % (user, email) +def need_sync(): + raise hg_util.Abort("local repository out of date; must sync before submit") +@hgcommand def submit(ui, repo, *pats, **opts): """submit change to remote repository Submits change to remote repository. Bails out if the local repository is not in sync with the remote one. """ - if missing_codereview: - return missing_codereview + if codereview_disabled: + return codereview_disabled # We already called this on startup but sometimes Mercurial forgets. 
set_mercurial_encoding_to_utf8() - other = getremote(ui, repo, opts) - repo.ui.quiet = True - if not opts["no_incoming"] and incoming(repo, other): - return "local repository out of date; must sync before submit" + if not opts["no_incoming"] and hg_incoming(ui, repo): + need_sync() cl, err = CommandLineCL(ui, repo, pats, opts, defaultcc=defaultcc) if err != "": @@ -1742,64 +1942,59 @@ def submit(ui, repo, *pats, **opts): cl.Mail(ui, repo) # submit changes locally - date = opts.get('date') - if date: - opts['date'] = util.parsedate(date) - typecheck(opts['date'], str) - opts['message'] = cl.desc.rstrip() + "\n\n" + about - typecheck(opts['message'], str) - - if opts['dryrun']: - print "NOT SUBMITTING:" - print "User: ", userline - print "Message:" - print Indent(opts['message'], "\t") - print "Files:" - print Indent('\n'.join(cl.files), "\t") - return "dry run; not submitted" + message = cl.desc.rstrip() + "\n\n" + about + typecheck(message, str) set_status("pushing " + cl.name + " to remote server") - other = getremote(ui, repo, opts) - if outgoing(repo): - raise util.Abort("local repository corrupt or out-of-phase with remote: found outgoing changes") + if hg_outgoing(ui, repo): + raise hg_util.Abort("local repository corrupt or out-of-phase with remote: found outgoing changes") + + old_heads = len(hg_heads(ui, repo).split()) - m = match.exact(repo.root, repo.getcwd(), cl.files) - node = repo.commit(ustr(opts['message']), ustr(userline), opts.get('date'), m) - if not node: + global commit_okay + commit_okay = True + ret = hg_commit(ui, repo, *['path:'+f for f in cl.files], message=message, user=userline) + commit_okay = False + if ret: return "nothing changed" - + node = repo["-1"].node() # push to remote; if it fails for any reason, roll back try: - log = repo.changelog - rev = log.rev(node) - parents = log.parentrevs(rev) - if (rev-1 not in parents and - (parents == (nullrev, nullrev) or - len(log.heads(log.node(parents[0]))) > 1 and - (parents[1] == nullrev or 
len(log.heads(log.node(parents[1]))) > 1))): - # created new head - raise util.Abort("local repository out of date; must sync before submit") - - # push changes to remote. - # if it works, we're committed. - # if not, roll back - r = repo.push(other, False, None) - if r == 0: - raise util.Abort("local repository out of date; must sync before submit") + new_heads = len(hg_heads(ui, repo).split()) + if old_heads != new_heads and not (old_heads == 0 and new_heads == 1): + # Created new head, so we weren't up to date. + need_sync() + + # Push changes to remote. If it works, we're committed. If not, roll back. + try: + hg_push(ui, repo) + except hg_error.Abort, e: + if e.message.find("push creates new heads") >= 0: + # Remote repository had changes we missed. + need_sync() + raise except: real_rollback() raise - # we're committed. upload final patch, close review, add commit message - changeURL = short(node) - url = other.url() - m = re.match("^https?://([^@/]+@)?([^.]+)\.googlecode\.com/hg/?", url) + # We're committed. Upload final patch, close review, add commit message. 
+ changeURL = hg_node.short(node) + url = ui.expandpath("default") + m = re.match("(^https?://([^@/]+@)?([^.]+)\.googlecode\.com/hg/?)" + "|" + + "(^https?://([^@/]+@)?code\.google\.com/p/([^/.]+)(\.[^./]+)?/?)", url) if m: - changeURL = "http://code.google.com/p/%s/source/detail?r=%s" % (m.group(2), changeURL) + if m.group(1): # prj.googlecode.com/hg/ case + changeURL = "http://code.google.com/p/%s/source/detail?r=%s" % (m.group(3), changeURL) + elif m.group(4) and m.group(7): # code.google.com/p/prj.subrepo/ case + changeURL = "http://code.google.com/p/%s/source/detail?r=%s&repo=%s" % (m.group(6), changeURL, m.group(7)[1:]) + elif m.group(4): # code.google.com/p/prj/ case + changeURL = "http://code.google.com/p/%s/source/detail?r=%s" % (m.group(6), changeURL) + else: + print >>sys.stderr, "URL: ", url else: print >>sys.stderr, "URL: ", url - pmsg = "*** Submitted as " + changeURL + " ***\n\n" + opts['message'] + pmsg = "*** Submitted as " + changeURL + " ***\n\n" + message # When posting, move reviewers to CC line, # so that the issue stops showing up in their "My Issues" page. @@ -1808,53 +2003,39 @@ def submit(ui, repo, *pats, **opts): if not cl.copied_from: EditDesc(cl.name, closed=True, private=cl.private) cl.Delete(ui, repo) - + c = repo[None] if c.branch() == releaseBranch and not c.modified() and not c.added() and not c.removed(): ui.write("switching from %s to default branch.\n" % releaseBranch) - err = hg.clean(repo, "default") + err = hg_clean(repo, "default") if err: return err return None +####################################################################### +# hg sync + +@hgcommand def sync(ui, repo, **opts): """synchronize with remote repository Incorporates recent changes from the remote repository into the local repository. 
""" - if missing_codereview: - return missing_codereview + if codereview_disabled: + return codereview_disabled if not opts["local"]: - ui.status = sync_note - ui.note = sync_note - other = getremote(ui, repo, opts) - modheads = repo.pull(other) - err = commands.postincoming(ui, repo, modheads, True, "tip") + err = hg_pull(ui, repo, update=True) if err: return err - commands.update(ui, repo, rev="default") sync_changes(ui, repo) -def sync_note(msg): - # we run sync (pull -u) in verbose mode to get the - # list of files being updated, but that drags along - # a bunch of messages we don't care about. - # omit them. - if msg == 'resolving manifests\n': - return - if msg == 'searching for changes\n': - return - if msg == "couldn't find merge tool hgmerge\n": - return - sys.stdout.write(msg) - def sync_changes(ui, repo): # Look through recent change log descriptions to find # potential references to http://.*/our-CL-number. # Double-check them by looking at the Rietveld log. - def Rev(rev): + for rev in hg_log(ui, repo, limit=100, template="{node}\n").split(): desc = repo[rev].description().strip() for clname in re.findall('(?m)^http://(?:[^\n]+)/([0-9]+)$', desc): if IsLocalCL(ui, repo, clname) and IsRietveldSubmitted(ui, clname, repo[rev].hex()): @@ -1867,28 +2048,10 @@ def sync_changes(ui, repo): EditDesc(cl.name, closed=True, private=cl.private) cl.Delete(ui, repo) - if hgversion < '1.4': - get = util.cachefunc(lambda r: repo[r].changeset()) - changeiter, matchfn = cmdutil.walkchangerevs(ui, repo, [], get, {'rev': None}) - n = 0 - for st, rev, fns in changeiter: - if st != 'iter': - continue - n += 1 - if n > 100: - break - Rev(rev) - else: - matchfn = scmutil.match(repo, [], {'rev': None}) - def prep(ctx, fns): - pass - for ctx in cmdutil.walkchangerevs(repo, matchfn, {'rev': None}, prep): - Rev(ctx.rev()) - # Remove files that are not modified from the CLs in which they appear. 
all = LoadAllCL(ui, repo, web=False) - changed = ChangedFiles(ui, repo, [], {}) - for _, cl in all.items(): + changed = ChangedFiles(ui, repo, []) + for cl in all.values(): extra = Sub(cl.files, changed) if extra: ui.warn("Removing unmodified files from CL %s:\n" % (cl.name,)) @@ -1903,13 +2066,17 @@ def sync_changes(ui, repo): ui.warn("CL %s has no files; delete locally with hg change -D %s\n" % (cl.name, cl.name)) return +####################################################################### +# hg upload + +@hgcommand def upload(ui, repo, name, **opts): """upload diffs to the code review server Uploads the current modifications for a given change to the server. """ - if missing_codereview: - return missing_codereview + if codereview_disabled: + return codereview_disabled repo.ui.quiet = True cl, err = LoadCL(ui, repo, name, web=True) @@ -1921,6 +2088,9 @@ def upload(ui, repo, name, **opts): print "%s%s\n" % (server_url_base, cl.name) return +####################################################################### +# Table of commands, supplied to Mercurial for installation. 
+ review_opts = [ ('r', 'reviewer', '', 'add reviewer'), ('', 'cc', '', 'add cc'), @@ -1979,13 +2149,26 @@ cmdtable = { ), "^pending|p": ( pending, + [ + ('s', 'short', False, 'show short result form'), + ('', 'quick', False, 'do not consult codereview server'), + ], + "[FILE ...]" + ), + "^ps": ( + ps, + [], + "[FILE ...]" + ), + "^pq": ( + pq, [], "[FILE ...]" ), "^mail": ( mail, review_opts + [ - ] + commands.walkopts, + ] + hg_commands.walkopts, "[-r reviewer] [--cc cc] [change# | file ...]" ), "^release-apply": ( @@ -2001,8 +2184,7 @@ cmdtable = { submit, review_opts + [ ('', 'no_incoming', None, 'disable initial incoming check (for testing)'), - ('n', 'dryrun', None, 'make change only locally (for testing)'), - ] + commands.walkopts + commands.commitopts + commands.commitopts2, + ] + hg_commands.walkopts + hg_commands.commitopts + hg_commands.commitopts2, "[-r reviewer] [--cc cc] [change# | file ...]" ), "^sync": ( @@ -2027,10 +2209,77 @@ cmdtable = { ), } +####################################################################### +# Mercurial extension initialization + +def norollback(*pats, **opts): + """(disabled when using this extension)""" + raise hg_util.Abort("codereview extension enabled; use undo instead of rollback") + +codereview_init = False + +def reposetup(ui, repo): + global codereview_disabled + global defaultcc + + # reposetup gets called both for the local repository + # and also for any repository we are pulling or pushing to. + # Only initialize the first time. + global codereview_init + if codereview_init: + return + codereview_init = True + + # Read repository-specific options from lib/codereview/codereview.cfg or codereview.cfg. + root = '' + try: + root = repo.root + except: + # Yes, repo might not have root; see issue 959. 
+ codereview_disabled = 'codereview disabled: repository has no root' + return + + repo_config_path = '' + p1 = root + '/lib/codereview/codereview.cfg' + p2 = root + '/codereview.cfg' + if os.access(p1, os.F_OK): + repo_config_path = p1 + else: + repo_config_path = p2 + try: + f = open(repo_config_path) + for line in f: + if line.startswith('defaultcc:'): + defaultcc = SplitCommaSpace(line[len('defaultcc:'):]) + if line.startswith('contributors:'): + global contributorsURL + contributorsURL = line[len('contributors:'):].strip() + except: + codereview_disabled = 'codereview disabled: cannot open ' + repo_config_path + return + + remote = ui.config("paths", "default", "") + if remote.find("://") < 0: + raise hg_util.Abort("codereview: default path '%s' is not a URL" % (remote,)) + + InstallMatch(ui, repo) + RietveldSetup(ui, repo) + + # Disable the Mercurial commands that might change the repository. + # Only commands in this extension are supposed to do that. + ui.setconfig("hooks", "precommit.codereview", precommithook) + + # Rollback removes an existing commit. Don't do that either. 
+ global real_rollback + real_rollback = repo.rollback + repo.rollback = norollback + ####################################################################### # Wrappers around upload.py for interacting with Rietveld +from HTMLParser import HTMLParser + # HTML form parser class FormParser(HTMLParser): def __init__(self): @@ -2106,7 +2355,7 @@ def fix_json(x): for k in todel: del x[k] else: - raise util.Abort("unknown type " + str(type(x)) + " in fix_json") + raise hg_util.Abort("unknown type " + str(type(x)) + " in fix_json") if type(x) is str: x = x.replace('\r\n', '\n') return x @@ -2309,68 +2558,13 @@ def PostMessage(ui, issue, message, reviewers=None, cc=None, send_mail=True, sub class opt(object): pass -def nocommit(*pats, **opts): - """(disabled when using this extension)""" - raise util.Abort("codereview extension enabled; use mail, upload, or submit instead of commit") - -def nobackout(*pats, **opts): - """(disabled when using this extension)""" - raise util.Abort("codereview extension enabled; use undo instead of backout") - -def norollback(*pats, **opts): - """(disabled when using this extension)""" - raise util.Abort("codereview extension enabled; use undo instead of rollback") - def RietveldSetup(ui, repo): - global defaultcc, upload_options, rpc, server, server_url_base, force_google_account, verbosity, contributors - global missing_codereview - - repo_config_path = '' - # Read repository-specific options from lib/codereview/codereview.cfg - try: - repo_config_path = repo.root + '/lib/codereview/codereview.cfg' - f = open(repo_config_path) - for line in f: - if line.startswith('defaultcc: '): - defaultcc = SplitCommaSpace(line[10:]) - except: - # If there are no options, chances are good this is not - # a code review repository; stop now before we foul - # things up even worse. Might also be that repo doesn't - # even have a root. See issue 959. 
- if repo_config_path == '': - missing_codereview = 'codereview disabled: repository has no root' - else: - missing_codereview = 'codereview disabled: cannot open ' + repo_config_path - return - - # Should only modify repository with hg submit. - # Disable the built-in Mercurial commands that might - # trip things up. - cmdutil.commit = nocommit - global real_rollback - real_rollback = repo.rollback - repo.rollback = norollback - # would install nobackout if we could; oh well - - try: - f = open(repo.root + '/CONTRIBUTORS', 'r') - except: - raise util.Abort("cannot open %s: %s" % (repo.root+'/CONTRIBUTORS', ExceptionDetail())) - for line in f: - # CONTRIBUTORS is a list of lines like: - # Person - # Person - # The first email address is the one used in commit logs. - if line.startswith('#'): - continue - m = re.match(r"([^<>]+\S)\s+(<[^<>\s]+>)((\s+<[^<>\s]+>)*)\s*$", line) - if m: - name = m.group(1) - email = m.group(2)[1:-1] - contributors[email.lower()] = (name, email) - for extra in m.group(3).split(): - contributors[extra[1:-1].lower()] = (name, email) + global force_google_account + global rpc + global server + global server_url_base + global upload_options + global verbosity if not ui.verbose: verbosity = 0 @@ -2416,15 +2610,14 @@ def RietveldSetup(ui, repo): global releaseBranch tags = repo.branchtags().keys() - if 'release-branch.r100' in tags: + if 'release-branch.go10' in tags: # NOTE(rsc): This tags.sort is going to get the wrong - # answer when comparing release-branch.r99 with - # release-branch.r100. If we do ten releases a year - # that gives us 4 years before we have to worry about this. - raise util.Abort('tags.sort needs to be fixed for release-branch.r100') + # answer when comparing release-branch.go9 with + # release-branch.go10. It will be a while before we care. 
+ raise hg_util.Abort('tags.sort needs to be fixed for release-branch.go10') tags.sort() for t in tags: - if t.startswith('release-branch.'): + if t.startswith('release-branch.go'): releaseBranch = t ####################################################################### @@ -3030,7 +3223,7 @@ class VersionControlSystem(object): unused, filename = line.split(':', 1) # On Windows if a file has property changes its filename uses '\' # instead of '/'. - filename = filename.strip().replace('\\', '/') + filename = to_slash(filename.strip()) files[filename] = self.GetBaseFile(filename) return files @@ -3150,6 +3343,10 @@ class FakeMercurialUI(object): return self def status(self, *args, **opts): pass + + def formatter(self, topic, opts): + from mercurial.formatter import plainformatter + return plainformatter(self, topic, opts) def readconfig(self, *args, **opts): pass @@ -3183,7 +3380,11 @@ class MercurialVCS(VersionControlSystem): if not err and mqparent != "": self.base_rev = mqparent else: - self.base_rev = RunShell(["hg", "parents", "-q"]).split(':')[1].strip() + out = RunShell(["hg", "parents", "-q"], silent_ok=True).strip() + if not out: + # No revisions; use 0 to mean a repository with nothing. 
+ out = "0:0" + self.base_rev = out.split(':')[1].strip() def _GetRelPath(self, filename): """Get relative path of a file according to the current directory, given its logical path in the repo.""" @@ -3238,9 +3439,9 @@ class MercurialVCS(VersionControlSystem): out = RunShell(["hg", "status", "-C", "--rev", rev]) else: fui = FakeMercurialUI() - ret = commands.status(fui, self.repo, *[], **{'rev': [rev], 'copies': True}) + ret = hg_commands.status(fui, self.repo, *[], **{'rev': [rev], 'copies': True}) if ret: - raise util.Abort(ret) + raise hg_util.Abort(ret) out = fui.output self.status = out.splitlines() for i in range(len(self.status)): @@ -3248,12 +3449,12 @@ class MercurialVCS(VersionControlSystem): # A path # M path # etc - line = self.status[i].replace('\\', '/') + line = to_slash(self.status[i]) if line[2:] == path: if i+1 < len(self.status) and self.status[i+1][:2] == ' ': return self.status[i:i+2] return self.status[i:i+1] - raise util.Abort("no status for " + path) + raise hg_util.Abort("no status for " + path) def GetBaseFile(self, filename): set_status("inspecting " + filename) @@ -3315,7 +3516,7 @@ def SplitPatch(data): # When a file is modified, paths use '/' between directories, however # when a property is modified '\' is used on Windows. Make them the same # otherwise the file shows up twice. - temp_filename = temp_filename.strip().replace('\\', '/') + temp_filename = to_slash(temp_filename.strip()) if temp_filename != filename: # File has property changes but no modifications, create a new diff. new_filename = temp_filename diff --git a/re2/compile.cc b/re2/compile.cc index 67c4c2c..9cddb71 100644 --- a/re2/compile.cc +++ b/re2/compile.cc @@ -44,7 +44,7 @@ struct PatchList { static PatchList Append(Prog::Inst *inst0, PatchList l1, PatchList l2); }; -static PatchList nullPatchList = { 0 }; +static PatchList nullPatchList; // Returns patch list containing just p. 
PatchList PatchList::Mk(uint32 p) { @@ -106,11 +106,12 @@ struct Frag { uint32 begin; PatchList end; + explicit Frag(LinkerInitialized) {} Frag() : begin(0) { end.p = 0; } // needed so Frag can go in vector Frag(uint32 begin, PatchList end) : begin(begin), end(end) {} }; -static Frag kNullFrag; +static Frag kNullFrag(LINKER_INITIALIZED); // Input encodings. enum Encoding { @@ -458,7 +459,7 @@ Frag Compiler::Capture(Frag a, int n) { // A Rune is a name for a Unicode code point. // Returns maximum rune encoded by UTF-8 sequence of length len. static int MaxRune(int len) { - int b; // number of Rune blents lenn len-byte UTF-8 sequence (len < UTFmax) + int b; // number of Rune bits in len-byte UTF-8 sequence (len < UTFmax) if (len == 1) b = 7; else @@ -588,7 +589,7 @@ static struct ByteRangeProg { }; void Compiler::Add_80_10ffff() { - int inst[arraysize(prog_80_10ffff)]; + int inst[arraysize(prog_80_10ffff)] = { 0 }; // does not need to be initialized; silences gcc warning for (int i = 0; i < arraysize(prog_80_10ffff); i++) { const ByteRangeProg& p = prog_80_10ffff[i]; int next = 0; @@ -732,7 +733,7 @@ Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags, Frag f = Match(re->match_id()); // Remember unanchored match to end of string. if (anchor_ != RE2::ANCHOR_BOTH) - f = Cat(DotStar(), f); + f = Cat(DotStar(), Cat(EmptyWidth(kEmptyEndText), f)); return f; } diff --git a/re2/dfa.cc b/re2/dfa.cc index 7d206fb..2556c0f 100644 --- a/re2/dfa.cc +++ b/re2/dfa.cc @@ -115,6 +115,7 @@ class DFA { kFlagNeedShift = 16, // needed kEmpty bits are or'ed in shifted left }; +#ifndef STL_MSVC // STL function structures for use with unordered_set. 
struct StateEqual { bool operator()(const State* a, const State* b) const { @@ -132,6 +133,7 @@ class DFA { return true; // they're equal } }; +#endif // STL_MSVC struct StateHash { size_t operator()(const State* a) const { if (a == NULL) @@ -143,9 +145,34 @@ class DFA { else return Hash64StringWithSeed(s, len, a->flag_); } +#ifdef STL_MSVC + // Less than operator. + bool operator()(const State* a, const State* b) const { + if (a == b) + return false; + if (a == NULL || b == NULL) + return a == NULL; + if (a->ninst_ != b->ninst_) + return a->ninst_ < b->ninst_; + if (a->flag_ != b->flag_) + return a->flag_ < b->flag_; + for (int i = 0; i < a->ninst_; ++i) + if (a->inst_[i] != b->inst_[i]) + return a->inst_[i] < b->inst_[i]; + return false; // they're equal + } + // The two public members are required by msvc. 4 and 8 are default values. + // Reference: http://msdn.microsoft.com/en-us/library/1s1byw77.aspx + static const size_t bucket_size = 4; + static const size_t min_buckets = 8; +#endif // STL_MSVC }; +#ifdef STL_MSVC + typedef unordered_set StateSet; +#else // !STL_MSVC typedef unordered_set StateSet; +#endif // STL_MSVC private: @@ -441,8 +468,8 @@ DFA::DFA(Prog* prog, Prog::MatchKind kind, int64 max_mem) // At minimum, the search requires room for two states in order // to limp along, restarting frequently. We'll get better performance // if there is room for a larger number of states, say 20. - int one_state = sizeof(State) + (prog_->size()+nmark)*sizeof(int) + - (prog_->bytemap_range()+1)*sizeof(State*); + int64 one_state = sizeof(State) + (prog_->size()+nmark)*sizeof(int) + + (prog_->bytemap_range()+1)*sizeof(State*); if (state_budget_ < 20*one_state) { LOG(INFO) << StringPrintf("DFA out of memory: prog size %lld mem %lld", prog_->size(), max_mem); @@ -962,8 +989,10 @@ DFA::State* DFA::RunStateOnByte(State* state, int c) { // If someone else already computed this, return it. 
MaybeReadMemoryBarrier(); // On alpha we need to ensure read ordering - if (state->next_[ByteMap(c)]) - return state->next_[ByteMap(c)]; + State* ns = state->next_[ByteMap(c)]; + ANNOTATE_HAPPENS_AFTER(ns); + if (ns != NULL) + return ns; // Convert state into Workq. StateToWorkq(state, q0_); @@ -1006,7 +1035,17 @@ DFA::State* DFA::RunStateOnByte(State* state, int c) { } bool ismatch = false; RunWorkqOnByte(q0_, q1_, c, afterflag, &ismatch, kind_, start_unanchored_); - swap(q0_, q1_); + + // Most of the time, we build the state from the output of + // RunWorkqOnByte, so swap q0_ and q1_ here. However, so that + // RE2::Set can tell exactly which match instructions + // contributed to the match, don't swap if c is kByteEndText. + // The resulting state wouldn't be correct for further processing + // of the string, but we're at the end of the text so that's okay. + // Leaving q0_ alone preseves the match instructions that led to + // the current setting of ismatch. + if (c != kByteEndText || kind_ != Prog::kManyMatch) + swap(q0_, q1_); // Save afterflag along with ismatch and isword in new state. uint flag = afterflag; @@ -1015,7 +1054,7 @@ DFA::State* DFA::RunStateOnByte(State* state, int c) { if (isword) flag |= kFlagLastWord; - State* ns = WorkqToCachedState(q0_, flag); + ns = WorkqToCachedState(q0_, flag); // Write barrier before updating state->next_ so that the // main search loop can proceed without any locking, for speed. @@ -1024,9 +1063,9 @@ DFA::State* DFA::RunStateOnByte(State* state, int c) { // a) the access to next_ should be ignored, // b) 'ns' is properly published. WriteMemoryBarrier(); // Flush ns before linking to it. 
- ANNOTATE_PUBLISH_MEMORY_RANGE(ns, sizeof(*ns)); ANNOTATE_IGNORE_WRITES_BEGIN(); + ANNOTATE_HAPPENS_BEFORE(ns); state->next_[ByteMap(c)] = ns; ANNOTATE_IGNORE_WRITES_END(); return ns; @@ -1351,6 +1390,7 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params, MaybeReadMemoryBarrier(); // On alpha we need to ensure read ordering State* ns = s->next_[bytemap[c]]; + ANNOTATE_HAPPENS_AFTER(ns); if (ns == NULL) { ns = RunStateOnByteUnlocked(s, c); if (ns == NULL) { @@ -1422,20 +1462,6 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params, } } - // Peek in state to see if a match is coming up. - if (params->matches && kind_ == Prog::kManyMatch) { - vector* v = params->matches; - v->clear(); - if (s > SpecialStateMax) { - for (int i = 0; i < s->ninst_; i++) { - Prog::Inst* ip = prog_->inst(s->inst_[i]); - if (ip->opcode() == kInstMatch) - v->push_back(ip->match_id()); - } - } - } - - // Process one more byte to see if it triggers a match. // (Remember, matches are delayed one byte.) int lastbyte; @@ -1453,6 +1479,7 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params, MaybeReadMemoryBarrier(); // On alpha we need to ensure read ordering State* ns = s->next_[ByteMap(lastbyte)]; + ANNOTATE_HAPPENS_AFTER(ns); if (ns == NULL) { ns = RunStateOnByteUnlocked(s, lastbyte); if (ns == NULL) { @@ -1480,6 +1507,15 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params, if (s > SpecialStateMax && s->IsMatch()) { matched = true; lastmatch = p; + if (params->matches && kind_ == Prog::kManyMatch) { + vector* v = params->matches; + v->clear(); + for (int i = 0; i < s->ninst_; i++) { + Prog::Inst* ip = prog_->inst(s->inst_[i]); + if (ip->opcode() == kInstMatch) + v->push_back(ip->match_id()); + } + } if (DebugDFA) fprintf(stderr, "match @%d! 
[%s]\n", static_cast(lastmatch - bp), DumpState(s).c_str()); @@ -1637,7 +1673,7 @@ bool DFA::AnalyzeSearch(SearchParams* params) { DumpState(info->start).c_str(), info->firstbyte); params->start = info->start; - params->firstbyte = info->firstbyte; + params->firstbyte = ANNOTATE_UNPROTECTED_READ(info->firstbyte); return true; } @@ -1646,12 +1682,16 @@ bool DFA::AnalyzeSearch(SearchParams* params) { bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info, uint flags) { // Quick check; okay because of memory barriers below. - if (info->firstbyte != kFbUnknown) + if (ANNOTATE_UNPROTECTED_READ(info->firstbyte) != kFbUnknown) { + ANNOTATE_HAPPENS_AFTER(&info->firstbyte); return true; + } MutexLock l(&mutex_); - if (info->firstbyte != kFbUnknown) + if (info->firstbyte != kFbUnknown) { + ANNOTATE_HAPPENS_AFTER(&info->firstbyte); return true; + } q0_->clear(); AddToQueue(q0_, @@ -1662,12 +1702,14 @@ bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info, return false; if (info->start == DeadState) { + ANNOTATE_HAPPENS_BEFORE(&info->firstbyte); WriteMemoryBarrier(); // Synchronize with "quick check" above. info->firstbyte = kFbNone; return true; } if (info->start == FullMatchState) { + ANNOTATE_HAPPENS_BEFORE(&info->firstbyte); WriteMemoryBarrier(); // Synchronize with "quick check" above. info->firstbyte = kFbNone; // will be ignored return true; @@ -1680,6 +1722,7 @@ bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info, for (int i = 0; i < 256; i++) { State* s = RunStateOnByte(info->start, i); if (s == NULL) { + ANNOTATE_HAPPENS_BEFORE(&info->firstbyte); WriteMemoryBarrier(); // Synchronize with "quick check" above. info->firstbyte = firstbyte; return false; @@ -1694,6 +1737,7 @@ bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info, break; } } + ANNOTATE_HAPPENS_BEFORE(&info->firstbyte); WriteMemoryBarrier(); // Synchronize with "quick check" above. 
info->firstbyte = firstbyte; return true; @@ -1734,7 +1778,7 @@ bool DFA::Search(const StringPiece& text, return false; } if (params.start == DeadState) - return NULL; + return false; if (params.start == FullMatchState) { if (run_forward == want_earliest_match) *epp = text.begin(); @@ -1776,7 +1820,7 @@ DFA* Prog::GetDFA(MatchKind kind) { } // Quick check; okay because of memory barrier below. - DFA *dfa = *pdfa; + DFA *dfa = ANNOTATE_UNPROTECTED_READ(*pdfa); if (dfa != NULL) { ANNOTATE_HAPPENS_AFTER(dfa); return dfa; @@ -1784,8 +1828,10 @@ DFA* Prog::GetDFA(MatchKind kind) { MutexLock l(&dfa_mutex_); dfa = *pdfa; - if (dfa != NULL) + if (dfa != NULL) { + ANNOTATE_HAPPENS_AFTER(dfa); return dfa; + } // For a forward DFA, half the memory goes to each DFA. // For a reverse DFA, all the memory goes to the diff --git a/re2/filtered_re2.cc b/re2/filtered_re2.cc index 9269cee..f576258 100644 --- a/re2/filtered_re2.cc +++ b/re2/filtered_re2.cc @@ -27,8 +27,10 @@ RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern, RE2::ErrorCode code = re->error_code(); if (!re->ok()) { - LOG(ERROR) << "Couldn't compile regular expression, skipping: " - << re << " due to error " << re->error(); + if (options.log_errors()) { + LOG(ERROR) << "Couldn't compile regular expression, skipping: " + << re << " due to error " << re->error(); + } delete re; } else { *id = re2_vec_.size(); diff --git a/re2/nfa.cc b/re2/nfa.cc index 61a4ecf..8c4f761 100644 --- a/re2/nfa.cc +++ b/re2/nfa.cc @@ -84,7 +84,7 @@ class NFA { inline Thread* AllocThread(); inline void FreeThread(Thread*); - // Add r (or its children, following unlabeled arrows) + // Add id (or its children, following unlabeled arrows) // to the workqueue q with associated capture info. void AddToThreadq(Threadq* q, int id, int flag, const char* p, const char** capture); @@ -179,7 +179,7 @@ void NFA::CopyCapture(const char** dst, const char** src) { } } -// Follows all empty arrows from r and enqueues all the states reached. 
+// Follows all empty arrows from id0 and enqueues all the states reached. // The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match. // The pointer p is the current input position, and m is the // current set of match boundaries. @@ -214,7 +214,7 @@ void NFA::AddToThreadq(Threadq* q, int id0, int flag, // Create entry in q no matter what. We might fill it in below, // or we might not. Even if not, it is necessary to have it, - // so that we don't revisit r during the recursion. + // so that we don't revisit id0 during the recursion. q->set_new(id, NULL); Thread** tp = &q->find(id)->second; diff --git a/re2/parse.cc b/re2/parse.cc index 4f4ef89..0cf4ab4 100644 --- a/re2/parse.cc +++ b/re2/parse.cc @@ -16,7 +16,6 @@ // and recognizes the Perl escape sequences \d, \s, \w, \D, \S, and \W. // See regexp.h for rationale. -#include #include "util/util.h" #include "re2/regexp.h" #include "re2/stringpiece.h" @@ -1454,6 +1453,13 @@ static void AddUGroup(CharClassBuilder *cc, UGroup *g, int sign, // to what's already missing. Too hard, so do in two steps. CharClassBuilder ccb1; AddUGroup(&ccb1, g, +1, parse_flags); + // If the flags say to take out \n, put it in, so that negating will take it out. + // Normally AddRangeFlags does this, but we're bypassing AddRangeFlags. 
+ bool cutnl = !(parse_flags & Regexp::ClassNL) || + (parse_flags & Regexp::NeverNL); + if (cutnl) { + ccb1.AddRange('\n', '\n'); + } ccb1.Negate(); cc->AddCharClass(&ccb1); return; @@ -1996,8 +2002,13 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, return NULL; break; } - if (!ps.DoLeftParen(NULL)) - return NULL; + if (ps.flags() & NeverCapture) { + if (!ps.DoLeftParenNoCapture()) + return NULL; + } else { + if (!ps.DoLeftParen(NULL)) + return NULL; + } t.remove_prefix(1); // '(' break; diff --git a/re2/prefilter.cc b/re2/prefilter.cc index 30e4c01..4b9c35d 100644 --- a/re2/prefilter.cc +++ b/re2/prefilter.cc @@ -181,6 +181,12 @@ static Rune ToLowerRune(Rune r) { return ApplyFold(f, r); } +static Rune ToLowerRuneLatin1(Rune r) { + if ('A' <= r && r <= 'Z') + r += 'a' - 'A'; + return r; +} + Prefilter* Prefilter::FromString(const string& str) { Prefilter* m = new Prefilter(Prefilter::ATOM); m->atom_ = str; @@ -205,8 +211,9 @@ class Prefilter::Info { static Info* EmptyString(); static Info* NoMatch(); static Info* AnyChar(); - static Info* CClass(CharClass* cc); + static Info* CClass(CharClass* cc, bool latin1); static Info* Literal(Rune r); + static Info* LiteralLatin1(Rune r); static Info* AnyMatch(); // Format Info as a string. @@ -390,6 +397,11 @@ static string RuneToString(Rune r) { return string(buf, n); } +static string RuneToStringLatin1(Rune r) { + char c = r & 0xff; + return string(&c, 1); +} + // Constructs Info for literal rune. Prefilter::Info* Prefilter::Info::Literal(Rune r) { Info* info = new Info(); @@ -398,6 +410,14 @@ Prefilter::Info* Prefilter::Info::Literal(Rune r) { return info; } +// Constructs Info for literal rune for Latin1 encoded string. +Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) { + Info* info = new Info(); + info->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r))); + info->is_exact_ = true; + return info; +} + // Constructs Info for dot (any character). 
Prefilter::Info* Prefilter::Info::AnyChar() { Prefilter::Info* info = new Prefilter::Info(); @@ -432,7 +452,8 @@ Prefilter::Info* Prefilter::Info::EmptyString() { // Constructs Prefilter::Info for a character class. typedef CharClass::iterator CCIter; -Prefilter::Info* Prefilter::Info::CClass(CharClass *cc) { +Prefilter::Info* Prefilter::Info::CClass(CharClass *cc, + bool latin1) { if (Trace) { VLOG(0) << "CharClassInfo:"; for (CCIter i = cc->begin(); i != cc->end(); ++i) @@ -445,8 +466,14 @@ Prefilter::Info* Prefilter::Info::CClass(CharClass *cc) { Prefilter::Info *a = new Prefilter::Info(); for (CCIter i = cc->begin(); i != cc->end(); ++i) - for (Rune r = i->lo; r <= i->hi; r++) - a->exact_.insert(RuneToString(ToLowerRune(r))); + for (Rune r = i->lo; r <= i->hi; r++) { + if (latin1) { + a->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r))); + } else { + a->exact_.insert(RuneToString(ToLowerRune(r))); + } + } + a->is_exact_ = true; @@ -459,7 +486,7 @@ Prefilter::Info* Prefilter::Info::CClass(CharClass *cc) { class Prefilter::Info::Walker : public Regexp::Walker { public: - Walker() {} + Walker(bool latin1) : latin1_(latin1) {} virtual Info* PostVisit( Regexp* re, Info* parent_arg, @@ -470,7 +497,9 @@ class Prefilter::Info::Walker : public Regexp::Walker { Regexp* re, Info* parent_arg); + bool latin1() { return latin1_; } private: + bool latin1_; DISALLOW_EVIL_CONSTRUCTORS(Walker); }; @@ -478,7 +507,9 @@ Prefilter::Info* Prefilter::BuildInfo(Regexp* re) { if (Trace) { LOG(INFO) << "BuildPrefilter::Info: " << re->ToString(); } - Prefilter::Info::Walker w; + + bool latin1 = re->parse_flags() & Regexp::Latin1; + Prefilter::Info::Walker w(latin1); Prefilter::Info* info = w.WalkExponential(re, NULL, 100000); if (w.stopped_early()) { @@ -524,7 +555,12 @@ Prefilter::Info* Prefilter::Info::Walker::PostVisit( break; case kRegexpLiteral: - info = Literal(re->rune()); + if (latin1()) { + info = LiteralLatin1(re->rune()); + } + else { + info = Literal(re->rune()); + } 
break; case kRegexpLiteralString: @@ -532,9 +568,17 @@ Prefilter::Info* Prefilter::Info::Walker::PostVisit( info = NoMatch(); break; } - info = Literal(re->runes()[0]); - for (int i = 1; i < re->nrunes(); i++) - info = Concat(info, Literal(re->runes()[i])); + if (latin1()) { + info = LiteralLatin1(re->runes()[0]); + for (int i = 1; i < re->nrunes(); i++) { + info = Concat(info, LiteralLatin1(re->runes()[i])); + } + } else { + info = Literal(re->runes()[0]); + for (int i = 1; i < re->nrunes(); i++) { + info = Concat(info, Literal(re->runes()[i])); + } + } break; case kRegexpConcat: { @@ -585,7 +629,7 @@ Prefilter::Info* Prefilter::Info::Walker::PostVisit( break; case kRegexpCharClass: - info = CClass(re->cc()); + info = CClass(re->cc(), latin1()); break; case kRegexpCapture: diff --git a/re2/re2.cc b/re2/re2.cc index 448f28e..8d1d468 100644 --- a/re2/re2.cc +++ b/re2/re2.cc @@ -10,7 +10,6 @@ #include "re2/re2.h" #include -#include #include #include #include @@ -32,20 +31,42 @@ const VariadicFunction2 RE2::Consume; const VariadicFunction2 RE2::FindAndConsume; +// This will trigger LNK2005 error in MSVC. +#ifndef COMPILER_MSVC const int RE2::Options::kDefaultMaxMem; // initialized in re2.h +#endif // COMPILER_MSVC + +RE2::Options::Options(RE2::CannedOptions opt) + : encoding_(opt == RE2::Latin1 ? 
EncodingLatin1 : EncodingUTF8), + posix_syntax_(opt == RE2::POSIX), + longest_match_(opt == RE2::POSIX), + log_errors_(opt != RE2::Quiet), + max_mem_(kDefaultMaxMem), + literal_(false), + never_nl_(false), + never_capture_(false), + case_sensitive_(true), + perl_classes_(false), + word_boundary_(false), + one_line_(false) { +} -// Commonly-used option sets; arguments to constructor are: -// utf8 input -// posix syntax -// longest match -// log errors -const RE2::Options RE2::DefaultOptions; // EncodingUTF8, false, false, true -const RE2::Options RE2::Latin1(RE2::Options::EncodingLatin1, false, false, true); -const RE2::Options RE2::POSIX(RE2::Options::EncodingUTF8, true, true, true); -const RE2::Options RE2::Quiet(RE2::Options::EncodingUTF8, false, false, false); - -// If a regular expression has no error, its error_ field points here -static const string empty_string; +// static empty things for use as const references. +// To avoid global constructors, initialized on demand. +GLOBAL_MUTEX(empty_mutex); +static const string *empty_string; +static const map *empty_named_groups; +static const map *empty_group_names; + +static void InitEmpty() { + GLOBAL_MUTEX_LOCK(empty_mutex); + if (empty_string == NULL) { + empty_string = new string; + empty_named_groups = new map; + empty_group_names = new map; + } + GLOBAL_MUTEX_UNLOCK(empty_mutex); +} // Converts from Regexp error code to RE2 error code. // Maybe some day they will diverge. 
In any event, this @@ -111,7 +132,8 @@ int RE2::Options::ParseFlags() const { int flags = Regexp::ClassNL; switch (encoding()) { default: - LOG(ERROR) << "Unknown encoding " << encoding(); + if (log_errors()) + LOG(ERROR) << "Unknown encoding " << encoding(); break; case RE2::Options::EncodingUTF8: break; @@ -129,6 +151,9 @@ int RE2::Options::ParseFlags() const { if (never_nl()) flags |= Regexp::NeverNL; + if (never_capture()) + flags |= Regexp::NeverCapture; + if (!case_sensitive()) flags |= Regexp::FoldCase; @@ -148,7 +173,8 @@ void RE2::Init(const StringPiece& pattern, const Options& options) { mutex_ = new Mutex; pattern_ = pattern.as_string(); options_.Copy(options); - error_ = &empty_string; + InitEmpty(); + error_ = empty_string; error_code_ = NoError; suffix_regexp_ = NULL; entire_regexp_ = NULL; @@ -164,7 +190,7 @@ void RE2::Init(const StringPiece& pattern, const Options& options) { static_cast(options_.ParseFlags()), &status); if (entire_regexp_ == NULL) { - if (error_ == &empty_string) + if (error_ == empty_string) error_ = new string(status.Text()); if (options_.log_errors()) { LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': " @@ -206,7 +232,7 @@ void RE2::Init(const StringPiece& pattern, const Options& options) { // Returns rprog_, computing it if needed. 
re2::Prog* RE2::ReverseProg() const { MutexLock l(mutex_); - if (rprog_ == NULL && error_ == &empty_string) { + if (rprog_ == NULL && error_ == empty_string) { rprog_ = suffix_regexp_->CompileToReverseProg(options_.max_mem()/3); if (rprog_ == NULL) { if (options_.log_errors()) @@ -219,9 +245,6 @@ re2::Prog* RE2::ReverseProg() const { return rprog_; } -static const map empty_named_groups; -static const map empty_group_names; - RE2::~RE2() { if (suffix_regexp_) suffix_regexp_->Decref(); @@ -230,11 +253,11 @@ RE2::~RE2() { delete mutex_; delete prog_; delete rprog_; - if (error_ != &empty_string) + if (error_ != empty_string) delete error_; - if (named_groups_ != NULL && named_groups_ != &empty_named_groups) + if (named_groups_ != NULL && named_groups_ != empty_named_groups) delete named_groups_; - if (group_names_ != NULL && group_names_ != &empty_group_names) + if (group_names_ != NULL && group_names_ != empty_group_names) delete group_names_; } @@ -248,11 +271,11 @@ int RE2::ProgramSize() const { const map& RE2::NamedCapturingGroups() const { MutexLock l(mutex_); if (!ok()) - return empty_named_groups; + return *empty_named_groups; if (named_groups_ == NULL) { named_groups_ = suffix_regexp_->NamedCaptures(); if (named_groups_ == NULL) - named_groups_ = &empty_named_groups; + named_groups_ = empty_named_groups; } return *named_groups_; } @@ -261,11 +284,11 @@ const map& RE2::NamedCapturingGroups() const { const map& RE2::CapturingGroupNames() const { MutexLock l(mutex_); if (!ok()) - return empty_group_names; + return *empty_group_names; if (group_names_ == NULL) { group_names_ = suffix_regexp_->CaptureNames(); if (group_names_ == NULL) - group_names_ = &empty_group_names; + group_names_ = empty_group_names; } return *group_names_; } @@ -306,7 +329,7 @@ bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re, // Returns the maximum submatch needed for the rewrite to be done by Replace(). // E.g. if rewrite == "foo \\2,\\1", returns 2. 
-static int MaxSubmatch(const StringPiece& rewrite) { +int RE2::MaxSubmatch(const StringPiece& rewrite) { int max = 0; for (const char *s = rewrite.data(), *end = s + rewrite.size(); s < end; s++) { @@ -512,10 +535,11 @@ bool RE2::Match(const StringPiece& text, } if (startpos < 0 || startpos > endpos || endpos > text.size()) { - LOG(ERROR) << "RE2: invalid startpos, endpos pair."; + if (options_.log_errors()) + LOG(ERROR) << "RE2: invalid startpos, endpos pair."; return false; } - + StringPiece subtext = text; subtext.remove_prefix(startpos); subtext.remove_suffix(text.size() - endpos); @@ -631,7 +655,8 @@ bool RE2::Match(const StringPiece& text, LOG(INFO) << "Match " << trunc(pattern_) << " [" << CEscape(subtext) << "]" << " DFA inconsistency."; - LOG(ERROR) << "DFA inconsistency"; + if (options_.log_errors()) + LOG(ERROR) << "DFA inconsistency"; return false; } if (FLAGS_trace_re2) @@ -715,7 +740,7 @@ bool RE2::Match(const StringPiece& text, << " [" << CEscape(subtext) << "]" << " using OnePass."; if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) { - if (!skipped_test) + if (!skipped_test && options_.log_errors()) LOG(ERROR) << "SearchOnePass inconsistency"; return false; } @@ -726,7 +751,7 @@ bool RE2::Match(const StringPiece& text, << " using BitState."; if (!prog_->SearchBitState(subtext1, text, anchor, kind, submatch, ncap)) { - if (!skipped_test) + if (!skipped_test && options_.log_errors()) LOG(ERROR) << "SearchBitState inconsistency"; return false; } @@ -736,7 +761,7 @@ bool RE2::Match(const StringPiece& text, << " [" << CEscape(subtext) << "]" << " using NFA."; if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) { - if (!skipped_test) + if (!skipped_test && options_.log_errors()) LOG(ERROR) << "SearchNFA inconsistency"; return false; } @@ -835,8 +860,10 @@ bool RE2::Rewrite(string *out, const StringPiece &rewrite, if (isdigit(c)) { int n = (c - '0'); if (n >= veclen) { - LOG(ERROR) << "requested group " << n - << " 
in regexp " << rewrite.data(); + if (options_.log_errors()) { + LOG(ERROR) << "requested group " << n + << " in regexp " << rewrite.data(); + } return false; } StringPiece snip = vec[n]; @@ -845,7 +872,8 @@ bool RE2::Rewrite(string *out, const StringPiece &rewrite, } else if (c == '\\') { out->push_back('\\'); } else { - LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data(); + if (options_.log_errors()) + LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data(); return false; } } else { diff --git a/re2/re2.h b/re2/re2.h index 9dbc99c..272028b 100644 --- a/re2/re2.h +++ b/re2/re2.h @@ -187,12 +187,28 @@ #include "re2/variadic_function.h" namespace re2 { + using std::string; using std::map; class Mutex; class Prog; class Regexp; +// The following enum should be used only as a constructor argument to indicate +// that the variable has static storage class, and that the constructor should +// do nothing to its state. It indicates to the reader that it is legal to +// declare a static instance of the class, provided the constructor is given +// the LINKER_INITIALIZED argument. Normally, it is unsafe to declare a +// static variable that has a constructor or a destructor because invocation +// order is undefined. However, IF the type can be initialized by filling with +// zeroes (which the loader does for static variables), AND the type's +// destructor does nothing to the storage, then a constructor for static +// initialization can be declared as +// explicit MyClass(LinkerInitialized x) {} +// and invoked as +// static MyClass my_variable_name(LINKER_INITIALIZED); +enum LinkerInitialized { LINKER_INITIALIZED }; + // Interface for regular expression matching. Also corresponds to a // pre-compiled regular expression. An "RE2" object is safe for // concurrent use by multiple threads. @@ -229,12 +245,15 @@ class RE2 { // Predefined common options. 
// If you need more complicated things, instantiate - // an Option class, change the settings, and pass it to the - // RE2 constructor. - static const Options DefaultOptions; - static const Options Latin1; // treat input as Latin-1 (default UTF-8) - static const Options POSIX; // POSIX syntax, leftmost-longest match - static const Options Quiet; // do not log about regexp parse errors + // an Options class, possibly passing one of these to + // the Options constructor, change the settings, and pass that + // Options class to the RE2 constructor. + enum CannedOptions { + DefaultOptions = 0, + Latin1, // treat input as Latin-1 (default UTF-8) + POSIX, // POSIX syntax, leftmost-longest match + Quiet // do not log about regexp parse errors + }; // Need to have the const char* and const string& forms for implicit // conversions when passing string literals to FullMatch and PartialMatch. @@ -467,6 +486,20 @@ class RE2 { // fail because of a bad rewrite string. bool CheckRewriteString(const StringPiece& rewrite, string* error) const; + // Returns the maximum submatch needed for the rewrite to be done by + // Replace(). E.g. if rewrite == "foo \\2,\\1", returns 2. + static int MaxSubmatch(const StringPiece& rewrite); + + // Append the "rewrite" string, with backslash substitutions from "vec", + // to string "out". + // Returns true on success. This method can fail because of a malformed + // rewrite string. CheckRewriteString guarantees that the rewrite will + // be successful. + bool Rewrite(string *out, + const StringPiece &rewrite, + const StringPiece* vec, + int veclen) const; + // Constructor options class Options { public: @@ -479,6 +512,7 @@ class RE2 { // max_mem (see below) approx.
max memory footprint of RE2 // literal (false) interpret string as literal, not regexp // never_nl (false) never match \n, even if it is in regexp + // never_capture (false) parse all parens as non-capturing // case_sensitive (true) match is case-sensitive (regexp can override // with (?i) unless in posix_syntax mode) // @@ -533,11 +567,14 @@ class RE2 { max_mem_(kDefaultMaxMem), literal_(false), never_nl_(false), + never_capture_(false), case_sensitive_(true), perl_classes_(false), word_boundary_(false), one_line_(false) { } + + /*implicit*/ Options(CannedOptions); Encoding encoding() const { return encoding_; } void set_encoding(Encoding encoding) { encoding_ = encoding; } @@ -571,6 +608,9 @@ class RE2 { bool never_nl() const { return never_nl_; } void set_never_nl(bool b) { never_nl_ = b; } + bool never_capture() const { return never_capture_; } + void set_never_capture(bool b) { never_capture_ = b; } + bool case_sensitive() const { return case_sensitive_; } void set_case_sensitive(bool b) { case_sensitive_ = b; } @@ -591,6 +631,7 @@ class RE2 { max_mem_ = src.max_mem_; literal_ = src.literal_; never_nl_ = src.never_nl_; + never_capture_ = src.never_capture_; case_sensitive_ = src.case_sensitive_; perl_classes_ = src.perl_classes_; word_boundary_ = src.word_boundary_; @@ -600,25 +641,6 @@ class RE2 { int ParseFlags() const; private: - // Private constructor for defining constants like RE2::Latin1. 
- friend class RE2; - Options(Encoding encoding, - bool posix_syntax, - bool longest_match, - bool log_errors) : - encoding_(encoding), - posix_syntax_(posix_syntax), - longest_match_(longest_match), - log_errors_(log_errors), - max_mem_(kDefaultMaxMem), - literal_(false), - never_nl_(false), - case_sensitive_(true), - perl_classes_(false), - word_boundary_(false), - one_line_(false) { - } - Encoding encoding_; bool posix_syntax_; bool longest_match_; @@ -626,6 +648,7 @@ class RE2 { int64_t max_mem_; bool literal_; bool never_nl_; + bool never_capture_; bool case_sensitive_; bool perl_classes_; bool word_boundary_; @@ -670,11 +693,6 @@ class RE2 { private: void Init(const StringPiece& pattern, const Options& options); - bool Rewrite(string *out, - const StringPiece &rewrite, - const StringPiece* vec, - int veclen) const; - bool DoMatch(const StringPiece& text, Anchor anchor, int* consumed, diff --git a/re2/regexp.cc b/re2/regexp.cc index 9486b3c..a74ceec 100644 --- a/re2/regexp.cc +++ b/re2/regexp.cc @@ -59,29 +59,39 @@ bool Regexp::QuickDestroy() { return false; } -static map ref_map; -static Mutex ref_mutex; +static map *ref_map; +GLOBAL_MUTEX(ref_mutex); int Regexp::Ref() { if (ref_ < kMaxRef) return ref_; - MutexLock l(&ref_mutex); - return ref_map[this]; + GLOBAL_MUTEX_LOCK(ref_mutex); + int r = 0; + if (ref_map != NULL) { + r = (*ref_map)[this]; + } + GLOBAL_MUTEX_UNLOCK(ref_mutex); + return r; } // Increments reference count, returns object as convenience. Regexp* Regexp::Incref() { if (ref_ >= kMaxRef-1) { // Store ref count in overflow map. 
- MutexLock l(&ref_mutex); - if (ref_ == kMaxRef) { // already overflowed - ref_map[this]++; - return this; + GLOBAL_MUTEX_LOCK(ref_mutex); + if (ref_map == NULL) { + ref_map = new map; + } + if (ref_ == kMaxRef) { + // already overflowed + (*ref_map)[this]++; + } else { + // overflowing now + (*ref_map)[this] = kMaxRef; + ref_ = kMaxRef; } - // overflowing now - ref_map[this] = kMaxRef; - ref_ = kMaxRef; + GLOBAL_MUTEX_UNLOCK(ref_mutex); return this; } @@ -93,14 +103,15 @@ Regexp* Regexp::Incref() { void Regexp::Decref() { if (ref_ == kMaxRef) { // Ref count is stored in overflow map. - MutexLock l(&ref_mutex); - int r = ref_map[this] - 1; + GLOBAL_MUTEX_LOCK(ref_mutex); + int r = (*ref_map)[this] - 1; if (r < kMaxRef) { ref_ = r; - ref_map.erase(this); + ref_map->erase(this); } else { - ref_map[this] = r; + (*ref_map)[this] = r; } + GLOBAL_MUTEX_UNLOCK(ref_mutex); return; } ref_--; @@ -447,7 +458,7 @@ bool Regexp::Equal(Regexp* a, Regexp* b) { } // Keep in sync with enum RegexpStatusCode in regexp.h -static const string kErrorStrings[] = { +static const char *kErrorStrings[] = { "no error", "unexpected error", "invalid escape sequence", @@ -464,7 +475,7 @@ static const string kErrorStrings[] = { "invalid named capture group", }; -const string& RegexpStatus::CodeText(enum RegexpStatusCode code) { +string RegexpStatus::CodeText(enum RegexpStatusCode code) { if (code < 0 || code >= arraysize(kErrorStrings)) code = kRegexpInternalError; return kErrorStrings[code]; diff --git a/re2/regexp.h b/re2/regexp.h index 1aebc16..331c017 100644 --- a/re2/regexp.h +++ b/re2/regexp.h @@ -197,7 +197,7 @@ class RegexpStatus { // Returns text equivalent of code, e.g.: // "Bad character class" - static const string& CodeText(enum RegexpStatusCode code); + static string CodeText(enum RegexpStatusCode code); // Returns text describing error, e.g.: // "Bad character class: [z-a]" @@ -299,6 +299,7 @@ class Regexp { // and \P{Han} for its negation. 
NeverNL = 1<<11, // Never match NL, even if the regexp mentions // it explicitly. + NeverCapture = 1<<12, // Parse all parens as non-capturing. // As close to Perl as we can get. LikePerl = ClassNL | OneLine | PerlClasses | PerlB | PerlX | diff --git a/re2/testing/backtrack.cc b/re2/testing/backtrack.cc new file mode 100644 index 0000000..b2dd6db --- /dev/null +++ b/re2/testing/backtrack.cc @@ -0,0 +1,254 @@ +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Tested by search_test.cc, exhaustive_test.cc, tester.cc +// +// Prog::UnsafeSearchBacktrack is a backtracking regular expression search, +// except that it remembers where it has been, trading a lot of +// memory for a lot of time. It exists only for testing purposes. +// +// Let me repeat that. +// +// THIS CODE SHOULD NEVER BE USED IN PRODUCTION: +// - It uses a ton of memory. +// - It uses a ton of stack. +// - It uses CHECK and LOG(FATAL). +// - It implements unanchored search by repeated anchored search. +// +// On the other hand, it is very simple and a good reference +// implementation for the more complicated regexp packages. +// +// In BUILD, this file is linked into the ":testing" library, +// not the main library, in order to make it harder to pick up +// accidentally. + +#include "util/util.h" +#include "re2/prog.h" +#include "re2/regexp.h" + +namespace re2 { + +// Backtracker holds the state for a backtracking search. +// +// Excluding the search parameters, the main search state +// is just the "capture registers", which record, for the +// current execution, the string position at which each +// parenthesis was passed. cap_[0] and cap_[1] are the +// left and right parenthesis in $0, cap_[2] and cap_[3] in $1, etc.
+// +// To avoid infinite loops during backtracking on expressions +// like (a*)*, the visited_[] bitmap marks the (state, string-position) +// pairs that have already been explored and are thus not worth +// re-exploring if we get there via another path. Modern backtracking +// libraries engineer their program representation differently, to make +// such infinite loops possible to avoid without keeping a giant visited_ +// bitmap, but visited_ works fine for a reference implementation +// and it has the nice benefit of making the search run in linear time. +class Backtracker { + public: + explicit Backtracker(Prog* prog); + ~Backtracker(); + + bool Search(const StringPiece& text, const StringPiece& context, + bool anchored, bool longest, + StringPiece* submatch, int nsubmatch); + + private: + // Explores from instruction ip at string position p looking for a match. + // Returns true if found (so that caller can stop trying other possibilities). + bool Visit(int id, const char* p); + + // Search parameters + Prog* prog_; // program being run + StringPiece text_; // text being searched + StringPiece context_; // greater context of text being searched + bool anchored_; // whether search is anchored at text.begin() + bool longest_; // whether search wants leftmost-longest match + bool endmatch_; // whether search must end at text.end() + StringPiece *submatch_; // submatches to fill in + int nsubmatch_; // # of submatches to fill in + + // Search state + const char* cap_[64]; // capture registers + uint32 *visited_; // bitmap: (Inst*, char*) pairs already backtracked + int nvisited_; // # of words in bitmap +}; + +Backtracker::Backtracker(Prog* prog) + : prog_(prog), + anchored_(false), + longest_(false), + endmatch_(false), + submatch_(NULL), + nsubmatch_(0), + visited_(NULL), + nvisited_(0) { +} + +Backtracker::~Backtracker() { + delete[] visited_; +} + +// Runs a backtracking search. 
+bool Backtracker::Search(const StringPiece& text, const StringPiece& context, + bool anchored, bool longest, + StringPiece* submatch, int nsubmatch) { + text_ = text; + context_ = context; + if (context_.begin() == NULL) + context_ = text; + if (prog_->anchor_start() && text.begin() > context_.begin()) + return false; + if (prog_->anchor_end() && text.end() < context_.end()) + return false; + anchored_ = anchored | prog_->anchor_start(); + longest_ = longest | prog_->anchor_end(); + endmatch_ = prog_->anchor_end(); + submatch_ = submatch; + nsubmatch_ = nsubmatch; + CHECK(2*nsubmatch_ < arraysize(cap_)); + memset(cap_, 0, sizeof cap_); + + // We use submatch_[0] for our own bookkeeping, + // so it had better exist. + StringPiece sp0; + if (nsubmatch < 1) { + submatch_ = &sp0; + nsubmatch_ = 1; + } + submatch_[0] = NULL; + + // Allocate new visited_ bitmap -- size is proportional + // to text, so have to reallocate on each call to Search. + delete[] visited_; + nvisited_ = (prog_->size()*(text.size()+1) + 31)/32; + visited_ = new uint32[nvisited_]; + memset(visited_, 0, nvisited_*sizeof visited_[0]); + + // Anchored search must start at text.begin(). + if (anchored_) { + cap_[0] = text.begin(); + return Visit(prog_->start(), text.begin()); + } + + // Unanchored search, starting from each possible text position. + // Notice that we have to try the empty string at the end of + // the text, so the loop condition is p <= text.end(), not p < text.end(). + for (const char* p = text.begin(); p <= text.end(); p++) { + cap_[0] = p; + if (Visit(prog_->start(), p)) // Match must be leftmost; done. + return true; + } + return false; +} + +// Explores from instruction ip at string position p looking for a match. +// Return true if found (so that caller can stop trying other possibilities). +bool Backtracker::Visit(int id, const char* p) { + // Check bitmap. If we've already explored from here, + // either it didn't match or it did but we're hoping for a better match. 
+ // Either way, don't go down that road again. + CHECK(p <= text_.end()); + int n = id*(text_.size()+1) + (p - text_.begin()); + CHECK_LT(n/32, nvisited_); + if (visited_[n/32] & (1 << (n&31))) + return false; + visited_[n/32] |= 1 << (n&31); + + // Pick out byte at current position. If at end of string, + // have to explore in hope of finishing a match. Use impossible byte -1. + int c = -1; + if (p < text_.end()) + c = *p & 0xFF; + + Prog::Inst* ip = prog_->inst(id); + switch (ip->opcode()) { + default: + LOG(FATAL) << "Unexpected opcode: " << (int)ip->opcode(); + return false; // not reached + + case kInstAlt: + case kInstAltMatch: + // Try both possible next states: out is preferred to out1. + if (Visit(ip->out(), p)) { + if (longest_) + Visit(ip->out1(), p); + return true; + } + return Visit(ip->out1(), p); + + case kInstByteRange: + if (ip->Matches(c)) + return Visit(ip->out(), p+1); + return false; + + case kInstCapture: + if (0 <= ip->cap() && ip->cap() < arraysize(cap_)) { + // Capture p to register, but save old value. + const char* q = cap_[ip->cap()]; + cap_[ip->cap()] = p; + bool ret = Visit(ip->out(), p); + // Restore old value as we backtrack. + cap_[ip->cap()] = q; + return ret; + } + return Visit(ip->out(), p); + + case kInstEmptyWidth: + if (ip->empty() & ~Prog::EmptyFlags(context_, p)) + return false; + return Visit(ip->out(), p); + + case kInstNop: + return Visit(ip->out(), p); + + case kInstMatch: + // We found a match. If it's the best so far, record the + // parameters in the caller's submatch_ array. + if (endmatch_ && p != context_.end()) + return false; + cap_[1] = p; + if (submatch_[0].data() == NULL || // First match so far ... + (longest_ && p > submatch_[0].end())) { // ... or better match + for (int i = 0; i < nsubmatch_; i++) + submatch_[i] = StringPiece(cap_[2*i], cap_[2*i+1] - cap_[2*i]); + } + return true; + + case kInstFail: + return false; + } +} + +// Runs a backtracking search. 
+bool Prog::UnsafeSearchBacktrack(const StringPiece& text, + const StringPiece& context, + Anchor anchor, + MatchKind kind, + StringPiece* match, + int nmatch) { + // If full match, we ask for an anchored longest match + // and then check that match[0] == text. + // So make sure match[0] exists. + StringPiece sp0; + if (kind == kFullMatch) { + anchor = kAnchored; + if (nmatch < 1) { + match = &sp0; + nmatch = 1; + } + } + + // Run the search. + Backtracker b(this); + bool anchored = anchor == kAnchored; + bool longest = kind != kFirstMatch; + if (!b.Search(text, context, anchored, longest, match, nmatch)) + return false; + if (kind == kFullMatch && match[0].end() != text.end()) + return false; + return true; +} + +} // namespace re2 diff --git a/re2/testing/charclass_test.cc b/re2/testing/charclass_test.cc new file mode 100644 index 0000000..a3764d4 --- /dev/null +++ b/re2/testing/charclass_test.cc @@ -0,0 +1,223 @@ +// Copyright 2006 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Test character class manipulations. 
+ +#include "util/test.h" +#include "re2/regexp.h" + +namespace re2 { + +struct CCTest { + struct { + Rune lo; + Rune hi; + } add[10]; + int remove; + struct { + Rune lo; + Rune hi; + } final[10]; +}; + +static CCTest tests[] = { + { { { 10, 20 }, {-1} }, -1, + { { 10, 20 }, {-1} } }, + + { { { 10, 20 }, { 20, 30 }, {-1} }, -1, + { { 10, 30 }, {-1} } }, + + { { { 10, 20 }, { 30, 40 }, { 20, 30 }, {-1} }, -1, + { { 10, 40 }, {-1} } }, + + { { { 0, 50 }, { 20, 30 }, {-1} }, -1, + { { 0, 50 }, {-1} } }, + + { { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} }, -1, + { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} } }, + + { { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, {-1} }, -1, + { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} } }, + + { { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, {-1} }, -1, + { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} } }, + + { { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, { 5, 25 }, {-1} }, -1, + { { 5, 25 }, {-1} } }, + + { { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, { 12, 21 }, {-1} }, -1, + { { 10, 23 }, {-1} } }, + + // These check boundary cases during negation. + { { { 0, Runemax }, {-1} }, -1, + { { 0, Runemax }, {-1} } }, + + { { { 0, 50 }, {-1} }, -1, + { { 0, 50 }, {-1} } }, + + { { { 50, Runemax }, {-1} }, -1, + { { 50, Runemax }, {-1} } }, + + // Check RemoveAbove. 
+ { { { 50, Runemax }, {-1} }, 255, + { { 50, 255 }, {-1} } }, + + { { { 50, Runemax }, {-1} }, 65535, + { { 50, 65535 }, {-1} } }, + + { { { 50, Runemax }, {-1} }, Runemax, + { { 50, Runemax }, {-1} } }, + + { { { 50, 60 }, { 250, 260 }, { 350, 360 }, {-1} }, 255, + { { 50, 60 }, { 250, 255 }, {-1} } }, + + { { { 50, 60 }, {-1} }, 255, + { { 50, 60 }, {-1} } }, + + { { { 350, 360 }, {-1} }, 255, + { {-1} } }, + + { { {-1} }, 255, + { {-1} } }, +}; + +template +static void Broke(const char *desc, const CCTest* t, CharClass* cc) { + if (t == NULL) { + printf("\t%s:", desc); + } else { + printf("\n"); + printf("CharClass added: [%s]", desc); + for (int k = 0; t->add[k].lo >= 0; k++) + printf(" %d-%d", t->add[k].lo, t->add[k].hi); + printf("\n"); + if (t->remove >= 0) + printf("Removed > %d\n", t->remove); + printf("\twant:"); + for (int k = 0; t->final[k].lo >= 0; k++) + printf(" %d-%d", t->final[k].lo, t->final[k].hi); + printf("\n"); + printf("\thave:"); + } + + for (typename CharClass::iterator it = cc->begin(); it != cc->end(); ++it) + printf(" %d-%d", it->lo, it->hi); + printf("\n"); +} + +bool ShouldContain(CCTest *t, int x) { + for (int j = 0; t->final[j].lo >= 0; j++) + if (t->final[j].lo <= x && x <= t->final[j].hi) + return true; + return false; +} + +// Helpers to make templated CorrectCC work with both CharClass and CharClassBuilder. 
+ +CharClass* Negate(CharClass *cc) { + return cc->Negate(); +} + +void Delete(CharClass* cc) { + cc->Delete(); +} + +CharClassBuilder* Negate(CharClassBuilder* cc) { + CharClassBuilder* ncc = cc->Copy(); + ncc->Negate(); + return ncc; +} + +void Delete(CharClassBuilder* cc) { + delete cc; +} + +template +bool CorrectCC(CharClass *cc, CCTest *t, const char *desc) { + typename CharClass::iterator it = cc->begin(); + int size = 0; + for (int j = 0; t->final[j].lo >= 0; j++, ++it) { + if (it == cc->end() || + it->lo != t->final[j].lo || + it->hi != t->final[j].hi) { + Broke(desc, t, cc); + return false; + } + size += it->hi - it->lo + 1; + } + if (it != cc->end()) { + Broke(desc, t, cc); + return false; + } + if (cc->size() != size) { + Broke(desc, t, cc); + printf("wrong size: want %d have %d\n", size, cc->size()); + return false; + } + + for (int j = 0; j < 101; j++) { + if (j == 100) + j = Runemax; + if (ShouldContain(t, j) != cc->Contains(j)) { + Broke(desc, t, cc); + printf("want contains(%d)=%d, got %d\n", + j, ShouldContain(t, j), cc->Contains(j)); + return false; + } + } + + CharClass* ncc = Negate(cc); + for (int j = 0; j < 101; j++) { + if (j == 100) + j = Runemax; + if (ShouldContain(t, j) == ncc->Contains(j)) { + Broke(desc, t, cc); + Broke("ncc", NULL, ncc); + printf("want ncc contains(%d)!=%d, got %d\n", + j, ShouldContain(t, j), ncc->Contains(j)); + Delete(ncc); + return false; + } + if (ncc->size() != Runemax+1 - cc->size()) { + Broke(desc, t, cc); + Broke("ncc", NULL, ncc); + printf("ncc size should be %d is %d\n", + Runemax+1 - cc->size(), ncc->size()); + Delete(ncc); + return false; + } + } + Delete(ncc); + return true; +} + +TEST(TestCharClassBuilder, Adds) { + int nfail = 0; + for (int i = 0; i < arraysize(tests); i++) { + CharClassBuilder ccb; + CCTest* t = &tests[i]; + for (int j = 0; t->add[j].lo >= 0; j++) + ccb.AddRange(t->add[j].lo, t->add[j].hi); + if (t->remove >= 0) + ccb.RemoveAbove(t->remove); + if (!CorrectCC(&ccb, t, "before copy 
(CharClassBuilder)")) + nfail++; + CharClass* cc = ccb.GetCharClass(); + if (!CorrectCC(cc, t, "before copy (CharClass)")) + nfail++; + cc->Delete(); + + CharClassBuilder *ccb1 = ccb.Copy(); + if (!CorrectCC(ccb1, t, "after copy (CharClassBuilder)")) + nfail++; + cc = ccb.GetCharClass(); + if (!CorrectCC(cc, t, "after copy (CharClass)")) + nfail++; + cc->Delete(); + delete ccb1; + } + EXPECT_EQ(nfail, 0); +} + +} // namespace re2 diff --git a/re2/testing/compile_test.cc b/re2/testing/compile_test.cc new file mode 100644 index 0000000..8d92105 --- /dev/null +++ b/re2/testing/compile_test.cc @@ -0,0 +1,171 @@ +// Copyright 2007 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Test prog.cc, compile.cc + +#include +#include +#include "util/test.h" +#include "re2/regexp.h" +#include "re2/prog.h" + +DEFINE_string(show, "", "regular expression to compile and dump"); + +namespace re2 { + +// Simple input/output tests checking that +// the regexp compiles to the expected code. +// These are just to sanity check the basic implementation. +// The real confidence tests happen by testing the NFA/DFA +// that run the compiled code. + +struct Test { + const char* regexp; + const char* code; +}; + +static Test tests[] = { + { "a", + "1. byte [61-61] -> 2\n" + "2. match! 0\n" }, + { "ab", + "1. byte [61-61] -> 2\n" + "2. byte [62-62] -> 3\n" + "3. match! 0\n" }, + { "a|c", + "3. alt -> 1 | 2\n" + "1. byte [61-61] -> 4\n" + "2. byte [63-63] -> 4\n" + "4. match! 0\n" }, + { "a|b", + "1. byte [61-62] -> 2\n" + "2. match! 0\n" }, + { "[ab]", + "1. byte [61-62] -> 2\n" + "2. match! 0\n" }, + { "a+", + "1. byte [61-61] -> 2\n" + "2. alt -> 1 | 3\n" + "3. match! 0\n" }, + { "a+?", + "1. byte [61-61] -> 2\n" + "2. alt -> 3 | 1\n" + "3. match! 0\n" }, + { "a*", + "2. alt -> 1 | 3\n" + "1. byte [61-61] -> 2\n" + "3. match! 0\n" }, + { "a*?", + "2. alt -> 3 | 1\n" + "3. match! 0\n" + "1. 
byte [61-61] -> 2\n" }, + { "a?", + "2. alt -> 1 | 3\n" + "1. byte [61-61] -> 3\n" + "3. match! 0\n" }, + { "a??", + "2. alt -> 3 | 1\n" + "3. match! 0\n" + "1. byte [61-61] -> 3\n" }, + { "a{4}", + "1. byte [61-61] -> 2\n" + "2. byte [61-61] -> 3\n" + "3. byte [61-61] -> 4\n" + "4. byte [61-61] -> 5\n" + "5. match! 0\n" }, + { "(a)", + "2. capture 2 -> 1\n" + "1. byte [61-61] -> 3\n" + "3. capture 3 -> 4\n" + "4. match! 0\n" }, + { "(?:a)", + "1. byte [61-61] -> 2\n" + "2. match! 0\n" }, + { "", + "2. match! 0\n" }, + { ".", + "3. alt -> 1 | 2\n" + "1. byte [00-09] -> 4\n" + "2. byte [0b-ff] -> 4\n" + "4. match! 0\n" }, + { "[^ab]", + "5. alt -> 3 | 4\n" + "3. alt -> 1 | 2\n" + "4. byte [63-ff] -> 6\n" + "1. byte [00-09] -> 6\n" + "2. byte [0b-60] -> 6\n" + "6. match! 0\n" }, + { "[Aa]", + "1. byte/i [61-61] -> 2\n" + "2. match! 0\n" }, +}; + +TEST(TestRegexpCompileToProg, Simple) { + int failed = 0; + for (int i = 0; i < arraysize(tests); i++) { + const re2::Test& t = tests[i]; + Regexp* re = Regexp::Parse(t.regexp, Regexp::PerlX|Regexp::Latin1, NULL); + if (re == NULL) { + LOG(ERROR) << "Cannot parse: " << t.regexp; + failed++; + continue; + } + Prog* prog = re->CompileToProg(0); + if (prog == NULL) { + LOG(ERROR) << "Cannot compile: " << t.regexp; + re->Decref(); + failed++; + continue; + } + CHECK(re->CompileToProg(1) == NULL); + string s = prog->Dump(); + if (s != t.code) { + LOG(ERROR) << "Incorrect compiled code for: " << t.regexp; + LOG(ERROR) << "Want:\n" << t.code; + LOG(ERROR) << "Got:\n" << s; + failed++; + } + delete prog; + re->Decref(); + } + EXPECT_EQ(failed, 0); +} + +// The distinct byte ranges involved in the UTF-8 dot ([^\n]). +// Once, erroneously split between 0x3f and 0x40 because it is +// a 6-bit boundary. 
+static struct UTF8ByteRange { + int lo; + int hi; +} utf8ranges[] = { + { 0x00, 0x09 }, + { 0x0A, 0x0A }, + { 0x10, 0x7F }, + { 0x80, 0x8F }, + { 0x90, 0x9F }, + { 0xA0, 0xBF }, + { 0xC0, 0xC1 }, + { 0xC2, 0xDF }, + { 0xE0, 0xE0 }, + { 0xE1, 0xEF }, + { 0xF0, 0xF0 }, + { 0xF1, 0xF3 }, + { 0xF4, 0xF4 }, + { 0xF5, 0xFF }, +}; + +TEST(TestCompile, ByteRanges) { + Regexp* re = Regexp::Parse(".", Regexp::PerlX, NULL); + EXPECT_TRUE(re != NULL); + Prog* prog = re->CompileToProg(0); + EXPECT_TRUE(prog != NULL); + EXPECT_EQ(prog->bytemap_range(), arraysize(utf8ranges)); + for (int i = 0; i < arraysize(utf8ranges); i++) + for (int j = utf8ranges[i].lo; j <= utf8ranges[i].hi; j++) + EXPECT_EQ(prog->bytemap()[j], i) << " byte " << j; + delete prog; + re->Decref(); +} + +} // namespace re2 diff --git a/re2/testing/dfa_test.cc b/re2/testing/dfa_test.cc new file mode 100644 index 0000000..8e95ae4 --- /dev/null +++ b/re2/testing/dfa_test.cc @@ -0,0 +1,344 @@ +// Copyright 2006-2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "util/test.h" +#include "util/thread.h" +#include "re2/prog.h" +#include "re2/re2.h" +#include "re2/regexp.h" +#include "re2/testing/regexp_generator.h" +#include "re2/testing/string_generator.h" + +DECLARE_bool(re2_dfa_bail_when_slow); + +DEFINE_int32(size, 8, "log2(number of DFA nodes)"); +DEFINE_int32(repeat, 2, "Repetition count."); +DEFINE_int32(threads, 4, "number of threads"); + +namespace re2 { + +// Check that multithreaded access to DFA class works. + +// Helper thread: builds entire DFA for prog. +class BuildThread : public Thread { + public: + BuildThread(Prog* prog) : prog_(prog) {} + virtual void Run() { + CHECK(prog_->BuildEntireDFA(Prog::kFirstMatch)); + } + + private: + Prog* prog_; +}; + +TEST(Multithreaded, BuildEntireDFA) { + // Create regexp with 2^FLAGS_size states in DFA. 
+ string s = "a"; + for (int i = 0; i < FLAGS_size; i++) + s += "[ab]"; + s += "b"; + + // Check that single-threaded code works. + { + //LOG(INFO) << s; + Regexp* re = Regexp::Parse(s.c_str(), Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + BuildThread* t = new BuildThread(prog); + t->SetJoinable(true); + t->Start(); + t->Join(); + delete t; + delete prog; + re->Decref(); + } + + // Build the DFA simultaneously in a bunch of threads. + for (int i = 0; i < FLAGS_repeat; i++) { + Regexp* re = Regexp::Parse(s.c_str(), Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + + vector threads; + for (int j = 0; j < FLAGS_threads; j++) { + BuildThread *t = new BuildThread(prog); + t->SetJoinable(true); + threads.push_back(t); + } + for (int j = 0; j < FLAGS_threads; j++) + threads[j]->Start(); + for (int j = 0; j < FLAGS_threads; j++) { + threads[j]->Join(); + delete threads[j]; + } + + // One more compile, to make sure everything is okay. + prog->BuildEntireDFA(Prog::kFirstMatch); + delete prog; + re->Decref(); + } +} + +// Check that DFA size requirements are followed. +// BuildEntireDFA will, like SearchDFA, stop building out +// the DFA once the memory limits are reached. +TEST(SingleThreaded, BuildEntireDFA) { + // Create regexp with 2^30 states in DFA. 
+ string s = "a"; + for (int i = 0; i < 30; i++) + s += "[ab]"; + s += "b"; + + //LOG(INFO) << s; + Regexp* re = Regexp::Parse(s.c_str(), Regexp::LikePerl, NULL); + CHECK(re); + int max = 24; + for (int i = 17; i < max; i++) { + int limit = 1<CompileToProg(limit); + CHECK(prog); + //progusage = m.HeapGrowth(); + //dfamem = prog->dfa_mem(); + prog->BuildEntireDFA(Prog::kFirstMatch); + prog->BuildEntireDFA(Prog::kLongestMatch); + usage = m.HeapGrowth(); + delete prog; + } + if (!UsingMallocCounter) + continue; + //LOG(INFO) << StringPrintf("Limit %d: prog used %d, DFA budget %d, total %d\n", + // limit, progusage, dfamem, usage); + CHECK_GT(usage, limit*9/10); + CHECK_LT(usage, limit + (16<<10)); // 16kB of slop okay + } + re->Decref(); +} + +// Generates and returns a string over binary alphabet {0,1} that contains +// all possible binary sequences of length n as subsequences. The obvious +// brute force method would generate a string of length n * 2^n, but this +// generates a string of length n + 2^n - 1 called a De Bruijn cycle. +// See Knuth, The Art of Computer Programming, Vol 2, Exercise 3.2.2 #17. +// Such a string is useful for testing a DFA. If you have a DFA +// where distinct last n bytes implies distinct states, then running on a +// DeBruijn string causes the DFA to need to create a new state at every +// position in the input, never reusing any states until it gets to the +// end of the string. This is the worst possible case for DFA execution. 
+static string DeBruijnString(int n) { + CHECK_LT(n, 8*sizeof(int)); + CHECK_GT(n, 0); + + vector did(1<CompileToProg(1<SearchDFA(match, NULL, + Prog::kUnanchored, Prog::kFirstMatch, + NULL, &failed, NULL); + CHECK(!failed); + CHECK(matched); + matched = prog->SearchDFA(no_match, NULL, + Prog::kUnanchored, Prog::kFirstMatch, + NULL, &failed, NULL); + CHECK(!failed); + CHECK(!matched); + } + usage = m.HeapGrowth(); + peak_usage = m.PeakHeapGrowth(); + delete prog; + } + re->Decref(); + + if (!UsingMallocCounter) + return; + //LOG(INFO) << "usage " << usage << " " << peak_usage; + CHECK_LT(usage, 1<SearchDFA(match_, NULL, + Prog::kUnanchored, Prog::kFirstMatch, + NULL, &failed, NULL); + CHECK(!failed); + CHECK(matched); + matched = prog_->SearchDFA(no_match_, NULL, + Prog::kUnanchored, Prog::kFirstMatch, + NULL, &failed, NULL); + CHECK(!failed); + CHECK(!matched); + } + } + + private: + Prog* prog_; + StringPiece match_; + StringPiece no_match_; +}; + +TEST(Multithreaded, SearchDFA) { + // Same as single-threaded test above. + const int n = 18; + Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n), + Regexp::LikePerl, NULL); + CHECK(re); + string no_match = DeBruijnString(n); + string match = no_match + "0"; + FLAGS_re2_dfa_bail_when_slow = false; + + // Check that single-threaded code works. + { + Prog* prog = re->CompileToProg(1<SetJoinable(true); + t->Start(); + t->Join(); + delete t; + delete prog; + } + + // Run the search simultaneously in a bunch of threads. + // Reuse same flags for Multithreaded.BuildEntireDFA above.
+ for (int i = 0; i < FLAGS_repeat; i++) { + //LOG(INFO) << "Search " << i; + Prog* prog = re->CompileToProg(1< threads; + for (int j = 0; j < FLAGS_threads; j++) { + SearchThread *t = new SearchThread(prog, match, no_match); + t->SetJoinable(true); + threads.push_back(t); + } + for (int j = 0; j < FLAGS_threads; j++) + threads[j]->Start(); + for (int j = 0; j < FLAGS_threads; j++) { + threads[j]->Join(); + delete threads[j]; + } + delete prog; + } + re->Decref(); +} + +struct ReverseTest { + const char *regexp; + const char *text; + bool match; +}; + +// Test that reverse DFA handles anchored/unanchored correctly. +// It's in the DFA interface but not used by RE2. +ReverseTest reverse_tests[] = { + { "\\A(a|b)", "abc", true }, + { "(a|b)\\z", "cba", true }, + { "\\A(a|b)", "cba", false }, + { "(a|b)\\z", "abc", false }, +}; + +TEST(DFA, ReverseMatch) { + int nfail = 0; + for (int i = 0; i < arraysize(reverse_tests); i++) { + const ReverseTest& t = reverse_tests[i]; + Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog *prog = re->CompileToReverseProg(0); + CHECK(prog); + bool failed = false; + bool matched = prog->SearchDFA(t.text, NULL, Prog::kUnanchored, Prog::kFirstMatch, NULL, &failed, NULL); + if (matched != t.match) { + LOG(ERROR) << t.regexp << " on " << t.text << ": want " << t.match; + nfail++; + } + delete prog; + re->Decref(); + } + EXPECT_EQ(nfail, 0); +} + +} // namespace re2 diff --git a/re2/testing/dump.cc b/re2/testing/dump.cc new file mode 100644 index 0000000..4bdf714 --- /dev/null +++ b/re2/testing/dump.cc @@ -0,0 +1,164 @@ +// Copyright 2006 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Dump the regexp into a string showing structure. 
+// Tested by parse_unittest.cc + +// This function traverses the regexp recursively, +// meaning that on inputs like Regexp::Simplify of +// a{100}{100}{100}{100}{100}{100}{100}{100}{100}{100}, +// it takes time and space exponential in the size of the +// original regular expression. It can also use stack space +// linear in the size of the regular expression for inputs +// like ((((((((((((((((a*)*)*)*)*)*)*)*)*)*)*)*)*)*)*)*)*. +// IT IS NOT SAFE TO CALL FROM PRODUCTION CODE. +// As a result, Dump is provided only in the testing +// library (see BUILD). + +#include +#include +#include "util/test.h" +#include "re2/stringpiece.h" +#include "re2/regexp.h" + +// Cause a link error if this file is used outside of testing. +DECLARE_string(test_tmpdir); + +namespace re2 { + +static const char* kOpcodeNames[] = { + "bad", + "no", + "emp", + "lit", + "str", + "cat", + "alt", + "star", + "plus", + "que", + "rep", + "cap", + "dot", + "byte", + "bol", + "eol", + "wb", // kRegexpWordBoundary + "nwb", // kRegexpNoWordBoundary + "bot", + "eot", + "cc", + "match", +}; + +// Create string representation of regexp with explicit structure. +// Nothing pretty, just for testing. 
+static void DumpRegexpAppending(Regexp* re, string* s) { + if (re->op() < 0 || re->op() >= arraysize(kOpcodeNames)) { + StringAppendF(s, "op%d", re->op()); + } else { + switch (re->op()) { + default: + break; + case kRegexpStar: + case kRegexpPlus: + case kRegexpQuest: + case kRegexpRepeat: + if (re->parse_flags() & Regexp::NonGreedy) + s->append("n"); + break; + } + s->append(kOpcodeNames[re->op()]); + if (re->op() == kRegexpLiteral && (re->parse_flags() & Regexp::FoldCase)) { + Rune r = re->rune(); + if ('a' <= r && r <= 'z') + s->append("fold"); + } + if (re->op() == kRegexpLiteralString && (re->parse_flags() & Regexp::FoldCase)) { + for (int i = 0; i < re->nrunes(); i++) { + Rune r = re->runes()[i]; + if ('a' <= r && r <= 'z') { + s->append("fold"); + break; + } + } + } + } + s->append("{"); + switch (re->op()) { + default: + break; + case kRegexpEndText: + if (!(re->parse_flags() & Regexp::WasDollar)) { + s->append("\\z"); + } + break; + case kRegexpLiteral: { + Rune r = re->rune(); + char buf[UTFmax+1]; + buf[runetochar(buf, &r)] = 0; + s->append(buf); + break; + } + case kRegexpLiteralString: + for (int i = 0; i < re->nrunes(); i++) { + Rune r = re->runes()[i]; + char buf[UTFmax+1]; + buf[runetochar(buf, &r)] = 0; + s->append(buf); + } + break; + case kRegexpConcat: + case kRegexpAlternate: + for (int i = 0; i < re->nsub(); i++) + DumpRegexpAppending(re->sub()[i], s); + break; + case kRegexpStar: + case kRegexpPlus: + case kRegexpQuest: + DumpRegexpAppending(re->sub()[0], s); + break; + case kRegexpCapture: + if (re->name()) { + s->append(*re->name()); + s->append(":"); + } + DumpRegexpAppending(re->sub()[0], s); + break; + case kRegexpRepeat: + s->append(StringPrintf("%d,%d ", re->min(), re->max())); + DumpRegexpAppending(re->sub()[0], s); + break; + case kRegexpCharClass: { + string sep; + for (CharClass::iterator it = re->cc()->begin(); + it != re->cc()->end(); ++it) { + RuneRange rr = *it; + s->append(sep); + if (rr.lo == rr.hi) + 
s->append(StringPrintf("%#x", rr.lo)); + else + s->append(StringPrintf("%#x-%#x", rr.lo, rr.hi)); + sep = " "; + } + break; + } + } + s->append("}"); +} + +string Regexp::Dump() { + string s; + + // Make sure being called from a unit test. + if (FLAGS_test_tmpdir.empty()) { + LOG(ERROR) << "Cannot use except for testing."; + return s; + } + + DumpRegexpAppending(this, &s); + return s; +} + +} // namespace re2 diff --git a/re2/testing/exhaustive1_test.cc b/re2/testing/exhaustive1_test.cc new file mode 100644 index 0000000..9e057cc --- /dev/null +++ b/re2/testing/exhaustive1_test.cc @@ -0,0 +1,42 @@ +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Exhaustive testing of regular expression matching. + +#include "util/test.h" +#include "re2/testing/exhaustive_tester.h" + +DECLARE_string(regexp_engines); + +namespace re2 { + +// Test simple repetition operators +TEST(Repetition, Simple) { + vector ops = Split(" ", + "%s{0} %s{0,} %s{1} %s{1,} %s{0,1} %s{0,2} " + "%s{1,2} %s{2} %s{2,} %s{3,4} %s{4,5} " + "%s* %s+ %s? %s*? %s+? %s??"); + ExhaustiveTest(3, 2, Explode("abc."), ops, + 6, Explode("ab"), "(?:%s)", ""); + ExhaustiveTest(3, 2, Explode("abc."), ops, + 40, Explode("a"), "(?:%s)", ""); +} + +// Test capturing parens -- (a) -- inside repetition operators +TEST(Repetition, Capturing) { + vector ops = Split(" ", + "%s{0} %s{0,} %s{1} %s{1,} %s{0,1} %s{0,2} " + "%s{1,2} %s{2} %s{2,} %s{3,4} %s{4,5} " + "%s* %s+ %s? %s*? %s+? %s??"); + ExhaustiveTest(3, 2, Split(" ", "a (a) b"), ops, + 7, Explode("ab"), "(?:%s)", ""); + + // This would be a great test, but it runs forever when PCRE is enabled. 
+ if (strstr("PCRE", FLAGS_regexp_engines.c_str()) == NULL) + ExhaustiveTest(4, 3, Split(" ", "a (a)"), ops, + 100, Explode("a"), "(?:%s)", ""); +} + +} // namespace re2 + diff --git a/re2/testing/exhaustive2_test.cc b/re2/testing/exhaustive2_test.cc new file mode 100644 index 0000000..c5fec5b --- /dev/null +++ b/re2/testing/exhaustive2_test.cc @@ -0,0 +1,70 @@ +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Exhaustive testing of regular expression matching. + +#include "util/test.h" +#include "re2/re2.h" +#include "re2/testing/exhaustive_tester.h" + +DECLARE_string(regexp_engines); + +namespace re2 { + +// Test empty string matches (aka "(?:)") +TEST(EmptyString, Exhaustive) { + ExhaustiveTest(2, 2, Split(" ", "(?:) a"), + RegexpGenerator::EgrepOps(), + 5, Split("", "ab"), "", ""); +} + +// Test escaped versions of regexp syntax. +TEST(Punctuation, Literals) { + vector alphabet = Explode("()*+?{}[]\\^$."); + vector escaped = alphabet; + for (int i = 0; i < escaped.size(); i++) + escaped[i] = "\\" + escaped[i]; + ExhaustiveTest(1, 1, escaped, RegexpGenerator::EgrepOps(), + 2, alphabet, "", ""); +} + +// Test ^ $ . \A \z in presence of line endings. +// Have to wrap the empty-width ones in (?:) so that +// they can be repeated -- PCRE rejects ^* but allows (?:^)* +TEST(LineEnds, Exhaustive) { + ExhaustiveTest(2, 2, Split(" ", "(?:^) (?:$) . a \\n (?:\\A) (?:\\z)"), + RegexpGenerator::EgrepOps(), + 4, Explode("ab\n"), "", ""); +} + +// Test what does and does not match \n. +// This would be a good test, except that PCRE seems to have a bug: +// in single-byte character set mode (the default), +// [^a] matches \n, but in UTF-8 mode it does not. +// So when we run the test, the tester complains that +// we don't agree with PCRE, but it's PCRE that is at fault. 
+// For what it's worth, Perl gets this right (matches +// regardless of whether UTF-8 input is selected): +// +// #!/usr/bin/perl +// use POSIX qw(locale_h); +// print "matches in latin1\n" if "\n" =~ /[^a]/; +// setlocale("en_US.utf8"); +// print "matches in utf8\n" if "\n" =~ /[^a]/; +// +// The rule chosen for RE2 is that by default, like Perl, +// dot does not match \n but negated character classes [^a] do. +// (?s) will allow dot to match \n; there is no way in RE2 +// to stop [^a] from matching \n, though the underlying library +// provides a mechanism, and RE2 could add new syntax if needed. +// +// TEST(Newlines, Exhaustive) { +// vector empty_vector; +// ExhaustiveTest(1, 1, Split(" ", "\\n . a [^a]"), +// RegexpGenerator::EgrepOps(), +// 4, Explode("a\n"), ""); +// } + +} // namespace re2 + diff --git a/re2/testing/exhaustive3_test.cc b/re2/testing/exhaustive3_test.cc new file mode 100644 index 0000000..5613fcb --- /dev/null +++ b/re2/testing/exhaustive3_test.cc @@ -0,0 +1,94 @@ +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Exhaustive testing of regular expression matching. + +#include "util/test.h" +#include "re2/testing/exhaustive_tester.h" + +namespace re2 { + +// Test simple character classes by themselves. +TEST(CharacterClasses, Exhaustive) { + vector atoms = Split(" ", + "[a] [b] [ab] [^bc] [b-d] [^b-d] []a] [-a] [a-] [^-a] [a-b-c] a b ."); + ExhaustiveTest(2, 1, atoms, RegexpGenerator::EgrepOps(), + 5, Explode("ab"), "", ""); +} + +// Test simple character classes inside a___b (for example, a[a]b). 
+TEST(CharacterClasses, ExhaustiveAB) { + vector atoms = Split(" ", + "[a] [b] [ab] [^bc] [b-d] [^b-d] []a] [-a] [a-] [^-a] [a-b-c] a b ."); + ExhaustiveTest(2, 1, atoms, RegexpGenerator::EgrepOps(), + 5, Explode("ab"), "a%sb", ""); +} + +// Returns UTF8 for Rune r +static string UTF8(Rune r) { + char buf[UTFmax+1]; + buf[runetochar(buf, &r)] = 0; + return string(buf); +} + +// Returns a vector of "interesting" UTF8 characters. +// Unicode is now too big to just return all of them, +// so UTF8Characters return a set likely to be good test cases. +static const vector& InterestingUTF8() { + static bool init; + static vector v; + + if (init) + return v; + + init = true; + // All the Latin1 equivalents are interesting. + for (int i = 1; i < 256; i++) + v.push_back(UTF8(i)); + + // After that, the codes near bit boundaries are + // interesting, because they span byte sequence lengths. + for (int j = 0; j < 8; j++) + v.push_back(UTF8(256 + j)); + for (int i = 512; i < Runemax; i <<= 1) + for (int j = -8; j < 8; j++) + v.push_back(UTF8(i + j)); + + // The codes near Runemax, including Runemax itself, are interesting. + for (int j = -8; j <= 0; j++) + v.push_back(UTF8(Runemax + j)); + + return v; +} + +// Test interesting UTF-8 characters against character classes. +TEST(InterestingUTF8, SingleOps) { + vector atoms = Split(" ", + ". ^ $ \\a \\f \\n \\r \\t \\v \\d \\D \\s \\S \\w \\W \\b \\B " + "[[:alnum:]] [[:alpha:]] [[:blank:]] [[:cntrl:]] [[:digit:]] " + "[[:graph:]] [[:lower:]] [[:print:]] [[:punct:]] [[:space:]] " + "[[:upper:]] [[:xdigit:]] [\\s\\S] [\\d\\D] [^\\w\\W] [^\\d\\D]"); + vector ops; // no ops + ExhaustiveTest(1, 0, atoms, ops, + 1, InterestingUTF8(), "", ""); +} + +// Test interesting UTF-8 characters against character classes, +// but wrap everything inside AB. +TEST(InterestingUTF8, AB) { + vector atoms = Split(" ", + ". 
^ $ \\a \\f \\n \\r \\t \\v \\d \\D \\s \\S \\w \\W \\b \\B " + "[[:alnum:]] [[:alpha:]] [[:blank:]] [[:cntrl:]] [[:digit:]] " + "[[:graph:]] [[:lower:]] [[:print:]] [[:punct:]] [[:space:]] " + "[[:upper:]] [[:xdigit:]] [\\s\\S] [\\d\\D] [^\\w\\W] [^\\d\\D]"); + vector ops; // no ops + vector alpha = InterestingUTF8(); + for (int i = 0; i < alpha.size(); i++) + alpha[i] = "a" + alpha[i] + "b"; + ExhaustiveTest(1, 0, atoms, ops, + 1, alpha, "a%sb", ""); +} + +} // namespace re2 + diff --git a/re2/testing/exhaustive_test.cc b/re2/testing/exhaustive_test.cc new file mode 100644 index 0000000..fc40dee --- /dev/null +++ b/re2/testing/exhaustive_test.cc @@ -0,0 +1,38 @@ +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Exhaustive testing of regular expression matching. + +#include "util/test.h" +#include "re2/testing/exhaustive_tester.h" + +namespace re2 { + +DECLARE_string(regexp_engines); + +// Test very simple expressions. +TEST(EgrepLiterals, Lowercase) { + EgrepTest(3, 2, "abc.", 3, "abc", ""); +} + +// Test mixed-case expressions. +TEST(EgrepLiterals, MixedCase) { + EgrepTest(3, 2, "AaBb.", 2, "AaBb", ""); +} + +// Test mixed-case in case-insensitive mode. +TEST(EgrepLiterals, FoldCase) { + // The punctuation characters surround A-Z and a-z + // in the ASCII table. This looks for bugs in the + // bytemap range code in the DFA. + EgrepTest(3, 2, "abAB.", 2, "aBc@_~", "(?i:%s)"); +} + +// Test very simple expressions. +TEST(EgrepLiterals, UTF8) { + EgrepTest(3, 2, "ab.", 4, "a\xE2\x98\xBA", ""); +} + +} // namespace re2 + diff --git a/re2/testing/exhaustive_tester.cc b/re2/testing/exhaustive_tester.cc new file mode 100644 index 0000000..54de857 --- /dev/null +++ b/re2/testing/exhaustive_tester.cc @@ -0,0 +1,188 @@ +// Copyright 2008 The RE2 Authors. All Rights Reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Exhaustive testing of regular expression matching. + +// Each test picks an alphabet (e.g., "abc"), a maximum string length, +// a maximum regular expression length, and a maximum number of letters +// that can appear in the regular expression. Given these parameters, +// it tries every possible regular expression and string, verifying that +// the NFA, DFA, and a trivial backtracking implementation agree about +// the location of the match. + +#include +#include + +#ifndef LOGGING +#define LOGGING 0 +#endif + +#include "util/test.h" +#include "re2/testing/exhaustive_tester.h" +#include "re2/testing/tester.h" + +DEFINE_bool(show_regexps, false, "show regexps during testing"); + +DEFINE_int32(max_bad_regexp_inputs, 1, + "Stop testing a regular expression after finding this many " + "strings that break it."); + +// Compiled in debug mode, the usual tests run for over an hour. +// Have to cut it down to make the unit test machines happy. 
+DEFINE_bool(quick_debug_mode, true, "Run fewer tests in debug mode."); + +namespace re2 { + +static char* escape(const StringPiece& sp) { + static char buf[512]; + char* p = buf; + *p++ = '\"'; + for (int i = 0; i < sp.size(); i++) { + if(p+5 >= buf+sizeof buf) + LOG(FATAL) << "ExhaustiveTester escape: too long"; + if(sp[i] == '\\' || sp[i] == '\"') { + *p++ = '\\'; + *p++ = sp[i]; + } else if(sp[i] == '\n') { + *p++ = '\\'; + *p++ = 'n'; + } else { + *p++ = sp[i]; + } + } + *p++ = '\"'; + *p = '\0'; + return buf; +} + +static void PrintResult(const RE2& re, const StringPiece& input, RE2::Anchor anchor, StringPiece *m, int n) { + if (!re.Match(input, 0, input.size(), anchor, m, n)) { + printf("-"); + return; + } + for (int i = 0; i < n; i++) { + if (i > 0) + printf(" "); + if (m[i].begin() == NULL) + printf("-"); + else + printf("%d-%d", static_cast(m[i].begin() - input.begin()), static_cast(m[i].end() - input.begin())); + } +} + +// Processes a single generated regexp. +// Compiles it using Regexp interface and PCRE, and then +// checks that NFA, DFA, and PCRE all return the same results. +void ExhaustiveTester::HandleRegexp(const string& const_regexp) { + regexps_++; + string regexp = const_regexp; + if (!topwrapper_.empty()) + regexp = StringPrintf(topwrapper_.c_str(), regexp.c_str()); + + if (FLAGS_show_regexps) { + printf("\r%s", regexp.c_str()); + fflush(stdout); + } + + if (LOGGING) { + // Write out test cases and answers for use in testing + // other implementations, such as Go's regexp package. 
+ if (randomstrings_) + LOG(ERROR) << "Cannot log with random strings."; + if (regexps_ == 1) { // first + printf("strings\n"); + strgen_.Reset(); + while (strgen_.HasNext()) + printf("%s\n", escape(strgen_.Next())); + printf("regexps\n"); + } + printf("%s\n", escape(regexp)); + + RE2 re(regexp); + RE2::Options longest; + longest.set_longest_match(true); + RE2 relongest(regexp, longest); + int ngroup = re.NumberOfCapturingGroups()+1; + StringPiece* group = new StringPiece[ngroup]; + + strgen_.Reset(); + while (strgen_.HasNext()) { + StringPiece input = strgen_.Next(); + PrintResult(re, input, RE2::ANCHOR_BOTH, group, ngroup); + printf(";"); + PrintResult(re, input, RE2::UNANCHORED, group, ngroup); + printf(";"); + PrintResult(relongest, input, RE2::ANCHOR_BOTH, group, ngroup); + printf(";"); + PrintResult(relongest, input, RE2::UNANCHORED, group, ngroup); + printf("\n"); + } + delete[] group; + return; + } + + Tester tester(regexp); + if (tester.error()) + return; + + strgen_.Reset(); + strgen_.GenerateNULL(); + if (randomstrings_) + strgen_.Random(stringseed_, stringcount_); + int bad_inputs = 0; + while (strgen_.HasNext()) { + tests_++; + if (!tester.TestInput(strgen_.Next())) { + failures_++; + if (++bad_inputs >= FLAGS_max_bad_regexp_inputs) + break; + } + } +} + +// Runs an exhaustive test on the given parameters. 
+void ExhaustiveTest(int maxatoms, int maxops, + const vector& alphabet, + const vector& ops, + int maxstrlen, const vector& stralphabet, + const string& wrapper, + const string& topwrapper) { + if (DEBUG_MODE && FLAGS_quick_debug_mode) { + if (maxatoms > 1) + maxatoms--; + if (maxops > 1) + maxops--; + if (maxstrlen > 1) + maxstrlen--; + } + ExhaustiveTester t(maxatoms, maxops, alphabet, ops, + maxstrlen, stralphabet, wrapper, + topwrapper); + t.Generate(); + if (!LOGGING) { + printf("%d regexps, %d tests, %d failures [%d/%d str]\n", + t.regexps(), t.tests(), t.failures(), maxstrlen, (int)stralphabet.size()); + } + EXPECT_EQ(0, t.failures()); +} + +// Runs an exhaustive test using the given parameters and +// the basic egrep operators. +void EgrepTest(int maxatoms, int maxops, const string& alphabet, + int maxstrlen, const string& stralphabet, + const string& wrapper) { + const char* tops[] = { "", "^(?:%s)", "(?:%s)$", "^(?:%s)$" }; + + for (int i = 0; i < arraysize(tops); i++) { + ExhaustiveTest(maxatoms, maxops, + Split("", alphabet), + RegexpGenerator::EgrepOps(), + maxstrlen, + Split("", stralphabet), + wrapper, + tops[i]); + } +} + +} // namespace re2 diff --git a/re2/testing/exhaustive_tester.h b/re2/testing/exhaustive_tester.h new file mode 100644 index 0000000..38a139f --- /dev/null +++ b/re2/testing/exhaustive_tester.h @@ -0,0 +1,85 @@ +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef RE2_TESTING_EXHAUSTIVE_TESTER_H__ +#define RE2_TESTING_EXHAUSTIVE_TESTER_H__ + +#include +#include +#include "util/util.h" +#include "re2/testing/regexp_generator.h" +#include "re2/testing/string_generator.h" + +namespace re2 { + +// Exhaustive regular expression test: generate all regexps within parameters, +// then generate all strings of a given length over a given alphabet, +// then check that NFA, DFA, and PCRE agree about whether each regexp matches +// each possible string, and if so, where the match is. +// +// Can also be used in a "random" mode that generates a given number +// of random regexp and strings, allowing testing of larger expressions +// and inputs. +class ExhaustiveTester : public RegexpGenerator { + public: + ExhaustiveTester(int maxatoms, + int maxops, + const vector& alphabet, + const vector& ops, + int maxstrlen, + const vector& stralphabet, + const string& wrapper, + const string& topwrapper) + : RegexpGenerator(maxatoms, maxops, alphabet, ops), + strgen_(maxstrlen, stralphabet), + wrapper_(wrapper), + topwrapper_(topwrapper), + regexps_(0), tests_(0), failures_(0), + randomstrings_(0), stringseed_(0), stringcount_(0) { } + + int regexps() { return regexps_; } + int tests() { return tests_; } + int failures() { return failures_; } + + // Needed for RegexpGenerator interface. + void HandleRegexp(const string& regexp); + + // Causes testing to generate random input strings. + void RandomStrings(int32 seed, int32 count) { + randomstrings_ = true; + stringseed_ = seed; + stringcount_ = count; + } + + private: + StringGenerator strgen_; + string wrapper_; // Regexp wrapper - either empty or has one %s. + string topwrapper_; // Regexp top-level wrapper. + int regexps_; // Number of HandleRegexp calls + int tests_; // Number of regexp tests. + int failures_; // Number of tests failed. + + bool randomstrings_; // Whether to use random strings + int32 stringseed_; // If so, the seed. + int stringcount_; // If so, how many to generate. 
+ DISALLOW_EVIL_CONSTRUCTORS(ExhaustiveTester); +}; + +// Runs an exhaustive test on the given parameters. +void ExhaustiveTest(int maxatoms, int maxops, + const vector& alphabet, + const vector& ops, + int maxstrlen, const vector& stralphabet, + const string& wrapper, + const string& topwrapper); + +// Runs an exhaustive test using the given parameters and +// the basic egrep operators. +void EgrepTest(int maxatoms, int maxops, const string& alphabet, + int maxstrlen, const string& stralphabet, + const string& wrapper); + +} // namespace re2 + +#endif // RE2_TESTING_EXHAUSTIVE_TESTER_H__ diff --git a/re2/testing/filtered_re2_test.cc b/re2/testing/filtered_re2_test.cc new file mode 100644 index 0000000..e3a0dd1 --- /dev/null +++ b/re2/testing/filtered_re2_test.cc @@ -0,0 +1,275 @@ +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "util/test.h" +#include "re2/filtered_re2.h" +#include "re2/re2.h" + +DECLARE_int32(filtered_re2_min_atom_len); // From prefilter_tree.cc + +namespace re2 { + +struct FilterTestVars { + vector atoms; + vector atom_indices; + vector matches; + RE2::Options opts; + FilteredRE2 f; +}; + +TEST(FilteredRE2Test, EmptyTest) { + FilterTestVars v; + v.f.AllMatches("foo", v.atom_indices, &v.matches); + EXPECT_EQ(0, v.matches.size()); +} + +TEST(FilteredRE2Test, SmallOrTest) { + FLAGS_filtered_re2_min_atom_len = 4; + + FilterTestVars v; + int id; + v.f.Add("(foo|bar)", v.opts, &id); + + v.f.Compile(&v.atoms); + EXPECT_EQ(0, v.atoms.size()); + + v.f.AllMatches("lemurs bar", v.atom_indices, &v.matches); + EXPECT_EQ(1, v.matches.size()); + EXPECT_EQ(id, v.matches[0]); +} + +TEST(FilteredRE2Test, SmallLatinTest) { + FLAGS_filtered_re2_min_atom_len = 3; + FilterTestVars v; + int id; + + v.opts.set_utf8(false); + v.f.Add("\xde\xadQ\xbe\xef", v.opts, &id); + v.f.Compile(&v.atoms); + EXPECT_EQ(1, v.atoms.size()); + 
EXPECT_EQ(v.atoms[0], "\xde\xadq\xbe\xef"); + + v.atom_indices.push_back(0); + v.f.AllMatches("foo\xde\xadQ\xbe\xeflemur", v.atom_indices, &v.matches); + EXPECT_EQ(1, v.matches.size()); + EXPECT_EQ(id, v.matches[0]); +} + +struct AtomTest { + const char* testname; + // If any test needs more than this many regexps or atoms, increase + // the size of the corresponding array. + const char* regexps[20]; + const char* atoms[20]; +}; + +AtomTest atom_tests[] = { + { + // This test checks to make sure empty patterns are allowed. + "CheckEmptyPattern", + {""}, + {} + }, { + // This test checks that all atoms of length greater than min length + // are found, and no atoms that are of smaller length are found. + "AllAtomsGtMinLengthFound", { + "(abc123|def456|ghi789).*mnop[x-z]+", + "abc..yyy..zz", + "mnmnpp[a-z]+PPP" + }, { + "abc123", + "def456", + "ghi789", + "mnop", + "abc", + "yyy", + "mnmnpp", + "ppp" + } + }, { + // Test to make sure that any atoms that have another atom as a + // substring in an OR are removed; that is, only the shortest + // substring is kept. + "SubstrAtomRemovesSuperStrInOr", { + "(abc123|abc|ghi789|abc1234).*[x-z]+", + "abcd..yyy..yyyzzz", + "mnmnpp[a-z]+PPP" + }, { + "abc", + "ghi789", + "abcd", + "yyy", + "yyyzzz", + "mnmnpp", + "ppp" + } + }, { + // Test character class expansion. + "CharClassExpansion", { + "m[a-c][d-f]n.*[x-z]+", + "[x-y]bcde[ab]" + }, { + "madn", "maen", "mafn", + "mbdn", "mben", "mbfn", + "mcdn", "mcen", "mcfn", + "xbcdea", "xbcdeb", + "ybcdea", "ybcdeb" + } + }, { + // Test upper/lower of non-ASCII. 
+ "UnicodeLower", { + "(?i)ΔδΠϖπΣςσ", + "ΛΜΝΟΠ", + "ψρστυ", + }, { + "δδπππσσσ", + "λμνοπ", + "ψρστυ", + }, + }, +}; + +void AddRegexpsAndCompile(const char* regexps[], + int n, + struct FilterTestVars* v) { + for (int i = 0; i < n; i++) { + int id; + v->f.Add(regexps[i], v->opts, &id); + } + v->f.Compile(&v->atoms); +} + +bool CheckExpectedAtoms(const char* atoms[], + int n, + const char* testname, + struct FilterTestVars* v) { + vector expected; + for (int i = 0; i < n; i++) + expected.push_back(atoms[i]); + + bool pass = expected.size() == v->atoms.size(); + + sort(v->atoms.begin(), v->atoms.end()); + sort(expected.begin(), expected.end()); + for (int i = 0; pass && i < n; i++) + pass = pass && expected[i] == v->atoms[i]; + + if (!pass) { + LOG(WARNING) << "Failed " << testname; + LOG(WARNING) << "Expected #atoms = " << expected.size(); + for (int i = 0; i < expected.size(); i++) + LOG(WARNING) << expected[i]; + LOG(WARNING) << "Found #atoms = " << v->atoms.size(); + for (int i = 0; i < v->atoms.size(); i++) + LOG(WARNING) << v->atoms[i]; + } + + return pass; +} + +TEST(FilteredRE2Test, AtomTests) { + FLAGS_filtered_re2_min_atom_len = 3; + + int nfail = 0; + for (int i = 0; i < arraysize(atom_tests); i++) { + FilterTestVars v; + AtomTest* t = &atom_tests[i]; + int natom, nregexp; + for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++) + if (t->regexps[nregexp] == NULL) + break; + for (natom = 0; natom < arraysize(t->atoms); natom++) + if (t->atoms[natom] == NULL) + break; + AddRegexpsAndCompile(t->regexps, nregexp, &v); + if (!CheckExpectedAtoms(t->atoms, natom, t->testname, &v)) + nfail++; + } + EXPECT_EQ(0, nfail); +} + +void FindAtomIndices(const vector atoms, + const vector matched_atoms, + vector* atom_indices) { + atom_indices->clear(); + for (int i = 0; i < matched_atoms.size(); i++) { + int j = 0; + for (; j < atoms.size(); j++) { + if (matched_atoms[i] == atoms[j]) { + atom_indices->push_back(j); + break; + } + EXPECT_LT(j, atoms.size()); + } + 
} +} + +TEST(FilteredRE2Test, MatchEmptyPattern) { + FLAGS_filtered_re2_min_atom_len = 3; + + FilterTestVars v; + AtomTest* t = &atom_tests[0]; + // We are using the regexps used in one of the atom tests + // for this test. Adding the EXPECT here to make sure + // the index we use for the test is for the correct test. + EXPECT_EQ("CheckEmptyPattern", string(t->testname)); + int nregexp; + for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++) + if (t->regexps[nregexp] == NULL) + break; + AddRegexpsAndCompile(t->regexps, nregexp, &v); + string text = "0123"; + vector atom_ids; + vector matching_regexps; + EXPECT_EQ(0, v.f.FirstMatch(text, atom_ids)); +} + +TEST(FilteredRE2Test, MatchTests) { + FLAGS_filtered_re2_min_atom_len = 3; + + FilterTestVars v; + AtomTest* t = &atom_tests[2]; + // We are using the regexps used in one of the atom tests + // for this test. + EXPECT_EQ("SubstrAtomRemovesSuperStrInOr", string(t->testname)); + int nregexp; + for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++) + if (t->regexps[nregexp] == NULL) + break; + AddRegexpsAndCompile(t->regexps, nregexp, &v); + + string text = "abc121212xyz"; + // atoms = abc + vector atom_ids; + vector atoms; + atoms.push_back("abc"); + FindAtomIndices(v.atoms, atoms, &atom_ids); + vector matching_regexps; + v.f.AllMatches(text, atom_ids, &matching_regexps); + EXPECT_EQ(1, matching_regexps.size()); + + text = "abc12312yyyzzz"; + atoms.clear(); + atoms.push_back("abc"); + atoms.push_back("yyy"); + atoms.push_back("yyyzzz"); + FindAtomIndices(v.atoms, atoms, &atom_ids); + v.f.AllMatches(text, atom_ids, &matching_regexps); + EXPECT_EQ(1, matching_regexps.size()); + + text = "abcd12yyy32yyyzzz"; + atoms.clear(); + atoms.push_back("abc"); + atoms.push_back("abcd"); + atoms.push_back("yyy"); + atoms.push_back("yyyzzz"); + FindAtomIndices(v.atoms, atoms, &atom_ids); + LOG(INFO) << "S: " << atom_ids.size(); + for (int i = 0; i < atom_ids.size(); i++) + LOG(INFO) << "i: " << i << " : " << 
atom_ids[i]; + v.f.AllMatches(text, atom_ids, &matching_regexps); + EXPECT_EQ(2, matching_regexps.size()); +} + +} // namespace re2 diff --git a/re2/testing/mimics_pcre_test.cc b/re2/testing/mimics_pcre_test.cc new file mode 100644 index 0000000..f965092 --- /dev/null +++ b/re2/testing/mimics_pcre_test.cc @@ -0,0 +1,76 @@ +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "util/test.h" +#include "re2/prog.h" +#include "re2/regexp.h" + +namespace re2 { + +struct PCRETest { + const char* regexp; + bool should_match; +}; + +static PCRETest tests[] = { + // Most things should behave exactly. + { "abc", true }, + { "(a|b)c", true }, + { "(a*|b)c", true }, + { "(a|b*)c", true }, + { "a(b|c)d", true }, + { "a(()|())c", true }, + { "ab*c", true }, + { "ab+c", true }, + { "a(b*|c*)d", true }, + { "\\W", true }, + { "\\W{1,2}", true }, + { "\\d", true }, + + // Check that repeated empty strings do not. + { "(a*)*", false }, + { "x(a*)*y", false }, + { "(a*)+", false }, + { "(a+)*", true }, + { "(a+)+", true }, + { "(a+)+", true }, + + // \v is the only character class that shouldn't. + { "\\b", true }, + { "\\v", false }, + { "\\d", true }, + + // The handling of ^ in multi-line mode is different, as is + // the handling of $ in single-line mode. (Both involve + // boundary cases if the string ends with \n.) 
+ { "\\A", true }, + { "\\z", true }, + { "(?m)^", false }, + { "(?m)$", true }, + { "(?-m)^", true }, + { "(?-m)$", false }, // In PCRE, == \Z + { "(?m)\\A", true }, + { "(?m)\\z", true }, + { "(?-m)\\A", true }, + { "(?-m)\\z", true }, +}; + +TEST(MimicsPCRE, SimpleTests) { + for (int i = 0; i < arraysize(tests); i++) { + const PCRETest& t = tests[i]; + for (int j = 0; j < 2; j++) { + Regexp::ParseFlags flags = Regexp::LikePerl; + if (j == 0) + flags = flags | Regexp::Latin1; + Regexp* re = Regexp::Parse(t.regexp, flags, NULL); + CHECK(re) << " " << t.regexp; + CHECK_EQ(t.should_match, re->MimicsPCRE()) + << " " << t.regexp << " " + << (j==0 ? "latin1" : "utf"); + re->Decref(); + } + } +} + +} // namespace re2 diff --git a/re2/testing/null_walker.cc b/re2/testing/null_walker.cc new file mode 100644 index 0000000..09b53cb --- /dev/null +++ b/re2/testing/null_walker.cc @@ -0,0 +1,44 @@ +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "util/test.h" +#include "re2/regexp.h" +#include "re2/walker-inl.h" + +namespace re2 { + +// Null walker. For benchmarking the walker itself. + +class NullWalker : public Regexp::Walker { + public: + NullWalker() { } + bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, + bool* child_args, int nchild_args); + + bool ShortVisit(Regexp* re, bool a) { + // Should never be called: we use Walk not WalkExponential. + LOG(DFATAL) << "NullWalker::ShortVisit called"; + return a; + } + + private: + DISALLOW_EVIL_CONSTRUCTORS(NullWalker); +}; + +// Called after visiting re's children. child_args contains the return +// value from each of the children's PostVisits (i.e., whether each child +// can match an empty string). Returns whether this clause can match an +// empty string. 
+bool NullWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg, + bool* child_args, int nchild_args) { + return false; +} + +// Returns whether re can match an empty string. +void Regexp::NullWalk() { + NullWalker w; + w.Walk(this, false); +} + +} // namespace re2 diff --git a/re2/testing/parse_test.cc b/re2/testing/parse_test.cc new file mode 100644 index 0000000..f67b477 --- /dev/null +++ b/re2/testing/parse_test.cc @@ -0,0 +1,433 @@ +// Copyright 2006 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Test parse.cc, dump.cc, and tostring.cc. + +#include +#include +#include "util/test.h" +#include "re2/regexp.h" + +namespace re2 { + +static const Regexp::ParseFlags TestZeroFlags = Regexp::ParseFlags(1<<30); + +struct Test { + const char* regexp; + const char* parse; + Regexp::ParseFlags flags; +}; + +static Regexp::ParseFlags kTestFlags = Regexp::MatchNL | + Regexp::PerlX | + Regexp::PerlClasses | + Regexp::UnicodeGroups; + +static Test tests[] = { + // Base cases + { "a", "lit{a}" }, + { "a.", "cat{lit{a}dot{}}" }, + { "a.b", "cat{lit{a}dot{}lit{b}}" }, + { "ab", "str{ab}" }, + { "a.b.c", "cat{lit{a}dot{}lit{b}dot{}lit{c}}" }, + { "abc", "str{abc}" }, + { "a|^", "alt{lit{a}bol{}}" }, + { "a|b", "cc{0x61-0x62}" }, + { "(a)", "cap{lit{a}}" }, + { "(a)|b", "alt{cap{lit{a}}lit{b}}" }, + { "a*", "star{lit{a}}" }, + { "a+", "plus{lit{a}}" }, + { "a?", "que{lit{a}}" }, + { "a{2}", "rep{2,2 lit{a}}" }, + { "a{2,3}", "rep{2,3 lit{a}}" }, + { "a{2,}", "rep{2,-1 lit{a}}" }, + { "a*?", "nstar{lit{a}}" }, + { "a+?", "nplus{lit{a}}" }, + { "a??", "nque{lit{a}}" }, + { "a{2}?", "nrep{2,2 lit{a}}" }, + { "a{2,3}?", "nrep{2,3 lit{a}}" }, + { "a{2,}?", "nrep{2,-1 lit{a}}" }, + { "", "emp{}" }, + { "|", "emp{}" }, // alt{emp{}emp{}} but got factored + { "|x|", "alt{emp{}lit{x}emp{}}" }, + { ".", "dot{}" }, + { "^", "bol{}" }, + { "$", "eol{}" }, + { "\\|", "lit{|}" }, + { 
"\\(", "lit{(}" }, + { "\\)", "lit{)}" }, + { "\\*", "lit{*}" }, + { "\\+", "lit{+}" }, + { "\\?", "lit{?}" }, + { "{", "lit{{}" }, + { "}", "lit{}}" }, + { "\\.", "lit{.}" }, + { "\\^", "lit{^}" }, + { "\\$", "lit{$}" }, + { "\\\\", "lit{\\}" }, + { "[ace]", "cc{0x61 0x63 0x65}" }, + { "[abc]", "cc{0x61-0x63}" }, + { "[a-z]", "cc{0x61-0x7a}" }, + { "[a]", "lit{a}" }, + { "\\-", "lit{-}" }, + { "-", "lit{-}" }, + { "\\_", "lit{_}" }, + + // Posix and Perl extensions + { "[[:lower:]]", "cc{0x61-0x7a}" }, + { "[a-z]", "cc{0x61-0x7a}" }, + { "[^[:lower:]]", "cc{0-0x60 0x7b-0x10ffff}" }, + { "[[:^lower:]]", "cc{0-0x60 0x7b-0x10ffff}" }, + { "(?i)[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" }, + { "(?i)[a-z]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" }, + { "(?i)[^[:lower:]]", "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" }, + { "(?i)[[:^lower:]]", "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" }, + { "\\d", "cc{0x30-0x39}" }, + { "\\D", "cc{0-0x2f 0x3a-0x10ffff}" }, + { "\\s", "cc{0x9-0xa 0xc-0xd 0x20}" }, + { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}" }, + { "\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}" }, + { "\\W", "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}" }, + { "(?i)\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}" }, + { "(?i)\\W", "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" }, + { "[^\\\\]", "cc{0-0x5b 0x5d-0x10ffff}" }, + { "\\C", "byte{}" }, + + // Unicode, negatives, and a double negative. + { "\\p{Braille}", "cc{0x2800-0x28ff}" }, + { "\\P{Braille}", "cc{0-0x27ff 0x2900-0x10ffff}" }, + { "\\p{^Braille}", "cc{0-0x27ff 0x2900-0x10ffff}" }, + { "\\P{^Braille}", "cc{0x2800-0x28ff}" }, + + // More interesting regular expressions. 
+ { "a{,2}", "str{a{,2}}" }, + { "\\.\\^\\$\\\\", "str{.^$\\}" }, + { "[a-zABC]", "cc{0x41-0x43 0x61-0x7a}" }, + { "[^a]", "cc{0-0x60 0x62-0x10ffff}" }, + { "[\xce\xb1-\xce\xb5\xe2\x98\xba]", "cc{0x3b1-0x3b5 0x263a}" }, // utf-8 + { "a*{", "cat{star{lit{a}}lit{{}}" }, + + // Test precedences + { "(?:ab)*", "star{str{ab}}" }, + { "(ab)*", "star{cap{str{ab}}}" }, + { "ab|cd", "alt{str{ab}str{cd}}" }, + { "a(b|c)d", "cat{lit{a}cap{cc{0x62-0x63}}lit{d}}" }, + + // Test flattening. + { "(?:a)", "lit{a}" }, + { "(?:ab)(?:cd)", "str{abcd}" }, + { "(?:a|b)|(?:c|d)", "cc{0x61-0x64}" }, + { "a|.", "dot{}" }, + { ".|a", "dot{}" }, + + // Test Perl quoted literals + { "\\Q+|*?{[\\E", "str{+|*?{[}" }, + { "\\Q+\\E+", "plus{lit{+}}" }, + { "\\Q\\\\E", "lit{\\}" }, + { "\\Q\\\\\\E", "str{\\\\}" }, + + // Test Perl \A and \z + { "(?m)^", "bol{}" }, + { "(?m)$", "eol{}" }, + { "(?-m)^", "bot{}" }, + { "(?-m)$", "eot{}" }, + { "(?m)\\A", "bot{}" }, + { "(?m)\\z", "eot{\\z}" }, + { "(?-m)\\A", "bot{}" }, + { "(?-m)\\z", "eot{\\z}" }, + + // Test named captures + { "(?Pa)", "cap{name:lit{a}}" }, + + // Case-folded literals + { "[Aa]", "litfold{a}" }, + + // Strings + { "abcde", "str{abcde}" }, + { "[Aa][Bb]cd", "cat{strfold{ab}str{cd}}" }, + + // Reported bug involving \n leaking in despite use of NeverNL. 
+ { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", TestZeroFlags }, + { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::FoldCase }, + { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, + { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, + { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", TestZeroFlags }, + { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::FoldCase }, + { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, + { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, + { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", TestZeroFlags }, + { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::FoldCase }, + { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, + { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, + { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", TestZeroFlags }, + { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::FoldCase }, + { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, + { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, + { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", TestZeroFlags }, + { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::FoldCase }, + { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, + { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, + { "[^ \r\f\v]", "cc{0-0x9 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, + { "[^ \r\f\v]", "cc{0-0x9 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, + { "[^ \r\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, + { "[^ \r\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, + { "[^ \r\n\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, + { "[^ \r\n\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | 
Regexp::FoldCase }, + { "[^ \r\n\f\t]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, + { "[^ \r\n\f\t]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, + { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", + Regexp::PerlClasses }, + { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", + Regexp::PerlClasses | Regexp::FoldCase }, + { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", + Regexp::PerlClasses | Regexp::NeverNL }, + { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", + Regexp::PerlClasses | Regexp::NeverNL | Regexp::FoldCase }, + { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", + Regexp::PerlClasses }, + { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", + Regexp::PerlClasses | Regexp::FoldCase }, + { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", + Regexp::PerlClasses | Regexp::NeverNL }, + { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", + Regexp::PerlClasses | Regexp::NeverNL | Regexp::FoldCase }, +}; + +bool RegexpEqualTestingOnly(Regexp* a, Regexp* b) { + return Regexp::Equal(a, b); +} + +void TestParse(const Test* tests, int ntests, Regexp::ParseFlags flags, + const string& title) { + Regexp** re = new Regexp*[ntests]; + for (int i = 0; i < ntests; i++) { + RegexpStatus status; + Regexp::ParseFlags f = flags; + if (tests[i].flags != 0) { + f = tests[i].flags & ~TestZeroFlags; + } + re[i] = Regexp::Parse(tests[i].regexp, f, &status); + CHECK(re[i] != NULL) << " " << tests[i].regexp << " " + << status.Text(); + string s = re[i]->Dump(); + EXPECT_EQ(string(tests[i].parse), s) << "Regexp: " << tests[i].regexp + << "\nparse: " << tests[i].parse << " s: " << s << " flag=" << f; + } + + for (int i = 0; i < ntests; i++) { + for (int j = 0; j < ntests; j++) { + EXPECT_EQ(string(tests[i].parse) == tests[j].parse, + RegexpEqualTestingOnly(re[i], re[j])) + << "Regexp: " << tests[i].regexp << " " << tests[j].regexp; + } + } + + for (int i = 0; i < ntests; i++) + re[i]->Decref(); + delete[] 
re; +} + +// Test that regexps parse to expected structures. +TEST(TestParse, SimpleRegexps) { + TestParse(tests, arraysize(tests), kTestFlags, "simple"); +} + +Test foldcase_tests[] = { + { "AbCdE", "strfold{abcde}" }, + { "[Aa]", "litfold{a}" }, + { "a", "litfold{a}" }, + + // 0x17F is an old English long s (looks like an f) and folds to s. + // 0x212A is the Kelvin symbol and folds to k. + { "A[F-g]", "cat{litfold{a}cc{0x41-0x7a 0x17f 0x212a}}" }, // [Aa][A-z...] + { "[[:upper:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" }, + { "[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" }, +}; + +// Test that parsing with FoldCase works. +TEST(TestParse, FoldCase) { + TestParse(foldcase_tests, arraysize(foldcase_tests), Regexp::FoldCase, "foldcase"); +} + +Test literal_tests[] = { + { "(|)^$.[*+?]{5,10},\\", "str{(|)^$.[*+?]{5,10},\\}" }, +}; + +// Test that parsing with Literal works. +TEST(TestParse, Literal) { + TestParse(literal_tests, arraysize(literal_tests), Regexp::Literal, "literal"); +} + +Test matchnl_tests[] = { + { ".", "dot{}" }, + { "\n", "lit{\n}" }, + { "[^a]", "cc{0-0x60 0x62-0x10ffff}" }, + { "[a\\n]", "cc{0xa 0x61}" }, +}; + +// Test that parsing with MatchNL works. +// (Also tested above during simple cases.) +TEST(TestParse, MatchNL) { + TestParse(matchnl_tests, arraysize(matchnl_tests), Regexp::MatchNL, "with MatchNL"); +} + +Test nomatchnl_tests[] = { + { ".", "cc{0-0x9 0xb-0x10ffff}" }, + { "\n", "lit{\n}" }, + { "[^a]", "cc{0-0x9 0xb-0x60 0x62-0x10ffff}" }, + { "[a\\n]", "cc{0xa 0x61}" }, +}; + +// Test that parsing without MatchNL works. 
+TEST(TestParse, NoMatchNL) { + TestParse(nomatchnl_tests, arraysize(nomatchnl_tests), Regexp::NoParseFlags, "without MatchNL"); +} + +Test prefix_tests[] = { + { "abc|abd", "cat{str{ab}cc{0x63-0x64}}" }, + { "a(?:b)c|abd", "cat{str{ab}cc{0x63-0x64}}" }, + { "abc|abd|aef|bcx|bcy", + "alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}" + "cat{str{bc}cc{0x78-0x79}}}" }, + { "abc|x|abd", "alt{str{abc}lit{x}str{abd}}" }, + { "(?i)abc|ABD", "cat{strfold{ab}cc{0x43-0x44 0x63-0x64}}" }, + { "[ab]c|[ab]d", "cat{cc{0x61-0x62}cc{0x63-0x64}}" }, + { "(?:xx|yy)c|(?:xx|yy)d", + "cat{alt{str{xx}str{yy}}cc{0x63-0x64}}" }, + { "x{2}|x{2}[0-9]", + "cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}" }, + { "x{2}y|x{2}[0-9]y", + "cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}" }, +}; + +// Test that prefix factoring works. +TEST(TestParse, Prefix) { + TestParse(prefix_tests, arraysize(prefix_tests), Regexp::PerlX, "prefix"); +} + +// Invalid regular expressions +const char* badtests[] = { + "(", + ")", + "(a", + "(a|b|", + "(a|b", + "[a-z", + "([a-z)", + "x{1001}", + "\xff", // Invalid UTF-8 + "[\xff]", + "[\\\xff]", + "\\\xff", + "(?Pa", + "(?P", + "(?Pa)", + "(?P<>a)", + "[a-Z]", + "(?i)[a-Z]", + "a{100000}", + "a{100000,}", +}; + +// Valid in Perl, bad in POSIX +const char* only_perl[] = { + "[a-b-c]", + "\\Qabc\\E", + "\\Q*+?{[\\E", + "\\Q\\\\E", + "\\Q\\\\\\E", + "\\Q\\\\\\\\E", + "\\Q\\\\\\\\\\E", + "(?:a)", + "(?Pa)", +}; + +// Valid in POSIX, bad in Perl. +const char* only_posix[] = { + "a++", + "a**", + "a?*", + "a+*", + "a{1}*", +}; + +// Test that parser rejects bad regexps. 
+TEST(TestParse, InvalidRegexps) { + for (int i = 0; i < arraysize(badtests); i++) { + CHECK(Regexp::Parse(badtests[i], Regexp::PerlX, NULL) == NULL) + << " " << badtests[i]; + CHECK(Regexp::Parse(badtests[i], Regexp::NoParseFlags, NULL) == NULL) + << " " << badtests[i]; + } + for (int i = 0; i < arraysize(only_posix); i++) { + CHECK(Regexp::Parse(only_posix[i], Regexp::PerlX, NULL) == NULL) + << " " << only_posix[i]; + Regexp* re = Regexp::Parse(only_posix[i], Regexp::NoParseFlags, NULL); + CHECK(re) << " " << only_posix[i]; + re->Decref(); + } + for (int i = 0; i < arraysize(only_perl); i++) { + CHECK(Regexp::Parse(only_perl[i], Regexp::NoParseFlags, NULL) == NULL) + << " " << only_perl[i]; + Regexp* re = Regexp::Parse(only_perl[i], Regexp::PerlX, NULL); + CHECK(re) << " " << only_perl[i]; + re->Decref(); + } +} + +// Test that ToString produces original regexp or equivalent one. +TEST(TestToString, EquivalentParse) { + for (int i = 0; i < arraysize(tests); i++) { + RegexpStatus status; + Regexp::ParseFlags f = kTestFlags; + if (tests[i].flags != 0) { + f = tests[i].flags & ~TestZeroFlags; + } + Regexp* re = Regexp::Parse(tests[i].regexp, f, &status); + CHECK(re != NULL) << " " << tests[i].regexp << " " << status.Text(); + string s = re->Dump(); + EXPECT_EQ(string(tests[i].parse), s) << " " << tests[i].regexp << " " << string(tests[i].parse) << " " << s; + string t = re->ToString(); + if (t != tests[i].regexp) { + // If ToString didn't return the original regexp, + // it must have found one with fewer parens. + // Unfortunately we can't check the length here, because + // ToString produces "\\{" for a literal brace, + // but "{" is a shorter equivalent. + // CHECK_LT(t.size(), strlen(tests[i].regexp)) + // << " t=" << t << " regexp=" << tests[i].regexp; + + // Test that if we parse the new regexp we get the same structure. 
+ Regexp* nre = Regexp::Parse(t, Regexp::MatchNL | Regexp::PerlX, &status); + CHECK(nre != NULL) << " reparse " << t << " " << status.Text(); + string ss = nre->Dump(); + string tt = nre->ToString(); + if (s != ss || t != tt) + LOG(INFO) << "ToString(" << tests[i].regexp << ") = " << t; + EXPECT_EQ(s, ss); + EXPECT_EQ(t, tt); + nre->Decref(); + } + re->Decref(); + } +} + +// Test that capture error args are correct. +TEST(NamedCaptures, ErrorArgs) { + RegexpStatus status; + Regexp* re; + + re = Regexp::Parse("test(?Pz)", Regexp::LikePerl, &status); + EXPECT_TRUE(re == NULL); + EXPECT_EQ(status.code(), kRegexpBadNamedCapture); + EXPECT_EQ(status.error_arg(), "(?P"); +} + +} // namespace re2 diff --git a/re2/testing/possible_match_test.cc b/re2/testing/possible_match_test.cc new file mode 100644 index 0000000..7c2400e --- /dev/null +++ b/re2/testing/possible_match_test.cc @@ -0,0 +1,240 @@ +// Copyright 2006-2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include +#include "util/test.h" +#include "re2/prog.h" +#include "re2/re2.h" +#include "re2/regexp.h" +#include "re2/testing/regexp_generator.h" +#include "re2/testing/string_generator.h" + +namespace re2 { + +// Test that C++ strings are compared as uint8s, not int8s. +// PossibleMatchRange doesn't depend on this, but callers probably will. 
+TEST(CplusplusStrings, EightBit) { + string s = "\x70"; + string t = "\xA0"; + EXPECT_LT(s, t); +} + +struct PrefixTest { + const char* regexp; + int maxlen; + const char* min; + const char* max; +}; + +static PrefixTest tests[] = { + { "", 10, "", "", }, + { "Abcdef", 10, "Abcdef", "Abcdef" }, + { "abc(def|ghi)", 10, "abcdef", "abcghi" }, + { "a+hello", 10, "aa", "ahello" }, + { "a*hello", 10, "a", "hello" }, + { "def|abc", 10, "abc", "def" }, + { "a(b)(c)[d]", 10, "abcd", "abcd" }, + { "ab(cab|cat)", 10, "abcab", "abcat" }, + { "ab(cab|ca)x", 10, "abcabx", "abcax" }, + { "(ab|x)(c|de)", 10, "abc", "xde" }, + { "(ab|x)?(c|z)?", 10, "", "z" }, + { "[^\\s\\S]", 10, "", "" }, + { "(abc)+", 5, "abc", "abcac" }, + { "(abc)+", 2, "ab", "ac" }, + { "(abc)+", 1, "a", "b" }, + { "[a\xC3\xA1]", 4, "a", "\xC3\xA1" }, + { "a*", 10, "", "ab" }, + + { "(?i)Abcdef", 10, "ABCDEF", "abcdef" }, + { "(?i)abc(def|ghi)", 10, "ABCDEF", "abcghi" }, + { "(?i)a+hello", 10, "AA", "ahello" }, + { "(?i)a*hello", 10, "A", "hello" }, + { "(?i)def|abc", 10, "ABC", "def" }, + { "(?i)a(b)(c)[d]", 10, "ABCD", "abcd" }, + { "(?i)ab(cab|cat)", 10, "ABCAB", "abcat" }, + { "(?i)ab(cab|ca)x", 10, "ABCABX", "abcax" }, + { "(?i)(ab|x)(c|de)", 10, "ABC", "xde" }, + { "(?i)(ab|x)?(c|z)?", 10, "", "z" }, + { "(?i)[^\\s\\S]", 10, "", "" }, + { "(?i)(abc)+", 5, "ABC", "abcac" }, + { "(?i)(abc)+", 2, "AB", "ac" }, + { "(?i)(abc)+", 1, "A", "b" }, + { "(?i)[a\xC3\xA1]", 4, "A", "\xC3\xA1" }, + { "(?i)a*", 10, "", "ab" }, + { "(?i)A*", 10, "", "ab" }, + + { "\\AAbcdef", 10, "Abcdef", "Abcdef" }, + { "\\Aabc(def|ghi)", 10, "abcdef", "abcghi" }, + { "\\Aa+hello", 10, "aa", "ahello" }, + { "\\Aa*hello", 10, "a", "hello" }, + { "\\Adef|abc", 10, "abc", "def" }, + { "\\Aa(b)(c)[d]", 10, "abcd", "abcd" }, + { "\\Aab(cab|cat)", 10, "abcab", "abcat" }, + { "\\Aab(cab|ca)x", 10, "abcabx", "abcax" }, + { "\\A(ab|x)(c|de)", 10, "abc", "xde" }, + { "\\A(ab|x)?(c|z)?", 10, "", "z" }, + { "\\A[^\\s\\S]", 10, "", "" }, + { 
"\\A(abc)+", 5, "abc", "abcac" }, + { "\\A(abc)+", 2, "ab", "ac" }, + { "\\A(abc)+", 1, "a", "b" }, + { "\\A[a\xC3\xA1]", 4, "a", "\xC3\xA1" }, + { "\\Aa*", 10, "", "ab" }, + + { "(?i)\\AAbcdef", 10, "ABCDEF", "abcdef" }, + { "(?i)\\Aabc(def|ghi)", 10, "ABCDEF", "abcghi" }, + { "(?i)\\Aa+hello", 10, "AA", "ahello" }, + { "(?i)\\Aa*hello", 10, "A", "hello" }, + { "(?i)\\Adef|abc", 10, "ABC", "def" }, + { "(?i)\\Aa(b)(c)[d]", 10, "ABCD", "abcd" }, + { "(?i)\\Aab(cab|cat)", 10, "ABCAB", "abcat" }, + { "(?i)\\Aab(cab|ca)x", 10, "ABCABX", "abcax" }, + { "(?i)\\A(ab|x)(c|de)", 10, "ABC", "xde" }, + { "(?i)\\A(ab|x)?(c|z)?", 10, "", "z" }, + { "(?i)\\A[^\\s\\S]", 10, "", "" }, + { "(?i)\\A(abc)+", 5, "ABC", "abcac" }, + { "(?i)\\A(abc)+", 2, "AB", "ac" }, + { "(?i)\\A(abc)+", 1, "A", "b" }, + { "(?i)\\A[a\xC3\xA1]", 4, "A", "\xC3\xA1" }, + { "(?i)\\Aa*", 10, "", "ab" }, + { "(?i)\\AA*", 10, "", "ab" }, +}; + +TEST(PossibleMatchRange, HandWritten) { + for (int i = 0; i < arraysize(tests); i++) { + for (int j = 0; j < 2; j++) { + const PrefixTest& t = tests[i]; + string min, max; + if (j == 0) { + LOG(INFO) << "Checking regexp=" << CEscape(t.regexp); + Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + CHECK(prog->PossibleMatchRange(&min, &max, t.maxlen)) + << " " << t.regexp; + delete prog; + re->Decref(); + } else { + CHECK(RE2(t.regexp).PossibleMatchRange(&min, &max, t.maxlen)); + } + EXPECT_EQ(t.min, min) << t.regexp; + EXPECT_EQ(t.max, max) << t.regexp; + } + } +} + +// Test cases where PossibleMatchRange should return false. +TEST(PossibleMatchRange, Failures) { + string min, max; + + // Fails because no room to write max. + EXPECT_FALSE(RE2("abc").PossibleMatchRange(&min, &max, 0)); + + // Fails because there is no max -- any non-empty string matches + // or begins a match. Have to use Latin-1 input, because there + // are no valid UTF-8 strings beginning with byte 0xFF. 
+ EXPECT_FALSE(RE2("[\\s\\S]+", RE2::Latin1). + PossibleMatchRange(&min, &max, 10)) + << "min=" << CEscape(min) << ", max=" << CEscape(max); + EXPECT_FALSE(RE2("[\\0-\xFF]+", RE2::Latin1). + PossibleMatchRange(&min, &max, 10)) + << "min=" << CEscape(min) << ", max=" << CEscape(max); + EXPECT_FALSE(RE2(".+hello", RE2::Latin1). + PossibleMatchRange(&min, &max, 10)) + << "min=" << CEscape(min) << ", max=" << CEscape(max); + EXPECT_FALSE(RE2(".*hello", RE2::Latin1). + PossibleMatchRange(&min, &max, 10)) + << "min=" << CEscape(min) << ", max=" << CEscape(max); + EXPECT_FALSE(RE2(".*", RE2::Latin1). + PossibleMatchRange(&min, &max, 10)) + << "min=" << CEscape(min) << ", max=" << CEscape(max); + EXPECT_FALSE(RE2("\\C*"). + PossibleMatchRange(&min, &max, 10)) + << "min=" << CEscape(min) << ", max=" << CEscape(max); + + // Fails because it's a malformed regexp. + EXPECT_FALSE(RE2("*hello").PossibleMatchRange(&min, &max, 10)) + << "min=" << CEscape(min) << ", max=" << CEscape(max); +} + +// Exhaustive test: generate all regexps within parameters, +// then generate all strings of a given length over a given alphabet, +// then check that the prefix information agrees with whether +// the regexp matches each of the strings. +class PossibleMatchTester : public RegexpGenerator { + public: + PossibleMatchTester(int maxatoms, + int maxops, + const vector& alphabet, + const vector& ops, + int maxstrlen, + const vector& stralphabet) + : RegexpGenerator(maxatoms, maxops, alphabet, ops), + strgen_(maxstrlen, stralphabet), + regexps_(0), tests_(0) { } + + int regexps() { return regexps_; } + int tests() { return tests_; } + + // Needed for RegexpGenerator interface. + void HandleRegexp(const string& regexp); + + private: + StringGenerator strgen_; + + int regexps_; // Number of HandleRegexp calls + int tests_; // Number of regexp tests. + + DISALLOW_EVIL_CONSTRUCTORS(PossibleMatchTester); +}; + +// Processes a single generated regexp. 
+// Checks that all accepted strings agree with the prefix range. +void PossibleMatchTester::HandleRegexp(const string& regexp) { + regexps_++; + + VLOG(3) << CEscape(regexp); + + RE2 re(regexp, RE2::Latin1); + CHECK_EQ(re.error(), ""); + + string min, max; + if(!re.PossibleMatchRange(&min, &max, 10)) { + // There's no good max for "\\C*". Can't use strcmp + // because sometimes it gets embedded in more + // complicated expressions. + if(strstr(regexp.c_str(), "\\C*")) + return; + LOG(QFATAL) << "PossibleMatchRange failed on: " << CEscape(regexp); + } + + strgen_.Reset(); + while (strgen_.HasNext()) { + const StringPiece& s = strgen_.Next(); + tests_++; + if (!RE2::FullMatch(s, re)) + continue; + CHECK_GE(s, min) << " regexp: " << regexp << " max: " << max; + CHECK_LE(s, max) << " regexp: " << regexp << " min: " << min; + } +} + +TEST(PossibleMatchRange, Exhaustive) { + int natom = 3; + int noperator = 3; + int stringlen = 5; + if (DEBUG_MODE) { + natom = 2; + noperator = 3; + stringlen = 3; + } + PossibleMatchTester t(natom, noperator, Split(" ", "a b [0-9]"), + RegexpGenerator::EgrepOps(), + stringlen, Explode("ab4")); + t.Generate(); + LOG(INFO) << t.regexps() << " regexps, " + << t.tests() << " tests"; +} + +} // namespace re2 diff --git a/re2/testing/random_test.cc b/re2/testing/random_test.cc new file mode 100644 index 0000000..91d2b32 --- /dev/null +++ b/re2/testing/random_test.cc @@ -0,0 +1,95 @@ +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Random testing of regular expression matching. 
+ +#include +#include "util/test.h" +#include "re2/testing/exhaustive_tester.h" + +DEFINE_int32(regexpseed, 404, "Random regexp seed."); +DEFINE_int32(regexpcount, 100, "How many random regexps to generate."); +DEFINE_int32(stringseed, 200, "Random string seed."); +DEFINE_int32(stringcount, 100, "How many random strings to generate."); + +namespace re2 { + +// Runs a random test on the given parameters. +// (Always uses the same random seeds for reproducibility. +// Can give different seeds on command line.) +static void RandomTest(int maxatoms, int maxops, + const vector& alphabet, + const vector& ops, + int maxstrlen, const vector& stralphabet, + const string& wrapper) { + // Limit to smaller test cases in debug mode, + // because everything is so much slower. + if (DEBUG_MODE) { + maxatoms--; + maxops--; + maxstrlen /= 2; + } + + ExhaustiveTester t(maxatoms, maxops, alphabet, ops, + maxstrlen, stralphabet, wrapper, ""); + t.RandomStrings(FLAGS_stringseed, FLAGS_stringcount); + t.GenerateRandom(FLAGS_regexpseed, FLAGS_regexpcount); + printf("%d regexps, %d tests, %d failures [%d/%d str]\n", + t.regexps(), t.tests(), t.failures(), maxstrlen, (int)stralphabet.size()); + EXPECT_EQ(0, t.failures()); +} + +// Tests random small regexps involving literals and egrep operators. +TEST(Random, SmallEgrepLiterals) { + RandomTest(5, 5, Explode("abc."), RegexpGenerator::EgrepOps(), + 15, Explode("abc"), + ""); +} + +// Tests random bigger regexps involving literals and egrep operators. +TEST(Random, BigEgrepLiterals) { + RandomTest(10, 10, Explode("abc."), RegexpGenerator::EgrepOps(), + 15, Explode("abc"), + ""); +} + +// Tests random small regexps involving literals, capturing parens, +// and egrep operators. +TEST(Random, SmallEgrepCaptures) { + RandomTest(5, 5, Split(" ", "a (b) ."), RegexpGenerator::EgrepOps(), + 15, Explode("abc"), + ""); +} + +// Tests random bigger regexps involving literals, capturing parens, +// and egrep operators. 
+TEST(Random, BigEgrepCaptures) { + RandomTest(10, 10, Split(" ", "a (b) ."), RegexpGenerator::EgrepOps(), + 15, Explode("abc"), + ""); +} + +// Tests random large complicated expressions, using all the possible +// operators, some literals, some parenthesized literals, and predefined +// character classes like \d. (Adding larger character classes would +// make for too many possibilities.) +TEST(Random, Complicated) { + vector ops = Split(" ", + "%s%s %s|%s %s* %s*? %s+ %s+? %s? %s?? " + "%s{0} %s{0,} %s{1} %s{1,} %s{0,1} %s{0,2} %s{1,2} " + "%s{2} %s{2,} %s{3,4} %s{4,5}"); + + // Use (?:\b) and (?:\B) instead of \b and \B, + // because PCRE rejects \b* but accepts (?:\b)*. + // Ditto ^ and $. + vector atoms = Split(" ", + ". (?:^) (?:$) \\a \\f \\n \\r \\t \\v " + "\\d \\D \\s \\S \\w \\W (?:\\b) (?:\\B) " + "a (a) b c - \\\\"); + vector alphabet = Explode("abc123\001\002\003\t\r\n\v\f\a"); + RandomTest(10, 10, atoms, ops, 20, alphabet, ""); +} + +} // namespace re2 + diff --git a/re2/testing/re2_arg_test.cc b/re2/testing/re2_arg_test.cc new file mode 100644 index 0000000..ae7a7b0 --- /dev/null +++ b/re2/testing/re2_arg_test.cc @@ -0,0 +1,133 @@ +// Copyright 2005 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This tests to make sure numbers are parsed from strings +// correctly. +// Todo: Expand the test to validate strings parsed to the other types +// supported by RE2::Arg class + +#include "util/test.h" +#include "re2/re2.h" + +namespace re2 { + +struct SuccessTable { + const char * value_string; + int64 value; + bool success[6]; +}; + +// Test boundary cases for different integral sizes. +// Specifically I want to make sure that values outside the boundries +// of an integral type will fail and that negative numbers will fail +// for unsigned types. 
The following table contains the boundaries for +// the various integral types and has entries for whether or not each +// type can contain the given value. +const SuccessTable kSuccessTable[] = { +// string integer value short ushort int uint int64 uint64 +// 0 to 2^7-1 +{ "0", 0, { true, true, true, true, true, true }}, +{ "127", 127, { true, true, true, true, true, true }}, + +// -1 to -2^7 +{ "-1", -1, { true, false, true, false, true, false }}, +{ "-128", -128, { true, false, true, false, true, false }}, + +// 2^7 to 2^8-1 +{ "128", 128, { true, true, true, true, true, true }}, +{ "255", 255, { true, true, true, true, true, true }}, + +// 2^8 to 2^15-1 +{ "256", 256, { true, true, true, true, true, true }}, +{ "32767", 32767, { true, true, true, true, true, true }}, + +// -2^7-1 to -2^15 +{ "-129", -129, { true, false, true, false, true, false }}, +{ "-32768", -32768, { true, false, true, false, true, false }}, + +// 2^15 to 2^16-1 +{ "32768", 32768, { false, true, true, true, true, true }}, +{ "65535", 65535, { false, true, true, true, true, true }}, + +// 2^16 to 2^31-1 +{ "65536", 65536, { false, false, true, true, true, true }}, +{ "2147483647", 2147483647, { false, false, true, true, true, true }}, + +// -2^15-1 to -2^31 +{ "-32769", -32769, { false, false, true, false, true, false }}, +{ "-2147483648", + static_cast(0xFFFFFFFF80000000LL), +{ false, false, true, false, true, false }}, + +// 2^31 to 2^32-1 +{ "2147483648", 2147483648U, { false, false, false, true, true, true }}, +{ "4294967295", 4294967295U, { false, false, false, true, true, true }}, + +// 2^32 to 2^63-1 +{ "4294967296", 4294967296LL, { false, false, false, false, true, true }}, +{ "9223372036854775807", + 9223372036854775807LL, { false, false, false, false, true, true }}, + +// -2^31-1 to -2^63 +{ "-2147483649", -2147483649LL, { false, false, false, false, true, false }}, +{ "-9223372036854775808", static_cast(0x8000000000000000LL), + { false, false, false, false, true, false }}, + +// 
2^63 to 2^64-1 +{ "9223372036854775808", static_cast(9223372036854775808ULL), + { false, false, false, false, false, true }}, +{ "18446744073709551615", static_cast(18446744073709551615ULL), + { false, false, false, false, false, true }}, + +// >= 2^64 +{ "18446744073709551616", 0, { false, false, false, false, false, false }}, +}; + +const int kNumStrings = ARRAYSIZE(kSuccessTable); + +// It's ugly to use a macro, but we apparently can't use the ASSERT_TRUE_M +// macro outside of a TEST block and this seems to be the only way to +// avoid code duplication. I can also pull off a couple nice tricks +// using concatenation for the type I'm checking against. +#define PARSE_FOR_TYPE(type, column) { \ + type r; \ + for ( int i = 0; i < kNumStrings; ++i ) { \ + RE2::Arg arg(&r); \ + const char* const p = kSuccessTable[i].value_string; \ + bool retval = arg.Parse(p, strlen(p)); \ + bool success = kSuccessTable[i].success[column]; \ + ASSERT_TRUE_M(retval == success, \ + StringPrintf("Parsing '%s' for type " #type " should return %d", \ + p, success).c_str()); \ + if ( success ) { \ + ASSERT_EQUALS(r, kSuccessTable[i].value); \ + } \ + } \ +} + +TEST(REArgTest, Int16Test) { + PARSE_FOR_TYPE(int16, 0); +} + +TEST(REArgTest, Uint16Test) { + PARSE_FOR_TYPE(uint16, 1); +} + +TEST(REArgTest, IntTest) { + PARSE_FOR_TYPE(int, 2); +} + +TEST(REArgTest, UInt32Test) { + PARSE_FOR_TYPE(uint32, 3); +} + +TEST(REArgTest, Iint64Test) { + PARSE_FOR_TYPE(int64, 4); +} + +TEST(REArgTest, Uint64Test) { + PARSE_FOR_TYPE(uint64, 5); +} + +} // namespace re2 diff --git a/re2/testing/re2_test.cc b/re2/testing/re2_test.cc new file mode 100644 index 0000000..b99cacf --- /dev/null +++ b/re2/testing/re2_test.cc @@ -0,0 +1,1371 @@ +// -*- coding: utf-8 -*- +// Copyright 2002-2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+
+// TODO: Test extractions for PartialMatch/Consume
+
+// NOTE(review): the five system header names below were blank in this copy
+// of the patch (anything inside angle brackets was lost).  They are restored
+// to what upstream re2_test.cc is believed to include; the file does call
+// mmap/munmap, sysconf and reads errno.  Confirm against the upstream tarball.
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <errno.h>
+#include <vector>
+#include "util/test.h"
+#include "re2/re2.h"
+#include "re2/regexp.h"
+
+DECLARE_bool(logtostderr);
+
+namespace re2 {
+
+// Parses each hex literal twice -- once with RE2::Hex and once, prefixed
+// with "0x", with RE2::CRadix -- and compares against the C++ literal.
+TEST(RE2, HexTests) {
+
+  VLOG(1) << "hex tests";
+
+#define CHECK_HEX(type, value) \
+  do { \
+    type v; \
+    CHECK(RE2::FullMatch(#value, "([0-9a-fA-F]+)[uUlL]*", RE2::Hex(&v))); \
+    CHECK_EQ(v, 0x ## value); \
+    CHECK(RE2::FullMatch("0x" #value, "([0-9a-fA-FxX]+)[uUlL]*", RE2::CRadix(&v))); \
+    CHECK_EQ(v, 0x ## value); \
+  } while(0)
+
+  CHECK_HEX(short, 2bad);
+  CHECK_HEX(unsigned short, 2badU);
+  CHECK_HEX(int, dead);
+  CHECK_HEX(unsigned int, deadU);
+  CHECK_HEX(long, 7eadbeefL);
+  CHECK_HEX(unsigned long, deadbeefUL);
+  CHECK_HEX(long long, 12345678deadbeefLL);
+  CHECK_HEX(unsigned long long, cafebabedeadbeefULL);
+
+#undef CHECK_HEX
+}
+
+// Same as HexTests, but for octal: RE2::Octal on the bare digits and
+// RE2::CRadix on the digits prefixed with "0".
+TEST(RE2, OctalTests) {
+  VLOG(1) << "octal tests";
+
+#define CHECK_OCTAL(type, value) \
+  do { \
+    type v; \
+    CHECK(RE2::FullMatch(#value, "([0-7]+)[uUlL]*", RE2::Octal(&v))); \
+    CHECK_EQ(v, 0 ## value); \
+    CHECK(RE2::FullMatch("0" #value, "([0-9a-fA-FxX]+)[uUlL]*", RE2::CRadix(&v))); \
+    CHECK_EQ(v, 0 ## value); \
+  } while(0)
+
+  CHECK_OCTAL(short, 77777);
+  CHECK_OCTAL(unsigned short, 177777U);
+  CHECK_OCTAL(int, 17777777777);
+  CHECK_OCTAL(unsigned int, 37777777777U);
+  CHECK_OCTAL(long, 17777777777L);
+  CHECK_OCTAL(unsigned long, 37777777777UL);
+  CHECK_OCTAL(long long, 777777777777777777777LL);
+  CHECK_OCTAL(unsigned long long, 1777777777777777777777ULL);
+
+#undef CHECK_OCTAL
+}
+
+// Decimal literals: parsed through the plain pointer argument and through
+// RE2::CRadix, each compared against the C++ literal.
+TEST(RE2, DecimalTests) {
+  VLOG(1) << "decimal tests";
+
+#define CHECK_DECIMAL(type, value) \
+  do { \
+    type v; \
+    CHECK(RE2::FullMatch(#value, "(-?[0-9]+)[uUlL]*", &v)); \
+    CHECK_EQ(v, value); \
+    CHECK(RE2::FullMatch(#value, "(-?[0-9a-fA-FxX]+)[uUlL]*", RE2::CRadix(&v))); \
+    CHECK_EQ(v, value); \
+  } while(0)
+
+  CHECK_DECIMAL(short, -1);
+  CHECK_DECIMAL(unsigned short, 9999);
+  CHECK_DECIMAL(int, -1000);
+  CHECK_DECIMAL(unsigned int, 12345U);
+  CHECK_DECIMAL(long, -10000000L);
+  CHECK_DECIMAL(unsigned long, 3083324652U);
+  CHECK_DECIMAL(long long, -100000000000000LL);
+  CHECK_DECIMAL(unsigned long long, 1234567890987654321ULL);
+
+#undef CHECK_DECIMAL
+}
+
+// Table-driven check of RE2::Replace (first match only) and
+// RE2::GlobalReplace (all matches, plus the returned replacement count).
+TEST(RE2, Replace) {
+  VLOG(1) << "TestReplace";
+
+  struct ReplaceTest {
+    const char *regexp;
+    const char *rewrite;
+    const char *original;
+    const char *single;      // expected result of Replace
+    const char *global;      // expected result of GlobalReplace
+    int greplace_count;      // expected return value of GlobalReplace
+  };
+  static const ReplaceTest tests[] = {
+    { "(qu|[b-df-hj-np-tv-z]*)([a-z]+)",
+      "\\2\\1ay",
+      "the quick brown fox jumps over the lazy dogs.",
+      "ethay quick brown fox jumps over the lazy dogs.",
+      "ethay ickquay ownbray oxfay umpsjay overay ethay azylay ogsday.",
+      9 },
+    { "\\w+",
+      "\\0-NOSPAM",
+      "abcd.efghi@google.com",
+      "abcd-NOSPAM.efghi@google.com",
+      "abcd-NOSPAM.efghi-NOSPAM@google-NOSPAM.com-NOSPAM",
+      4 },
+    { "^",
+      "(START)",
+      "foo",
+      "(START)foo",
+      "(START)foo",
+      1 },
+    { "^",
+      "(START)",
+      "",
+      "(START)",
+      "(START)",
+      1 },
+    { "$",
+      "(END)",
+      "",
+      "(END)",
+      "(END)",
+      1 },
+    { "b",
+      "bb",
+      "ababababab",
+      "abbabababab",
+      "abbabbabbabbabb",
+      5 },
+    { "b",
+      "bb",
+      "bbbbbb",
+      "bbbbbbb",
+      "bbbbbbbbbbbb",
+      6 },
+    { "b+",
+      "bb",
+      "bbbbbb",
+      "bb",
+      "bb",
+      1 },
+    { "b*",
+      "bb",
+      "bbbbbb",
+      "bb",
+      "bb",
+      1 },
+    { "b*",
+      "bb",
+      "aaaaa",
+      "bbaaaaa",
+      "bbabbabbabbabbabb",
+      6 },
+    // Check newline handling
+    { "a.*a",
+      "(\\0)",
+      "aba\naba",
+      "(aba)\naba",
+      "(aba)\n(aba)",
+      2 },
+    // Sentinel: the loop below stops at the first entry with a NULL original.
+    { "", NULL, NULL, NULL, NULL, 0 }
+  };
+
+  for (const ReplaceTest *t = tests; t->original != NULL; ++t) {
+    VLOG(1) << StringPrintf("\"%s\" =~ s/%s/%s/g", t->original, t->regexp, t->rewrite);
+    string one(t->original);
+    CHECK(RE2::Replace(&one, t->regexp, t->rewrite));
+    CHECK_EQ(one, t->single);
+    string all(t->original);
+    CHECK_EQ(RE2::GlobalReplace(&all, t->regexp, t->rewrite), t->greplace_count)
+      << "Got: " << all;
+    CHECK_EQ(all, t->global);
+  }
+}
+
+// Compiles `regexp`, runs CheckRewriteString on `rewrite`, and expects the
+// result to equal `expect_ok`.
+static void TestCheckRewriteString(const char* regexp, const char* rewrite,
+                              bool expect_ok) {
+  string error;
+  RE2 exp(regexp);
+  bool actual_ok = exp.CheckRewriteString(rewrite, &error);
+  EXPECT_EQ(expect_ok, actual_ok) << " for " << rewrite << " error: " << error;
+}
+
+TEST(CheckRewriteString, all) {
+  TestCheckRewriteString("abc", "foo", true);
+  TestCheckRewriteString("abc", "foo\\", false);
+  TestCheckRewriteString("abc", "foo\\0bar", true);
+
+  TestCheckRewriteString("a(b)c", "foo", true);
+  TestCheckRewriteString("a(b)c", "foo\\0bar", true);
+  TestCheckRewriteString("a(b)c", "foo\\1bar", true);
+  TestCheckRewriteString("a(b)c", "foo\\2bar", false);
+  TestCheckRewriteString("a(b)c", "f\\\\2o\\1o", true);
+
+  TestCheckRewriteString("a(b)(c)", "foo\\12", true);
+  TestCheckRewriteString("a(b)(c)", "f\\2o\\1o", true);
+  TestCheckRewriteString("a(b)(c)", "f\\oo\\1", false);
+}
+
+TEST(RE2, Extract) {
+  VLOG(1) << "TestExtract";
+
+  string s;
+
+  CHECK(RE2::Extract("boris@kremvax.ru", "(.*)@([^.]*)", "\\2!\\1", &s));
+  CHECK_EQ(s, "kremvax!boris");
+
+  CHECK(RE2::Extract("foo", ".*", "'\\0'", &s));
+  CHECK_EQ(s, "'foo'");
+  // check that false match doesn't overwrite
+  CHECK(!RE2::Extract("baz", "bar", "'\\0'", &s));
+  CHECK_EQ(s, "'foo'");
+}
+
+TEST(RE2, Consume) {
+  VLOG(1) << "TestConsume";
+
+  RE2 r("\\s*(\\w+)"); // matches a word, possibly preceded by whitespace
+  string word;
+
+  string s(" aaa b!@#$@#$cccc");
+  StringPiece input(s);
+
+  CHECK(RE2::Consume(&input, r, &word));
+  CHECK_EQ(word, "aaa") << " input: " << input;
+  CHECK(RE2::Consume(&input, r, &word));
+  CHECK_EQ(word, "b") << " input: " << input;
+  CHECK(! RE2::Consume(&input, r, &word)) << " input: " << input;
+}
+
+// ConsumeN with 0, 1 and 2 output arguments passed through an args array.
+TEST(RE2, ConsumeN) {
+  const string s(" one two three 4");
+  StringPiece input(s);
+
+  RE2::Arg argv[2];
+  const RE2::Arg* const args[2] = { &argv[0], &argv[1] };
+
+  // 0 arg
+  EXPECT_TRUE(RE2::ConsumeN(&input, "\\s*(\\w+)", args, 0)); // Skips "one".
+
+  // 1 arg
+  string word;
+  argv[0] = &word;
+  EXPECT_TRUE(RE2::ConsumeN(&input, "\\s*(\\w+)", args, 1));
+  EXPECT_EQ("two", word);
+
+  // Multi-args
+  int n;
+  argv[1] = &n;
+  EXPECT_TRUE(RE2::ConsumeN(&input, "\\s*(\\w+)\\s*(\\d+)", args, 2));
+  EXPECT_EQ("three", word);
+  EXPECT_EQ(4, n);
+}
+
+TEST(RE2, FindAndConsume) {
+  VLOG(1) << "TestFindAndConsume";
+
+  RE2 r("(\\w+)"); // matches a word
+  string word;
+
+  string s(" aaa b!@#$@#$cccc");
+  StringPiece input(s);
+
+  CHECK(RE2::FindAndConsume(&input, r, &word));
+  CHECK_EQ(word, "aaa");
+  CHECK(RE2::FindAndConsume(&input, r, &word));
+  CHECK_EQ(word, "b");
+  CHECK(RE2::FindAndConsume(&input, r, &word));
+  CHECK_EQ(word, "cccc");
+  CHECK(! RE2::FindAndConsume(&input, r, &word));
+
+  // Check that FindAndConsume works without any submatches.
+  // Earlier version used uninitialized data for
+  // length to consume.
+  input = "aaa";
+  CHECK(RE2::FindAndConsume(&input, "aaa"));
+  CHECK_EQ(input, "");
+}
+
+// FindAndConsumeN with 0, 1 and 2 output arguments passed through an args
+// array.
+TEST(RE2, FindAndConsumeN) {
+  const string s(" one two three 4");
+  StringPiece input(s);
+
+  RE2::Arg argv[2];
+  const RE2::Arg* const args[2] = { &argv[0], &argv[1] };
+
+  // 0 arg
+  EXPECT_TRUE(RE2::FindAndConsumeN(&input, "(\\w+)", args, 0)); // Skips "one".
+
+  // 1 arg
+  string word;
+  argv[0] = &word;
+  EXPECT_TRUE(RE2::FindAndConsumeN(&input, "(\\w+)", args, 1));
+  EXPECT_EQ("two", word);
+
+  // Multi-args
+  int n;
+  argv[1] = &n;
+  EXPECT_TRUE(RE2::FindAndConsumeN(&input, "(\\w+)\\s*(\\d+)", args, 2));
+  EXPECT_EQ("three", word);
+  EXPECT_EQ(4, n);
+}
+
+// In an alternation of capturing groups, the groups that did not take part
+// in the match must come back as empty strings.
+TEST(RE2, MatchNumberPeculiarity) {
+  VLOG(1) << "TestMatchNumberPeculiarity";
+
+  RE2 r("(foo)|(bar)|(baz)");
+  string word1;
+  string word2;
+  string word3;
+
+  CHECK(RE2::PartialMatch("foo", r, &word1, &word2, &word3));
+  CHECK_EQ(word1, "foo");
+  CHECK_EQ(word2, "");
+  CHECK_EQ(word3, "");
+  CHECK(RE2::PartialMatch("bar", r, &word1, &word2, &word3));
+  CHECK_EQ(word1, "");
+  CHECK_EQ(word2, "bar");
+  CHECK_EQ(word3, "");
+  CHECK(RE2::PartialMatch("baz", r, &word1, &word2, &word3));
+  CHECK_EQ(word1, "");
+  CHECK_EQ(word2, "");
+  CHECK_EQ(word3, "baz");
+  CHECK(!RE2::PartialMatch("f", r, &word1, &word2, &word3));
+
+  string a;
+  CHECK(RE2::FullMatch("hello", "(foo)|hello", &a));
+  CHECK_EQ(a, "");
+}
+
+// Exercises the low-level Match() entry point with an explicit submatch
+// array, then the same regexp through PartialMatch with typed arguments.
+TEST(RE2, Match) {
+  RE2 re("((\\w+):([0-9]+))"); // extracts host and port
+  StringPiece group[4];
+
+  // No match.
+  StringPiece s = "zyzzyva";
+  CHECK(!re.Match(s, 0, s.size(), RE2::UNANCHORED,
+                  group, arraysize(group)));
+
+  // Matches and extracts.
+  s = "a chrisr:9000 here";
+  CHECK(re.Match(s, 0, s.size(), RE2::UNANCHORED,
+                 group, arraysize(group)));
+  CHECK_EQ(group[0], "chrisr:9000");
+  CHECK_EQ(group[1], "chrisr:9000");
+  CHECK_EQ(group[2], "chrisr");
+  CHECK_EQ(group[3], "9000");
+
+  string all, host;
+  int port;
+  CHECK(RE2::PartialMatch("a chrisr:9000 here", re, &all, &host, &port));
+  CHECK_EQ(all, "chrisr:9000");
+  CHECK_EQ(host, "chrisr");
+  CHECK_EQ(port, 9000);
+}
+
+// Builds a `size`-byte string by repeating `pattern` and runs a nested,
+// starred regexp over it.  The match result is deliberately ignored.
+static void TestRecursion(int size, const char *pattern) {
+  // Fill up a string repeating the pattern given
+  string domain;
+  domain.resize(size);
+  int patlen = strlen(pattern);
+  for (int i = 0; i < size; ++i) {
+    domain[i] = pattern[i % patlen];
+  }
+  // Just make sure it doesn't crash due to too much recursion.
+  RE2 re("([a-zA-Z0-9]|-)+(\\.([a-zA-Z0-9]|-)+)*(\\.)?", RE2::Quiet);
+  RE2::FullMatch(domain, re);
+}
+
+// A meta-quoted string, interpreted as a pattern, should always match
+// the original unquoted string.
+static void TestQuoteMeta(string unquoted,
+                          const RE2::Options& options = RE2::DefaultOptions) {
+  string quoted = RE2::QuoteMeta(unquoted);
+  RE2 re(quoted, options);
+  EXPECT_TRUE_M(RE2::FullMatch(unquoted, re),
+                "Unquoted='" + unquoted + "', quoted='" + quoted + "'.");
+}
+
+// A meta-quoted string, interpreted as a pattern, should NOT match
+// `should_not_match` (a string that differs from the unquoted original).
+static void NegativeTestQuoteMeta(string unquoted, string should_not_match,
+                                  const RE2::Options& options = RE2::DefaultOptions) {
+  string quoted = RE2::QuoteMeta(unquoted);
+  RE2 re(quoted, options);
+  EXPECT_FALSE_M(RE2::FullMatch(should_not_match, re),
+                 "Unquoted='" + unquoted + "', quoted='" + quoted + "'.");
+}
+
+// Tests that quoted meta characters match their original strings,
+// and that a few things that shouldn't match indeed do not.
+TEST(QuoteMeta, Simple) {
+  TestQuoteMeta("foo");
+  TestQuoteMeta("foo.bar");
+  TestQuoteMeta("foo\\.bar");
+  TestQuoteMeta("[1-9]");
+  TestQuoteMeta("1.5-2.0?");
+  TestQuoteMeta("\\d");
+  TestQuoteMeta("Who doesn't like ice cream?");
+  TestQuoteMeta("((a|b)c?d*e+[f-h]i)");
+  TestQuoteMeta("((?!)xxx).*yyy");
+  TestQuoteMeta("([");
+}
+TEST(QuoteMeta, SimpleNegative) {
+  NegativeTestQuoteMeta("foo", "bar");
+  NegativeTestQuoteMeta("...", "bar");
+  NegativeTestQuoteMeta("\\.", ".");
+  NegativeTestQuoteMeta("\\.", "..");
+  NegativeTestQuoteMeta("(a)", "a");
+  NegativeTestQuoteMeta("(a|b)", "a");
+  NegativeTestQuoteMeta("(a|b)", "(a)");
+  NegativeTestQuoteMeta("(a|b)", "a|b");
+  NegativeTestQuoteMeta("[0-9]", "0");
+  NegativeTestQuoteMeta("[0-9]", "0-9");
+  NegativeTestQuoteMeta("[0-9]", "[9]");
+  NegativeTestQuoteMeta("((?!)xxx)", "xxx");
+}
+
+TEST(QuoteMeta, Latin1) {
+  TestQuoteMeta("3\xb2 = 9", RE2::Latin1);
+}
+
+TEST(QuoteMeta, UTF8) {
+  TestQuoteMeta("Plácido Domingo");
+  TestQuoteMeta("xyz");  // No fancy utf8.
+  TestQuoteMeta("\xc2\xb0");  // 2-byte utf8 -- a degree symbol.
+  TestQuoteMeta("27\xc2\xb0 degrees");  // As a middle character.
+  TestQuoteMeta("\xe2\x80\xb3");  // 3-byte utf8 -- a double prime.
+  TestQuoteMeta("\xf0\x9d\x85\x9f");  // 4-byte utf8 -- a music note.
+  TestQuoteMeta("27\xc2\xb0");  // Interpreted as Latin-1, this should
+                               // still work.
+  NegativeTestQuoteMeta("27\xc2\xb0",
+                        "27\\\xc2\\\xb0");  // 2-byte utf8 -- a degree symbol.
+}
+
+// QuoteMeta must cope with an embedded NUL byte in the input.
+TEST(QuoteMeta, HasNull) {
+  string has_null;
+
+  // string with one null character
+  has_null += '\0';
+  TestQuoteMeta(has_null);
+  NegativeTestQuoteMeta(has_null, "");
+
+  // Don't want null-followed-by-'1' to be interpreted as '\01'.
+  has_null += '1';
+  TestQuoteMeta(has_null);
+  NegativeTestQuoteMeta(has_null, "\1");
+}
+
+// ProgramSize() must be positive and grow with the complexity of the regexp.
+TEST(ProgramSize, BigProgram) {
+  RE2 re_simple("simple regexp");
+  RE2 re_medium("medium.*regexp");
+  RE2 re_complex("hard.{1,128}regexp");
+
+  CHECK_GT(re_simple.ProgramSize(), 0);
+  CHECK_GT(re_medium.ProgramSize(), re_simple.ProgramSize());
+  CHECK_GT(re_complex.ProgramSize(), re_medium.ProgramSize());
+}
+
+// Issue 956519: handling empty character sets was
+// causing NULL dereference.  This tests a few empty character sets.
+// (The way to get an empty character set is to negate a full one.)
+TEST(EmptyCharset, Fuzz) {
+  static const char *empties[] = {
+    "[^\\S\\s]",
+    "[^\\S[:space:]]",
+    "[^\\D\\d]",
+    "[^\\D[:digit:]]"
+  };
+  for (int i = 0; i < arraysize(empties); i++)
+    CHECK(!RE2(empties[i]).Match("abc", 0, 3, RE2::UNANCHORED, NULL, 0));
+}
+
+// Test that named groups work correctly.
+// NOTE(review): the map template arguments and the <A>..<D> group names were
+// missing in this copy; they are restored from the assertions below
+// (string keys "A".."D", integer group indices) -- confirm against upstream.
+TEST(Capture, NamedGroups) {
+  {
+    RE2 re("(hello world)");
+    CHECK_EQ(re.NumberOfCapturingGroups(), 1);
+    const map<string, int>& m = re.NamedCapturingGroups();
+    CHECK_EQ(m.size(), 0);
+  }
+
+  {
+    RE2 re("(?P<A>expr(?P<B>expr)(?P<C>expr))((expr)(?P<D>expr))");
+    CHECK_EQ(re.NumberOfCapturingGroups(), 6);
+    const map<string, int>& m = re.NamedCapturingGroups();
+    CHECK_EQ(m.size(), 4);
+    CHECK_EQ(m.find("A")->second, 1);
+    CHECK_EQ(m.find("B")->second, 2);
+    CHECK_EQ(m.find("C")->second, 3);
+    CHECK_EQ(m.find("D")->second, 6);  // $4 and $5 are anonymous
+  }
+}
+
+TEST(RE2, FullMatchWithNoArgs) {
+  CHECK(RE2::FullMatch("h", "h"));
+  CHECK(RE2::FullMatch("hello", "hello"));
+  CHECK(RE2::FullMatch("hello", "h.*o"));
+  CHECK(!RE2::FullMatch("othello", "h.*o"));  // Must be anchored at front
+  CHECK(!RE2::FullMatch("hello!", "h.*o"));  // Must be anchored at end
+}
+
+// PartialMatch needs no anchoring at either end.
+TEST(RE2, PartialMatch) {
+  CHECK(RE2::PartialMatch("x", "x"));
+  CHECK(RE2::PartialMatch("hello", "h.*o"));
+  CHECK(RE2::PartialMatch("othello", "h.*o"));
+  CHECK(RE2::PartialMatch("hello!", "h.*o"));
+  CHECK(RE2::PartialMatch("x", "((((((((((((((((((((x))))))))))))))))))))"));
+}
+
+// PartialMatchN with 0, 1 and 2 output arguments passed through an args
+// array.
+TEST(RE2, PartialMatchN) {
+  RE2::Arg argv[2];
+  const RE2::Arg* const args[2] = { &argv[0], &argv[1] };
+
+  // 0 arg
+  EXPECT_TRUE(RE2::PartialMatchN("hello", "e.*o", args, 0));
+  EXPECT_FALSE(RE2::PartialMatchN("othello", "a.*o", args, 0));
+
+  // 1 arg
+  int i;
+  argv[0] = &i;
+  EXPECT_TRUE(RE2::PartialMatchN("1001 nights", "(\\d+)", args, 1));
+  EXPECT_EQ(1001, i);
+  EXPECT_FALSE(RE2::PartialMatchN("three", "(\\d+)", args, 1));
+
+  // Multi-arg
+  string s;
+  argv[1] = &s;
+  EXPECT_TRUE(RE2::PartialMatchN("answer: 42:life", "(\\d+):(\\w+)", args, 2));
+  EXPECT_EQ(42, i);
+  EXPECT_EQ("life", s);
+  EXPECT_FALSE(RE2::PartialMatchN("hi1", "(\\w+)(1)", args, 2));
+}
+
+TEST(RE2, FullMatchZeroArg) {
+  // Zero-arg
+  CHECK(RE2::FullMatch("1001", "\\d+"));
+}
+
+TEST(RE2, FullMatchOneArg) {
+  int i;
+
+  // Single-arg
+  CHECK(RE2::FullMatch("1001", "(\\d+)", &i));
+  CHECK_EQ(i, 1001);
+  CHECK(RE2::FullMatch("-123", "(-?\\d+)", &i));
+  CHECK_EQ(i, -123);
+  CHECK(!RE2::FullMatch("10", "()\\d+", &i));
+  CHECK(!RE2::FullMatch("1234567890123456789012345678901234567890",
+                        "(\\d+)", &i));
+}
+
+TEST(RE2, FullMatchIntegerArg) {
+  int i;
+
+  // Digits surrounding integer-arg
+  CHECK(RE2::FullMatch("1234", "1(\\d*)4", &i));
+  CHECK_EQ(i, 23);
+  CHECK(RE2::FullMatch("1234", "(\\d)\\d+", &i));
+  CHECK_EQ(i, 1);
+  CHECK(RE2::FullMatch("-1234", "(-\\d)\\d+", &i));
+  CHECK_EQ(i, -1);
+  CHECK(RE2::PartialMatch("1234", "(\\d)", &i));
+  CHECK_EQ(i, 1);
+  CHECK(RE2::PartialMatch("-1234", "(-\\d)", &i));
+  CHECK_EQ(i, -1);
+}
+
+TEST(RE2, FullMatchStringArg) {
+  string s;
+  // String-arg
+  CHECK(RE2::FullMatch("hello", "h(.*)o", &s));
+  CHECK_EQ(s, string("ell"));
+}
+
+TEST(RE2, FullMatchStringPieceArg) {
+  int i;
+  // StringPiece-arg
+  StringPiece sp;
+  CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &sp, &i));
+  CHECK_EQ(sp.size(), 4);
+  CHECK(memcmp(sp.data(), "ruby", 4) == 0);
+  CHECK_EQ(i, 1234);
+}
+
+TEST(RE2, FullMatchMultiArg) {
+  int i;
+  string s;
+  // Multi-arg
+  CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i));
+  CHECK_EQ(s, string("ruby"));
+  CHECK_EQ(i, 1234);
+}
+
+// FullMatchN with 0, 1 and 2 output arguments passed through an args array.
+TEST(RE2, FullMatchN) {
+  RE2::Arg argv[2];
+  const RE2::Arg* const args[2] = { &argv[0], &argv[1] };
+
+  // 0 arg
+  EXPECT_TRUE(RE2::FullMatchN("hello", "h.*o", args, 0));
+  EXPECT_FALSE(RE2::FullMatchN("othello", "h.*o", args, 0));
+
+  // 1 arg
+  int i;
+  argv[0] = &i;
+  EXPECT_TRUE(RE2::FullMatchN("1001", "(\\d+)", args, 1));
+  EXPECT_EQ(1001, i);
+  EXPECT_FALSE(RE2::FullMatchN("three", "(\\d+)", args, 1));
+
+  // Multi-arg
+  string s;
+  argv[1] = &s;
+  EXPECT_TRUE(RE2::FullMatchN("42:life", "(\\d+):(\\w+)", args, 2));
+  EXPECT_EQ(42, i);
+  EXPECT_EQ("life", s);
+  EXPECT_FALSE(RE2::FullMatchN("hi1", "(\\w+)(1)", args, 2));
+}
+
+TEST(RE2, FullMatchIgnoredArg) {
+  int i;
+  string s;
+  // Ignored arg
+  CHECK(RE2::FullMatch("ruby:1234", "(\\w+)(:)(\\d+)", &s, (void*)NULL, &i));
+  CHECK_EQ(s, string("ruby"));
+  CHECK_EQ(i, 1234);
+}
+
+TEST(RE2, FullMatchTypedNullArg) {
+  string s;
+
+  // Ignore non-void* NULL arg
+  CHECK(RE2::FullMatch("hello", "he(.*)lo", (char*)NULL));
+  CHECK(RE2::FullMatch("hello", "h(.*)o", (string*)NULL));
+  CHECK(RE2::FullMatch("hello", "h(.*)o", (StringPiece*)NULL));
+  CHECK(RE2::FullMatch("1234", "(.*)", (int*)NULL));
+  CHECK(RE2::FullMatch("1234567890123456", "(.*)", (long long*)NULL));
+  CHECK(RE2::FullMatch("123.4567890123456", "(.*)", (double*)NULL));
+  CHECK(RE2::FullMatch("123.4567890123456", "(.*)", (float*)NULL));
+
+  // Fail on non-void* NULL arg if the match doesn't parse for the given type.
+  CHECK(!RE2::FullMatch("hello", "h(.*)lo", &s, (char*)NULL));
+  CHECK(!RE2::FullMatch("hello", "(.*)", (int*)NULL));
+  CHECK(!RE2::FullMatch("1234567890123456", "(.*)", (int*)NULL));
+  CHECK(!RE2::FullMatch("hello", "(.*)", (double*)NULL));
+  CHECK(!RE2::FullMatch("hello", "(.*)", (float*)NULL));
+}
+
+// Check that numeric parsing code does not read past the end of
+// the number being parsed.
+// Two pages are mapped and the second is unmapped again, so the digit placed
+// in the last byte of the first page is immediately followed by an
+// inaccessible address.
+// NOTE(review): the cast target types were missing in this copy; char* is
+// restored from the declaration `char *v` -- confirm against upstream.
+TEST(RE2, NULTerminated) {
+  char *v;
+  int x;
+  long pagesize = sysconf(_SC_PAGE_SIZE);
+
+#ifndef MAP_ANONYMOUS
+#define MAP_ANONYMOUS MAP_ANON
+#endif
+  v = static_cast<char*>(mmap(NULL, 2*pagesize, PROT_READ|PROT_WRITE,
+                              MAP_ANONYMOUS|MAP_PRIVATE, -1, 0));
+  CHECK(v != reinterpret_cast<char*>(-1));
+  LOG(INFO) << "Memory at " << (void*)v;
+  CHECK_EQ(munmap(v + pagesize, pagesize), 0) << " error " << errno;
+  v[pagesize - 1] = '1';
+
+  x = 0;
+  CHECK(RE2::FullMatch(StringPiece(v + pagesize - 1, 1), "(.*)", &x));
+  CHECK_EQ(x, 1);
+}
+
+// Range checks for every integer width: the extreme values must parse and
+// the first value outside the range must be rejected.
+TEST(RE2, FullMatchTypeTests) {
+  // Type tests
+  string zeros(100, '0');
+  {
+    char c;
+    CHECK(RE2::FullMatch("Hello", "(H)ello", &c));
+    CHECK_EQ(c, 'H');
+  }
+  {
+    unsigned char c;
+    CHECK(RE2::FullMatch("Hello", "(H)ello", &c));
+    CHECK_EQ(c, static_cast<unsigned char>('H'));
+  }
+  {
+    int16 v;
+    CHECK(RE2::FullMatch("100", "(-?\\d+)", &v)); CHECK_EQ(v, 100);
+    CHECK(RE2::FullMatch("-100", "(-?\\d+)", &v)); CHECK_EQ(v, -100);
+    CHECK(RE2::FullMatch("32767", "(-?\\d+)", &v)); CHECK_EQ(v, 32767);
+    CHECK(RE2::FullMatch("-32768", "(-?\\d+)", &v)); CHECK_EQ(v, -32768);
+    CHECK(!RE2::FullMatch("-32769", "(-?\\d+)", &v));
+    CHECK(!RE2::FullMatch("32768", "(-?\\d+)", &v));
+  }
+  {
+    uint16 v;
+    CHECK(RE2::FullMatch("100", "(\\d+)", &v)); CHECK_EQ(v, 100);
+    CHECK(RE2::FullMatch("32767", "(\\d+)", &v)); CHECK_EQ(v, 32767);
+    CHECK(RE2::FullMatch("65535", "(\\d+)", &v)); CHECK_EQ(v, 65535);
+    CHECK(!RE2::FullMatch("65536", "(\\d+)", &v));
+  }
+  {
+    int32 v;
+    static const int32 max = 0x7fffffff;
+    static const int32 min = -max - 1;
+    CHECK(RE2::FullMatch("100", "(-?\\d+)", &v)); CHECK_EQ(v, 100);
+    CHECK(RE2::FullMatch("-100", "(-?\\d+)", &v)); CHECK_EQ(v, -100);
+    CHECK(RE2::FullMatch("2147483647", "(-?\\d+)", &v)); CHECK_EQ(v, max);
+    CHECK(RE2::FullMatch("-2147483648", "(-?\\d+)", &v)); CHECK_EQ(v, min);
+    CHECK(!RE2::FullMatch("-2147483649", "(-?\\d+)", &v));
+    CHECK(!RE2::FullMatch("2147483648", "(-?\\d+)", &v));
+
+    CHECK(RE2::FullMatch(zeros + "2147483647", "(-?\\d+)", &v));
+    CHECK_EQ(v, max);
+    CHECK(RE2::FullMatch("-" + zeros + "2147483648", "(-?\\d+)", &v));
+    CHECK_EQ(v, min);
+
+    CHECK(!RE2::FullMatch("-" + zeros + "2147483649", "(-?\\d+)", &v));
+    CHECK(RE2::FullMatch("0x7fffffff", "(.*)", RE2::CRadix(&v)));
+    CHECK_EQ(v, max);
+    CHECK(!RE2::FullMatch("000x7fffffff", "(.*)", RE2::CRadix(&v)));
+  }
+  {
+    uint32 v;
+    static const uint32 max = 0xfffffffful;
+    CHECK(RE2::FullMatch("100", "(\\d+)", &v)); CHECK_EQ(v, 100);
+    CHECK(RE2::FullMatch("4294967295", "(\\d+)", &v)); CHECK_EQ(v, max);
+    CHECK(!RE2::FullMatch("4294967296", "(\\d+)", &v));
+    CHECK(!RE2::FullMatch("-1", "(\\d+)", &v));
+
+    CHECK(RE2::FullMatch(zeros + "4294967295", "(\\d+)", &v)); CHECK_EQ(v, max);
+  }
+  {
+    int64 v;
+    static const int64 max = 0x7fffffffffffffffull;
+    static const int64 min = -max - 1;
+    char buf[32];
+
+    CHECK(RE2::FullMatch("100", "(-?\\d+)", &v)); CHECK_EQ(v, 100);
+    CHECK(RE2::FullMatch("-100", "(-?\\d+)", &v)); CHECK_EQ(v, -100);
+
+    snprintf(buf, sizeof(buf), "%lld", (long long int)max);
+    CHECK(RE2::FullMatch(buf, "(-?\\d+)", &v)); CHECK_EQ(v, max);
+
+    snprintf(buf, sizeof(buf), "%lld", (long long int)min);
+    CHECK(RE2::FullMatch(buf, "(-?\\d+)", &v)); CHECK_EQ(v, min);
+
+    // Bump the last digit to get the first out-of-range value; the assert
+    // guards against the increment turning '9' into a non-digit.
+    snprintf(buf, sizeof(buf), "%lld", (long long int)max);
+    assert(buf[strlen(buf)-1] != '9');
+    buf[strlen(buf)-1]++;
+    CHECK(!RE2::FullMatch(buf, "(-?\\d+)", &v));
+
+    snprintf(buf, sizeof(buf), "%lld", (long long int)min);
+    assert(buf[strlen(buf)-1] != '9');
+    buf[strlen(buf)-1]++;
+    CHECK(!RE2::FullMatch(buf, "(-?\\d+)", &v));
+  }
+  {
+    uint64 v;
+    int64 v2;
+    static const uint64 max = 0xffffffffffffffffull;
+    char buf[32];
+
+    CHECK(RE2::FullMatch("100", "(-?\\d+)", &v)); CHECK_EQ(v, 100);
+    CHECK(RE2::FullMatch("-100", "(-?\\d+)", &v2)); CHECK_EQ(v2, -100);
+
+    snprintf(buf, sizeof(buf), "%llu", (long long unsigned)max);
+    CHECK(RE2::FullMatch(buf, "(-?\\d+)", &v)); CHECK_EQ(v, max);
+
+    assert(buf[strlen(buf)-1] != '9');
+    buf[strlen(buf)-1]++;
+    CHECK(!RE2::FullMatch(buf, "(-?\\d+)", &v));
+  }
+}
+
+TEST(RE2, FloatingPointFullMatchTypes) {
+  string zeros(100, '0');
+  {
+    float v;
+    CHECK(RE2::FullMatch("100", "(.*)", &v)); CHECK_EQ(v, 100);
+    CHECK(RE2::FullMatch("-100.", "(.*)", &v)); CHECK_EQ(v, -100);
+    CHECK(RE2::FullMatch("1e23", "(.*)", &v)); CHECK_EQ(v, float(1e23));
+
+    CHECK(RE2::FullMatch(zeros + "1e23", "(.*)", &v));
+    CHECK_EQ(v, float(1e23));
+
+    // 6700000000081920.1 is an edge case.
+    // 6700000000081920 is exactly halfway between
+    // two float32s, so the .1 should make it round up.
+    // However, the .1 is outside the precision possible with
+    // a float64: the nearest float64 is 6700000000081920.
+    // So if the code uses strtod and then converts to float32,
+    // round-to-even will make it round down instead of up.
+    // To pass the test, the parser must call strtof directly.
+    // This test case is carefully chosen to use only a 17-digit
+    // number, since C does not guarantee to get the correctly
+    // rounded answer for strtod and strtof unless the input is
+    // short.
+    CHECK(RE2::FullMatch("0.1", "(.*)", &v));
+    CHECK_EQ(v, 0.1f) << StringPrintf("%.8g != %.8g", v, 0.1f);
+    CHECK(RE2::FullMatch("6700000000081920.1", "(.*)", &v));
+    CHECK_EQ(v, 6700000000081920.1f)
+      << StringPrintf("%.8g != %.8g", v, 6700000000081920.1f);
+  }
+  {
+    double v;
+    CHECK(RE2::FullMatch("100", "(.*)", &v)); CHECK_EQ(v, 100);
+    CHECK(RE2::FullMatch("-100.", "(.*)", &v)); CHECK_EQ(v, -100);
+    CHECK(RE2::FullMatch("1e23", "(.*)", &v)); CHECK_EQ(v, 1e23);
+    CHECK(RE2::FullMatch(zeros + "1e23", "(.*)", &v));
+    CHECK_EQ(v, double(1e23));
+
+    CHECK(RE2::FullMatch("0.1", "(.*)", &v));
+    CHECK_EQ(v, 0.1) << StringPrintf("%.17g != %.17g", v, 0.1);
+    CHECK(RE2::FullMatch("1.00000005960464485", "(.*)", &v));
+    CHECK_EQ(v, 1.0000000596046448)
+      << StringPrintf("%.17g != %.17g", v, 1.0000000596046448);
+  }
+}
+
+TEST(RE2, FullMatchAnchored) {
+  int i;
+  // Check that matching is fully anchored
+  CHECK(!RE2::FullMatch("x1001", "(\\d+)", &i));
+  CHECK(!RE2::FullMatch("1001x", "(\\d+)", &i));
+  CHECK(RE2::FullMatch("x1001", "x(\\d+)", &i)); CHECK_EQ(i, 1001);
+  CHECK(RE2::FullMatch("1001x", "(\\d+)x", &i)); CHECK_EQ(i, 1001);
+}
+
+TEST(RE2, FullMatchBraces) {
+  // Braces
+  CHECK(RE2::FullMatch("0abcd", "[0-9a-f+.-]{5,}"));
+  CHECK(RE2::FullMatch("0abcde", "[0-9a-f+.-]{5,}"));
+  CHECK(!RE2::FullMatch("0abc", "[0-9a-f+.-]{5,}"));
+}
+
+TEST(RE2, Complicated) {
+  // Complicated RE2
+  CHECK(RE2::FullMatch("foo", "foo|bar|[A-Z]"));
+  CHECK(RE2::FullMatch("bar", "foo|bar|[A-Z]"));
+  CHECK(RE2::FullMatch("X", "foo|bar|[A-Z]"));
+  CHECK(!RE2::FullMatch("XY", "foo|bar|[A-Z]"));
+}
+
+TEST(RE2, FullMatchEnd) {
+  // Check full-match handling (needs '$' tacked on internally)
+  CHECK(RE2::FullMatch("fo", "fo|foo"));
+  CHECK(RE2::FullMatch("foo", "fo|foo"));
+  CHECK(RE2::FullMatch("fo", "fo|foo$"));
+  CHECK(RE2::FullMatch("foo", "fo|foo$"));
+  CHECK(RE2::FullMatch("foo", "foo$"));
+  CHECK(!RE2::FullMatch("foo$bar", "foo\\$"));
+  CHECK(!RE2::FullMatch("fox", "fo|bar"));
+
+  // Uncomment the following if we change the handling of '$' to
+  // prevent it from matching a trailing newline
+  if (false) {
+    // Check that we don't get bitten by pcre's special handling of a
+    // '\n' at the end of the string matching '$'
+    CHECK(!RE2::PartialMatch("foo\n", "foo$"));
+  }
+}
+
+// FullMatch with 0..7 and 16 integer output arguments.
+// The array is cleared with sizeof(a) before every call so that a value left
+// over from the previous call can never satisfy a check.  (The earlier
+// sizeof(0) only cleared the first int.)
+TEST(RE2, FullMatchArgCount) {
+  // Number of args
+  int a[16];
+  CHECK(RE2::FullMatch("", ""));
+
+  memset(a, 0, sizeof(a));
+  CHECK(RE2::FullMatch("1",
+                       "(\\d){1}",
+                       &a[0]));
+  CHECK_EQ(a[0], 1);
+
+  memset(a, 0, sizeof(a));
+  CHECK(RE2::FullMatch("12",
+                       "(\\d)(\\d)",
+                       &a[0], &a[1]));
+  CHECK_EQ(a[0], 1);
+  CHECK_EQ(a[1], 2);
+
+  memset(a, 0, sizeof(a));
+  CHECK(RE2::FullMatch("123",
+                       "(\\d)(\\d)(\\d)",
+                       &a[0], &a[1], &a[2]));
+  CHECK_EQ(a[0], 1);
+  CHECK_EQ(a[1], 2);
+  CHECK_EQ(a[2], 3);
+
+  memset(a, 0, sizeof(a));
+  CHECK(RE2::FullMatch("1234",
+                       "(\\d)(\\d)(\\d)(\\d)",
+                       &a[0], &a[1], &a[2], &a[3]));
+  CHECK_EQ(a[0], 1);
+  CHECK_EQ(a[1], 2);
+  CHECK_EQ(a[2], 3);
+  CHECK_EQ(a[3], 4);
+
+  memset(a, 0, sizeof(a));
+  CHECK(RE2::FullMatch("12345",
+                       "(\\d)(\\d)(\\d)(\\d)(\\d)",
+                       &a[0], &a[1], &a[2], &a[3],
+                       &a[4]));
+  CHECK_EQ(a[0], 1);
+  CHECK_EQ(a[1], 2);
+  CHECK_EQ(a[2], 3);
+  CHECK_EQ(a[3], 4);
+  CHECK_EQ(a[4], 5);
+
+  memset(a, 0, sizeof(a));
+  CHECK(RE2::FullMatch("123456",
+                       "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)",
+                       &a[0], &a[1], &a[2], &a[3],
+                       &a[4], &a[5]));
+  CHECK_EQ(a[0], 1);
+  CHECK_EQ(a[1], 2);
+  CHECK_EQ(a[2], 3);
+  CHECK_EQ(a[3], 4);
+  CHECK_EQ(a[4], 5);
+  CHECK_EQ(a[5], 6);
+
+  memset(a, 0, sizeof(a));
+  CHECK(RE2::FullMatch("1234567",
+                       "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)",
+                       &a[0], &a[1], &a[2], &a[3],
+                       &a[4], &a[5], &a[6]));
+  CHECK_EQ(a[0], 1);
+  CHECK_EQ(a[1], 2);
+  CHECK_EQ(a[2], 3);
+  CHECK_EQ(a[3], 4);
+  CHECK_EQ(a[4], 5);
+  CHECK_EQ(a[5], 6);
+  CHECK_EQ(a[6], 7);
+
+  memset(a, 0, sizeof(a));
+  CHECK(RE2::FullMatch("1234567890123456",
+                       "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)"
+                       "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)",
+                       &a[0], &a[1], &a[2], &a[3],
+                       &a[4], &a[5], &a[6], &a[7],
+                       &a[8], &a[9], &a[10], &a[11],
+                       &a[12], &a[13], &a[14], &a[15]));
+  CHECK_EQ(a[0], 1);
+  CHECK_EQ(a[1], 2);
+  CHECK_EQ(a[2], 3);
+  CHECK_EQ(a[3], 4);
+  CHECK_EQ(a[4], 5);
+  CHECK_EQ(a[5], 6);
+  CHECK_EQ(a[6], 7);
+  CHECK_EQ(a[7], 8);
+  CHECK_EQ(a[8], 9);
+  CHECK_EQ(a[9], 0);
+  CHECK_EQ(a[10], 1);
+  CHECK_EQ(a[11], 2);
+  CHECK_EQ(a[12], 3);
+  CHECK_EQ(a[13], 4);
+  CHECK_EQ(a[14], 5);
+  CHECK_EQ(a[15], 6);
+}
+
+TEST(RE2, Accessors) {
+  // Check the pattern() accessor
+  {
+    const string kPattern = "http://([^/]+)/.*";
+    const RE2 re(kPattern);
+    CHECK_EQ(kPattern, re.pattern());
+  }
+
+  // Check RE2 error field.
+  {
+    RE2 re("foo");
+    CHECK(re.error().empty());  // Must have no error
+    CHECK(re.ok());
+    CHECK(re.error_code() == RE2::NoError);
+  }
+}
+
+TEST(RE2, UTF8) {
+  // Check UTF-8 handling
+  // Three Japanese characters (nihongo)
+  const char utf8_string[] = {
+       0xe6, 0x97, 0xa5, // 65e5
+       0xe6, 0x9c, 0xac, // 672c
+       0xe8, 0xaa, 0x9e, // 8a9e
+       0
+  };
+  const char utf8_pattern[] = {
+       '.',
+       0xe6, 0x9c, 0xac, // 672c
+       '.',
+       0
+  };
+
+  // Both should match in either mode, bytes or UTF-8
+  RE2 re_test1(".........", RE2::Latin1);
+  CHECK(RE2::FullMatch(utf8_string, re_test1));
+  RE2 re_test2("...");
+  CHECK(RE2::FullMatch(utf8_string, re_test2));
+
+  // Check that '.' matches one byte or UTF-8 character
+  // according to the mode.
+  string s;
+  RE2 re_test3("(.)", RE2::Latin1);
+  CHECK(RE2::PartialMatch(utf8_string, re_test3, &s));
+  CHECK_EQ(s, string("\xe6"));
+  RE2 re_test4("(.)");
+  CHECK(RE2::PartialMatch(utf8_string, re_test4, &s));
+  CHECK_EQ(s, string("\xe6\x97\xa5"));
+
+  // Check that string matches itself in either mode
+  RE2 re_test5(utf8_string, RE2::Latin1);
+  CHECK(RE2::FullMatch(utf8_string, re_test5));
+  RE2 re_test6(utf8_string);
+  CHECK(RE2::FullMatch(utf8_string, re_test6));
+
+  // Check that pattern matches string only in UTF8 mode
+  RE2 re_test7(utf8_pattern, RE2::Latin1);
+  CHECK(!RE2::FullMatch(utf8_string, re_test7));
+  RE2 re_test8(utf8_pattern);
+  CHECK(RE2::FullMatch(utf8_string, re_test8));
+}
+
+TEST(RE2, UngreedyUTF8) {
+  // Check that ungreedy, UTF8 regular expressions don't match when they
+  // oughtn't -- see bug 82246.
+  {
+    // This code always worked.
+    const char* pattern = "\\w+X";
+    const string target = "a aX";
+    RE2 match_sentence(pattern, RE2::Latin1);
+    RE2 match_sentence_re(pattern);
+
+    CHECK(!RE2::FullMatch(target, match_sentence));
+    CHECK(!RE2::FullMatch(target, match_sentence_re));
+  }
+  {
+    const char* pattern = "(?U)\\w+X";
+    const string target = "a aX";
+    RE2 match_sentence(pattern, RE2::Latin1);
+    CHECK_EQ(match_sentence.error(), "");
+    RE2 match_sentence_re(pattern);
+
+    CHECK(!RE2::FullMatch(target, match_sentence));
+    CHECK(!RE2::FullMatch(target, match_sentence_re));
+  }
+}
+
+// Each of these malformed patterns must leave the RE2 object in the !ok()
+// state.
+TEST(RE2, Rejects) {
+  { RE2 re("a\\1", RE2::Quiet); CHECK(!re.ok()); }
+  {
+    RE2 re("a[x", RE2::Quiet);
+    CHECK(!re.ok());
+  }
+  {
+    RE2 re("a[z-a]", RE2::Quiet);
+    CHECK(!re.ok());
+  }
+  {
+    RE2 re("a[[:foobar:]]", RE2::Quiet);
+    CHECK(!re.ok());
+  }
+  {
+    RE2 re("a(b", RE2::Quiet);
+    CHECK(!re.ok());
+  }
+  {
+    RE2 re("a\\", RE2::Quiet);
+    CHECK(!re.ok());
+  }
+}
+
+TEST(RE2, NoCrash) {
+  // Test that using a bad regexp doesn't crash.
+  {
+    RE2 re("a\\", RE2::Quiet);
+    CHECK(!re.ok());
+    CHECK(!RE2::PartialMatch("a\\b", re));
+  }
+
+  // Test that using an enormous regexp doesn't crash
+  {
+    RE2 re("(((.{100}){100}){100}){100}", RE2::Quiet);
+    CHECK(!re.ok());
+    CHECK(!RE2::PartialMatch("aaa", re));
+  }
+
+  // Test that a crazy regexp still compiles and runs.
+  {
+    RE2 re(".{512}x", RE2::Quiet);
+    CHECK(re.ok());
+    string s;
+    s.append(515, 'c');
+    s.append("x");
+    CHECK(RE2::PartialMatch(s, re));
+  }
+}
+
+TEST(RE2, Recursion) {
+  // Test that recursion is stopped.
+  // This test is PCRE-legacy -- there's no recursion in RE2.
+  int bytes = 15 * 1024;  // enough to crash PCRE
+  TestRecursion(bytes, ".");
+  TestRecursion(bytes, "a");
+  TestRecursion(bytes, "a.");
+  TestRecursion(bytes, "ab.");
+  TestRecursion(bytes, "abc.");
+}
+
+TEST(RE2, BigCountedRepetition) {
+  // Test that counted repetition works, given tons of memory.
+  RE2::Options opt;
+  opt.set_max_mem(256<<20);
+
+  RE2 re(".{512}x", opt);
+  CHECK(re.ok());
+  string s;
+  s.append(515, 'c');
+  s.append("x");
+  CHECK(RE2::PartialMatch(s, re));
+}
+
+TEST(RE2, DeepRecursion) {
+  // Test for deep stack recursion.  This would fail with a
+  // segmentation violation due to stack overflow before pcre was
+  // patched.
+  // Again, a PCRE legacy test.  RE2 doesn't recurse.
+  string comment("x*");
+  string a(131072, 'a');
+  comment += a;
+  comment += "*x";
+  RE2 re("((?:\\s|xx.*\n|x[*](?:\n|.)*?[*]x)*)");
+  CHECK(RE2::FullMatch(comment, re));
+}
+
+// Suggested by Josh Hyman.  Failed when SearchOnePass was
+// not implementing case-folding.
+TEST(CaseInsensitive, MatchAndConsume) {
+  string result;
+  string text = "A fish named *Wanda*";
+  StringPiece sp(text);
+
+  EXPECT_TRUE(RE2::PartialMatch(sp, "(?i)([wand]{5})", &result));
+  EXPECT_TRUE(RE2::FindAndConsume(&sp, "(?i)([wand]{5})", &result));
+}
+
+// RE2 should permit implicit conversions from string, StringPiece, const char*,
+// and C string literals.
+TEST(RE2, ImplicitConversions) {
+  // The same one-character pattern is supplied in each accepted form.
+  string re_string(".");
+  StringPiece re_stringpiece(".");
+  const char* re_cstring = ".";
+  EXPECT_TRUE(RE2::PartialMatch("e", re_string));
+  EXPECT_TRUE(RE2::PartialMatch("e", re_stringpiece));
+  EXPECT_TRUE(RE2::PartialMatch("e", re_cstring));
+  EXPECT_TRUE(RE2::PartialMatch("e", "."));
+}
+
+// Bugs introduced by 8622304
+TEST(RE2, CL8622304) {
+  // reported by ingow
+  string dir;
+  EXPECT_TRUE(RE2::FullMatch("D", "([^\\\\])"));  // ok
+  EXPECT_TRUE(RE2::FullMatch("D", "([^\\\\])", &dir));  // fails
+
+  // reported by jacobsa
+  string key, val;
+  EXPECT_TRUE(RE2::PartialMatch("bar:1,0x2F,030,4,5;baz:true;fooby:false,true",
+                                "(\\w+)(?::((?:[^;\\\\]|\\\\.)*))?;?",
+                                &key,
+                                &val));
+  EXPECT_EQ(key, "bar");
+  EXPECT_EQ(val, "1,0x2F,030,4,5");
+}
+
+
+// Check that RE2 returns correct regexp pieces on error.
+// In particular, make sure it returns whole runes
+// and that it always reports invalid UTF-8.
+// Also check that Perl error flag piece is big enough.
+// Each entry pairs a malformed regexp with the error_arg() text expected
+// for it.
+static struct ErrorTest {
+  const char *regexp;
+  const char *error;
+} error_tests[] = {
+  { "ab\\αcd", "\\α" },
+  { "ef\\x☺01", "\\x☺0" },
+  { "gh\\x1☺01", "\\x1☺" },
+  { "ij\\x1", "\\x1" },
+  { "kl\\x", "\\x" },
+  { "uv\\x{0000☺}", "\\x{0000☺" },
+  { "wx\\p{ABC", "\\p{ABC" },
+  { "yz(?smiUX:abc)", "(?smiUX" },  // used to return (?s but the error is X
+  { "aa(?sm☺i", "(?sm☺" },
+  { "bb[abc", "[abc" },
+
+  { "mn\\x1\377", "" },  // no argument string returned for invalid UTF-8
+  { "op\377qr", "" },
+  { "st\\x{00000\377", "" },
+  { "zz\\p{\377}", "" },
+  { "zz\\x{00\377}", "" },
+  // NOTE(review): unlike its neighbours this entry contains no \377 byte;
+  // a "<name...>" piece after "(?P" was presumably lost when this copy of
+  // the patch was extracted -- confirm against upstream before relying on it.
+  { "zz(?Pabc)", "" },
+};
+TEST(RE2, ErrorArgs) {
+  for (int i = 0; i < arraysize(error_tests); i++) {
+    // RE2::Quiet is passed because every pattern here is expected to fail.
+    RE2 re(error_tests[i].regexp, RE2::Quiet);
+    EXPECT_FALSE(re.ok());
+    EXPECT_EQ(re.error_arg(), error_tests[i].error) << re.error();
+  }
+}
+
+// Check that "never match \n" mode never matches \n.
+// Each entry: a regexp, the text to search, and the expected contents of the
+// first capture group, or NULL when no match is expected.
+static struct NeverTest {
+  const char* regexp;
+  const char* text;
+  const char* match;
+} never_tests[] = {
+  { "(.*)", "abc\ndef\nghi\n", "abc" },
+  { "(?s)(abc.*def)", "abc\ndef\n", NULL },
+  { "(abc(.|\n)*def)", "abc\ndef\n", NULL },
+  { "(abc[^x]*def)", "abc\ndef\n", NULL },
+  { "(abc[^x]*def)", "abczzzdef\ndef\n", "abczzzdef" },
+};
+TEST(RE2, NeverNewline) {
+  RE2::Options opt;
+  opt.set_never_nl(true);
+  for (int i = 0; i < arraysize(never_tests); i++) {
+    const NeverTest& t = never_tests[i];
+    RE2 re(t.regexp, opt);
+    if (t.match == NULL) {
+      // No match expected at all.
+      EXPECT_FALSE(re.PartialMatch(t.text, re));
+    } else {
+      // Match expected; the captured piece must equal t.match.
+      StringPiece m;
+      EXPECT_TRUE(re.PartialMatch(t.text, re, &m));
+      EXPECT_EQ(m, t.match);
+    }
+  }
+}
+
+// Check that there are no capturing groups in "never capture" mode.
+TEST(RE2, NeverCapture) {
+  RE2::Options opt;
+  opt.set_never_capture(true);
+  RE2 re("(r)(e)", opt);
+  EXPECT_EQ(0, re.NumberOfCapturingGroups());
+}
+
+// Bitstate bug was looking at submatch[0] even if nsubmatch == 0.
+// Triggered by a failed DFA search falling back to Bitstate when
+// using Match with a NULL submatch set.  Bitstate tried to read
+// the submatch[0] entry even if nsubmatch was 0.
+TEST(RE2, BitstateCaptureBug) {
+  RE2::Options opt;
+  // NOTE(review): the small memory budget is presumably what makes the DFA
+  // search fail and fall back, as described above -- confirm.
+  opt.set_max_mem(20000);
+  RE2 re("(_________$)", opt);
+  StringPiece s = "xxxxxxxxxxxxxxxxxxxxxxxxxx_________x";
+  EXPECT_FALSE(re.Match(s, 0, s.size(), RE2::UNANCHORED, NULL, 0));
+}
+
+// C++ version of bug 609710.
+TEST(RE2, UnicodeClasses) {
+  const string str = "ABCDEFGHI譚永鋒";
+  string a, b, c;
+
+  // \p{...} / \P{...} against an ASCII uppercase letter.
+  EXPECT_TRUE(RE2::FullMatch("A", "\\p{L}"));
+  EXPECT_TRUE(RE2::FullMatch("A", "\\p{Lu}"));
+  EXPECT_FALSE(RE2::FullMatch("A", "\\p{Ll}"));
+  EXPECT_FALSE(RE2::FullMatch("A", "\\P{L}"));
+  EXPECT_FALSE(RE2::FullMatch("A", "\\P{Lu}"));
+  EXPECT_TRUE(RE2::FullMatch("A", "\\P{Ll}"));
+
+  // The same six checks against each of the three CJK characters: a letter,
+  // but neither upper- nor lowercase.
+  EXPECT_TRUE(RE2::FullMatch("譚", "\\p{L}"));
+  EXPECT_FALSE(RE2::FullMatch("譚", "\\p{Lu}"));
+  EXPECT_FALSE(RE2::FullMatch("譚", "\\p{Ll}"));
+  EXPECT_FALSE(RE2::FullMatch("譚", "\\P{L}"));
+  EXPECT_TRUE(RE2::FullMatch("譚", "\\P{Lu}"));
+  EXPECT_TRUE(RE2::FullMatch("譚", "\\P{Ll}"));
+
+  EXPECT_TRUE(RE2::FullMatch("永", "\\p{L}"));
+  EXPECT_FALSE(RE2::FullMatch("永", "\\p{Lu}"));
+  EXPECT_FALSE(RE2::FullMatch("永", "\\p{Ll}"));
+  EXPECT_FALSE(RE2::FullMatch("永", "\\P{L}"));
+  EXPECT_TRUE(RE2::FullMatch("永", "\\P{Lu}"));
+  EXPECT_TRUE(RE2::FullMatch("永", "\\P{Ll}"));
+
+  EXPECT_TRUE(RE2::FullMatch("鋒", "\\p{L}"));
+  EXPECT_FALSE(RE2::FullMatch("鋒", "\\p{Lu}"));
+  EXPECT_FALSE(RE2::FullMatch("鋒", "\\p{Ll}"));
+  EXPECT_FALSE(RE2::FullMatch("鋒", "\\P{L}"));
+  EXPECT_TRUE(RE2::FullMatch("鋒", "\\P{Lu}"));
+  EXPECT_TRUE(RE2::FullMatch("鋒", "\\P{Ll}"));
+
+  EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?(.).*?(.)", &a, &b, &c));
+  EXPECT_EQ("A", a);
+  EXPECT_EQ("B", b);
+  EXPECT_EQ("C", c);
+
+  EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?([\\p{L}]).*?(.)", &a, &b, &c));
+  EXPECT_EQ("A", a);
+  EXPECT_EQ("B", b);
+  EXPECT_EQ("C", c);
+
+  EXPECT_FALSE(RE2::PartialMatch(str, "\\P{L}"));
+
+  EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?([\\p{Lu}]).*?(.)", &a, &b, &c));
+  EXPECT_EQ("A", a);
+  EXPECT_EQ("B", b);
+  EXPECT_EQ("C", c);
+
+  EXPECT_FALSE(RE2::PartialMatch(str, "[^\\p{Lu}\\p{Lo}]"));
+
+  EXPECT_TRUE(RE2::PartialMatch(str, ".*(.).*?([\\p{Lu}\\p{Lo}]).*?(.)", &a, &b, &c));
+  EXPECT_EQ("譚", a);
+  EXPECT_EQ("永", b);
+  EXPECT_EQ("鋒", c);
+}
+
+// Bug reported by saito. 2009/02/17
+// An empty StringPiece and a default-constructed one must both match ".*".
+TEST(RE2, NullVsEmptyString) {
+  RE2 re2(".*");
+  StringPiece v1("");
+  EXPECT_TRUE(RE2::FullMatch(v1, re2));
+
+  StringPiece v2;
+  EXPECT_TRUE(RE2::FullMatch(v2, re2));
+}
+
+// Issue 1816809
+TEST(RE2, Bug1816809) {
+  RE2 re("(((((llx((-3)|(4)))(;(llx((-3)|(4))))*))))");
+  StringPiece piece("llx-3;llx4");
+  string x;
+  EXPECT_TRUE(RE2::Consume(&piece, re, &x));
+}
+
+// Issue 3061120
+TEST(RE2, Bug3061120) {
+  RE2 re("(?i)\\W");
+  EXPECT_FALSE(RE2::PartialMatch("x", re));  // always worked
+  EXPECT_FALSE(RE2::PartialMatch("k", re));  // broke because of kelvin
+  EXPECT_FALSE(RE2::PartialMatch("s", re));  // broke because of latin long s
+}
+
+// CapturingGroupNames() maps group index -> name for the named groups only.
+// NOTE(review): the map template arguments and the <G2>/<G1> group names were
+// missing in this copy; they are restored from the `want` entries and the
+// group-ID comment below -- confirm against upstream.
+TEST(RE2, CapturingGroupNames) {
+  // Opening parentheses annotated with group IDs:
+  //      12    3        45   6         7
+  RE2 re("((abc)(?P<G2>)|((e+)(?P<G2>.*)(?P<G1>u+)))");
+  EXPECT_TRUE(re.ok());
+  const map<int, string>& have = re.CapturingGroupNames();
+  map<int, string> want;
+  want[3] = "G2";
+  want[6] = "G2";
+  want[7] = "G1";
+  EXPECT_EQ(want, have);
+}
+
+// Round-trips anchored patterns through Regexp()->ToString(), in POSIX mode
+// and in the default mode, and checks the anchors are still present.
+TEST(RE2, RegexpToStringLossOfAnchor) {
+  EXPECT_EQ(RE2("^[a-c]at", RE2::POSIX).Regexp()->ToString(), "^[a-c]at");
+  EXPECT_EQ(RE2("^[a-c]at").Regexp()->ToString(), "(?-m:^)[a-c]at");
+  EXPECT_EQ(RE2("ca[t-z]$", RE2::POSIX).Regexp()->ToString(), "ca[t-z]$");
+  EXPECT_EQ(RE2("ca[t-z]$").Regexp()->ToString(), "ca[t-z](?-m:$)");
+}
+
+}  // namespace re2
diff --git a/re2/testing/regexp_benchmark.cc b/re2/testing/regexp_benchmark.cc
new file mode 100644
index 0000000..ca7627f
--- /dev/null
+++ b/re2/testing/regexp_benchmark.cc
@@ -0,0 +1,1461 @@
+// Copyright 2006-2008 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Benchmarks for regular expression implementations.
+ +#include "util/test.h" +#include "re2/prog.h" +#include "re2/re2.h" +#include "re2/regexp.h" +#include "util/pcre.h" +#include "util/benchmark.h" + +namespace re2 { +void Test(); +void MemoryUsage(); +} // namespace re2 + +typedef testing::MallocCounter MallocCounter; + +namespace re2 { + +void Test() { + Regexp* re = Regexp::Parse("(\\d+)-(\\d+)-(\\d+)", Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + CHECK(prog->IsOnePass()); + const char* text = "650-253-0001"; + StringPiece sp[4]; + CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); + CHECK_EQ(sp[0], "650-253-0001"); + CHECK_EQ(sp[1], "650"); + CHECK_EQ(sp[2], "253"); + CHECK_EQ(sp[3], "0001"); + delete prog; + re->Decref(); + LOG(INFO) << "test passed\n"; +} + +void MemoryUsage() { + const char* regexp = "(\\d+)-(\\d+)-(\\d+)"; + const char* text = "650-253-0001"; + { + MallocCounter mc(MallocCounter::THIS_THREAD_ONLY); + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + // Can't pass mc.HeapGrowth() and mc.PeakHeapGrowth() to LOG(INFO) directly, + // because LOG(INFO) might do a big allocation before they get evaluated. 
+ fprintf(stderr, "Regexp: %7lld bytes (peak=%lld)\n", mc.HeapGrowth(), mc.PeakHeapGrowth()); + mc.Reset(); + + Prog* prog = re->CompileToProg(0); + CHECK(prog); + CHECK(prog->IsOnePass()); + fprintf(stderr, "Prog: %7lld bytes (peak=%lld)\n", mc.HeapGrowth(), mc.PeakHeapGrowth()); + mc.Reset(); + + StringPiece sp[4]; + CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); + fprintf(stderr, "Search: %7lld bytes (peak=%lld)\n", mc.HeapGrowth(), mc.PeakHeapGrowth()); + delete prog; + re->Decref(); + } + + { + MallocCounter mc(MallocCounter::THIS_THREAD_ONLY); + + PCRE re(regexp, PCRE::UTF8); + fprintf(stderr, "RE: %7lld bytes (peak=%lld)\n", mc.HeapGrowth(), mc.PeakHeapGrowth()); + PCRE::FullMatch(text, re); + fprintf(stderr, "RE: %7lld bytes (peak=%lld)\n", mc.HeapGrowth(), mc.PeakHeapGrowth()); + } + + { + MallocCounter mc(MallocCounter::THIS_THREAD_ONLY); + + PCRE* re = new PCRE(regexp, PCRE::UTF8); + fprintf(stderr, "PCRE*: %7lld bytes (peak=%lld)\n", mc.HeapGrowth(), mc.PeakHeapGrowth()); + PCRE::FullMatch(text, *re); + fprintf(stderr, "PCRE*: %7lld bytes (peak=%lld)\n", mc.HeapGrowth(), mc.PeakHeapGrowth()); + delete re; + } + + { + MallocCounter mc(MallocCounter::THIS_THREAD_ONLY); + + RE2 re(regexp); + fprintf(stderr, "RE2: %7lld bytes (peak=%lld)\n", mc.HeapGrowth(), mc.PeakHeapGrowth()); + RE2::FullMatch(text, re); + fprintf(stderr, "RE2: %7lld bytes (peak=%lld)\n", mc.HeapGrowth(), mc.PeakHeapGrowth()); + } + + fprintf(stderr, "sizeof: PCRE=%d RE2=%d Prog=%d Inst=%d\n", + static_cast(sizeof(PCRE)), + static_cast(sizeof(RE2)), + static_cast(sizeof(Prog)), + static_cast(sizeof(Prog::Inst))); +} + +// Regular expression implementation wrappers. +// Defined at bottom of file, but they are repetitive +// and not interesting. 
+ +typedef void SearchImpl(int iters, const char* regexp, const StringPiece& text, + Prog::Anchor anchor, bool expect_match); + +SearchImpl SearchDFA, SearchNFA, SearchOnePass, SearchBitState, + SearchPCRE, SearchRE2, + SearchCachedDFA, SearchCachedNFA, SearchCachedOnePass, SearchCachedBitState, + SearchCachedPCRE, SearchCachedRE2; + +typedef void ParseImpl(int iters, const char* regexp, const StringPiece& text); + +ParseImpl Parse1NFA, Parse1OnePass, Parse1BitState, + Parse1PCRE, Parse1RE2, + Parse1Backtrack, + Parse1CachedNFA, Parse1CachedOnePass, Parse1CachedBitState, + Parse1CachedPCRE, Parse1CachedRE2, + Parse1CachedBacktrack; + +ParseImpl Parse3NFA, Parse3OnePass, Parse3BitState, + Parse3PCRE, Parse3RE2, + Parse3Backtrack, + Parse3CachedNFA, Parse3CachedOnePass, Parse3CachedBitState, + Parse3CachedPCRE, Parse3CachedRE2, + Parse3CachedBacktrack; + +ParseImpl SearchParse2CachedPCRE, SearchParse2CachedRE2; + +ParseImpl SearchParse1CachedPCRE, SearchParse1CachedRE2; + +// Benchmark: failed search for regexp in random text. + +// Generate random text that won't contain the search string, +// to test worst-case search behavior. +void MakeText(string* text, int nbytes) { + text->resize(nbytes); + srand(0); + for (int i = 0; i < nbytes; i++) { + if (!rand()%30) + (*text)[i] = '\n'; + else + (*text)[i] = rand()%(0x7E + 1 - 0x20)+0x20; + } +} + +// Makes text of size nbytes, then calls run to search +// the text for regexp iters times. +void Search(int iters, int nbytes, const char* regexp, SearchImpl* search) { + StopBenchmarkTiming(); + string s; + MakeText(&s, nbytes); + BenchmarkMemoryUsage(); + StartBenchmarkTiming(); + search(iters, regexp, s, Prog::kUnanchored, false); + SetBenchmarkBytesProcessed(static_cast(iters)*nbytes); +} + +// These two are easy because they start with an A, +// giving the search loop something to memchr for. 
+#define EASY0 "ABCDEFGHIJKLMNOPQRSTUVWXYZ$" +#define EASY1 "A[AB]B[BC]C[CD]D[DE]E[EF]F[FG]G[GH]H[HI]I[IJ]J$" + +// This is a little harder, since it starts with a character class +// and thus can't be memchr'ed. Could look for ABC and work backward, +// but no one does that. +#define MEDIUM "[XYZ]ABCDEFGHIJKLMNOPQRSTUVWXYZ$" + +// This is a fair amount harder, because of the leading [ -~]*. +// A bad backtracking implementation will take O(text^2) time to +// figure out there's no match. +#define HARD "[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ$" + +// This stresses engines that are trying to track parentheses. +#define PARENS "([ -~])*(A)(B)(C)(D)(E)(F)(G)(H)(I)(J)(K)(L)(M)" \ + "(N)(O)(P)(Q)(R)(S)(T)(U)(V)(W)(X)(Y)(Z)$" + +void Search_Easy0_CachedDFA(int i, int n) { Search(i, n, EASY0, SearchCachedDFA); } +void Search_Easy0_CachedNFA(int i, int n) { Search(i, n, EASY0, SearchCachedNFA); } +void Search_Easy0_CachedPCRE(int i, int n) { Search(i, n, EASY0, SearchCachedPCRE); } +void Search_Easy0_CachedRE2(int i, int n) { Search(i, n, EASY0, SearchCachedRE2); } + +BENCHMARK_RANGE(Search_Easy0_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); +BENCHMARK_RANGE(Search_Easy0_CachedNFA, 8, 256<<10)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK_RANGE(Search_Easy0_CachedPCRE, 8, 16<<20)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK_RANGE(Search_Easy0_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs()); + +void Search_Easy1_CachedDFA(int i, int n) { Search(i, n, EASY1, SearchCachedDFA); } +void Search_Easy1_CachedNFA(int i, int n) { Search(i, n, EASY1, SearchCachedNFA); } +void Search_Easy1_CachedPCRE(int i, int n) { Search(i, n, EASY1, SearchCachedPCRE); } +void Search_Easy1_CachedRE2(int i, int n) { Search(i, n, EASY1, SearchCachedRE2); } + +BENCHMARK_RANGE(Search_Easy1_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); +BENCHMARK_RANGE(Search_Easy1_CachedNFA, 8, 256<<10)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK_RANGE(Search_Easy1_CachedPCRE, 8, 
16<<20)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK_RANGE(Search_Easy1_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs()); + +void Search_Medium_CachedDFA(int i, int n) { Search(i, n, MEDIUM, SearchCachedDFA); } +void Search_Medium_CachedNFA(int i, int n) { Search(i, n, MEDIUM, SearchCachedNFA); } +void Search_Medium_CachedPCRE(int i, int n) { Search(i, n, MEDIUM, SearchCachedPCRE); } +void Search_Medium_CachedRE2(int i, int n) { Search(i, n, MEDIUM, SearchCachedRE2); } + +BENCHMARK_RANGE(Search_Medium_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); +BENCHMARK_RANGE(Search_Medium_CachedNFA, 8, 256<<10)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK_RANGE(Search_Medium_CachedPCRE, 8, 256<<10)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK_RANGE(Search_Medium_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs()); + +void Search_Hard_CachedDFA(int i, int n) { Search(i, n, HARD, SearchCachedDFA); } +void Search_Hard_CachedNFA(int i, int n) { Search(i, n, HARD, SearchCachedNFA); } +void Search_Hard_CachedPCRE(int i, int n) { Search(i, n, HARD, SearchCachedPCRE); } +void Search_Hard_CachedRE2(int i, int n) { Search(i, n, HARD, SearchCachedRE2); } + +BENCHMARK_RANGE(Search_Hard_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); +BENCHMARK_RANGE(Search_Hard_CachedNFA, 8, 256<<10)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK_RANGE(Search_Hard_CachedPCRE, 8, 4<<10)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK_RANGE(Search_Hard_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs()); + +void Search_Parens_CachedDFA(int i, int n) { Search(i, n, PARENS, SearchCachedDFA); } +void Search_Parens_CachedNFA(int i, int n) { Search(i, n, PARENS, SearchCachedNFA); } +void Search_Parens_CachedPCRE(int i, int n) { Search(i, n, PARENS, SearchCachedPCRE); } +void Search_Parens_CachedRE2(int i, int n) { Search(i, n, PARENS, SearchCachedRE2); } + +BENCHMARK_RANGE(Search_Parens_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); +BENCHMARK_RANGE(Search_Parens_CachedNFA, 8, 
256<<10)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK_RANGE(Search_Parens_CachedPCRE, 8, 8)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK_RANGE(Search_Parens_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs()); + +void SearchBigFixed(int iters, int nbytes, SearchImpl* search) { + StopBenchmarkTiming(); + string s; + s.append(nbytes/2, 'x'); + string regexp = "^" + s + ".*$"; + string t; + MakeText(&t, nbytes/2); + s += t; + BenchmarkMemoryUsage(); + StartBenchmarkTiming(); + search(iters, regexp.c_str(), s, Prog::kUnanchored, true); + SetBenchmarkBytesProcessed(static_cast(iters)*nbytes); +} + +void Search_BigFixed_CachedDFA(int i, int n) { SearchBigFixed(i, n, SearchCachedDFA); } +void Search_BigFixed_CachedNFA(int i, int n) { SearchBigFixed(i, n, SearchCachedNFA); } +void Search_BigFixed_CachedPCRE(int i, int n) { SearchBigFixed(i, n, SearchCachedPCRE); } +void Search_BigFixed_CachedRE2(int i, int n) { SearchBigFixed(i, n, SearchCachedRE2); } + +BENCHMARK_RANGE(Search_BigFixed_CachedDFA, 8, 1<<20)->ThreadRange(1, NumCPUs()); +BENCHMARK_RANGE(Search_BigFixed_CachedNFA, 8, 32<<10)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK_RANGE(Search_BigFixed_CachedPCRE, 8, 32<<10)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK_RANGE(Search_BigFixed_CachedRE2, 8, 1<<20)->ThreadRange(1, NumCPUs()); + +// Benchmark: FindAndConsume +void FindAndConsume(int iters, int nbytes) { + StopBenchmarkTiming(); + string s; + MakeText(&s, nbytes); + s.append("Hello World"); + StartBenchmarkTiming(); + RE2 re("((Hello World))"); + for (int i = 0; i < iters; i++) { + StringPiece t = s; + StringPiece u; + CHECK(RE2::FindAndConsume(&t, re, &u)); + CHECK_EQ(u, "Hello World"); + } + SetBenchmarkBytesProcessed(static_cast(iters)*nbytes); +} + +BENCHMARK_RANGE(FindAndConsume, 8, 16<<20)->ThreadRange(1, NumCPUs()); + +// Benchmark: successful anchored search. 
+ +void SearchSuccess(int iters, int nbytes, const char* regexp, SearchImpl* search) { + string s; + MakeText(&s, nbytes); + BenchmarkMemoryUsage(); + search(iters, regexp, s, Prog::kAnchored, true); + SetBenchmarkBytesProcessed(static_cast(iters)*nbytes); +} + +// Unambiguous search (RE2 can use OnePass). + +void Search_Success_DFA(int i, int n) { SearchSuccess(i, n, ".*$", SearchDFA); } +void Search_Success_OnePass(int i, int n) { SearchSuccess(i, n, ".*$", SearchOnePass); } +void Search_Success_PCRE(int i, int n) { SearchSuccess(i, n, ".*$", SearchPCRE); } +void Search_Success_RE2(int i, int n) { SearchSuccess(i, n, ".*$", SearchRE2); } + +BENCHMARK_RANGE(Search_Success_DFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK_RANGE(Search_Success_PCRE, 8, 16<<20)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK_RANGE(Search_Success_RE2, 8, 16<<20)->ThreadRange(1, NumCPUs()); +BENCHMARK_RANGE(Search_Success_OnePass, 8, 2<<20)->ThreadRange(1, NumCPUs()); + +void Search_Success_CachedDFA(int i, int n) { SearchSuccess(i, n, ".*$", SearchCachedDFA); } +void Search_Success_CachedOnePass(int i, int n) { SearchSuccess(i, n, ".*$", SearchCachedOnePass); } +void Search_Success_CachedPCRE(int i, int n) { SearchSuccess(i, n, ".*$", SearchCachedPCRE); } +void Search_Success_CachedRE2(int i, int n) { SearchSuccess(i, n, ".*$", SearchCachedRE2); } + +BENCHMARK_RANGE(Search_Success_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK_RANGE(Search_Success_CachedPCRE, 8, 16<<20)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK_RANGE(Search_Success_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs()); +BENCHMARK_RANGE(Search_Success_CachedOnePass, 8, 2<<20)->ThreadRange(1, NumCPUs()); + +// Ambiguous search (RE2 cannot use OnePass). 
+ +void Search_Success1_DFA(int i, int n) { SearchSuccess(i, n, ".*.$", SearchDFA); } +void Search_Success1_PCRE(int i, int n) { SearchSuccess(i, n, ".*.$", SearchPCRE); } +void Search_Success1_RE2(int i, int n) { SearchSuccess(i, n, ".*.$", SearchRE2); } +void Search_Success1_BitState(int i, int n) { SearchSuccess(i, n, ".*.$", SearchBitState); } + +BENCHMARK_RANGE(Search_Success1_DFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK_RANGE(Search_Success1_PCRE, 8, 16<<20)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK_RANGE(Search_Success1_RE2, 8, 16<<20)->ThreadRange(1, NumCPUs()); +BENCHMARK_RANGE(Search_Success1_BitState, 8, 2<<20)->ThreadRange(1, NumCPUs()); + +void Search_Success1_Cached_DFA(int i, int n) { SearchSuccess(i, n, ".*.$", SearchCachedDFA); } +void Search_Success1_Cached_PCRE(int i, int n) { SearchSuccess(i, n, ".*.$", SearchCachedPCRE); } +void Search_Success1_Cached_RE2(int i, int n) { SearchSuccess(i, n, ".*.$", SearchCachedRE2); } + +BENCHMARK_RANGE(Search_Success1_Cached_DFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK_RANGE(Search_Success1_Cached_PCRE, 8, 16<<20)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK_RANGE(Search_Success1_Cached_RE2, 8, 16<<20)->ThreadRange(1, NumCPUs()); + +// Benchmark: use regexp to find phone number. 
+ +void SearchDigits(int iters, SearchImpl* search) { + const char *text = "650-253-0001"; + int len = strlen(text); + BenchmarkMemoryUsage(); + search(iters, "([0-9]+)-([0-9]+)-([0-9]+)", + StringPiece(text, len), Prog::kAnchored, true); + SetBenchmarkItemsProcessed(iters); +} + +void Search_Digits_DFA(int i) { SearchDigits(i, SearchDFA); } +void Search_Digits_NFA(int i) { SearchDigits(i, SearchNFA); } +void Search_Digits_OnePass(int i) { SearchDigits(i, SearchOnePass); } +void Search_Digits_PCRE(int i) { SearchDigits(i, SearchPCRE); } +void Search_Digits_RE2(int i) { SearchDigits(i, SearchRE2); } +void Search_Digits_BitState(int i) { SearchDigits(i, SearchBitState); } + +BENCHMARK(Search_Digits_DFA)->ThreadRange(1, NumCPUs()); +BENCHMARK(Search_Digits_NFA)->ThreadRange(1, NumCPUs()); +BENCHMARK(Search_Digits_OnePass)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK(Search_Digits_PCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(Search_Digits_RE2)->ThreadRange(1, NumCPUs()); +BENCHMARK(Search_Digits_BitState)->ThreadRange(1, NumCPUs()); + +// Benchmark: use regexp to parse digit fields in phone number. 
+ +void Parse3Digits(int iters, + void (*parse3)(int, const char*, const StringPiece&)) { + BenchmarkMemoryUsage(); + parse3(iters, "([0-9]+)-([0-9]+)-([0-9]+)", "650-253-0001"); + SetBenchmarkItemsProcessed(iters); +} + +void Parse_Digits_NFA(int i) { Parse3Digits(i, Parse3NFA); } +void Parse_Digits_OnePass(int i) { Parse3Digits(i, Parse3OnePass); } +void Parse_Digits_PCRE(int i) { Parse3Digits(i, Parse3PCRE); } +void Parse_Digits_RE2(int i) { Parse3Digits(i, Parse3RE2); } +void Parse_Digits_Backtrack(int i) { Parse3Digits(i, Parse3Backtrack); } +void Parse_Digits_BitState(int i) { Parse3Digits(i, Parse3BitState); } + +BENCHMARK(Parse_Digits_NFA)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_Digits_OnePass)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK(Parse_Digits_PCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(Parse_Digits_RE2)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_Digits_Backtrack)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_Digits_BitState)->ThreadRange(1, NumCPUs()); + +void Parse_CachedDigits_NFA(int i) { Parse3Digits(i, Parse3CachedNFA); } +void Parse_CachedDigits_OnePass(int i) { Parse3Digits(i, Parse3CachedOnePass); } +void Parse_CachedDigits_PCRE(int i) { Parse3Digits(i, Parse3CachedPCRE); } +void Parse_CachedDigits_RE2(int i) { Parse3Digits(i, Parse3CachedRE2); } +void Parse_CachedDigits_Backtrack(int i) { Parse3Digits(i, Parse3CachedBacktrack); } +void Parse_CachedDigits_BitState(int i) { Parse3Digits(i, Parse3CachedBitState); } + +BENCHMARK(Parse_CachedDigits_NFA)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_CachedDigits_OnePass)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK(Parse_CachedDigits_PCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(Parse_CachedDigits_Backtrack)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_CachedDigits_RE2)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_CachedDigits_BitState)->ThreadRange(1, NumCPUs()); + +void Parse3DigitDs(int iters, + void (*parse3)(int, const char*, const StringPiece&)) { + 
BenchmarkMemoryUsage(); + parse3(iters, "(\\d+)-(\\d+)-(\\d+)", "650-253-0001"); + SetBenchmarkItemsProcessed(iters); +} + +void Parse_DigitDs_NFA(int i) { Parse3DigitDs(i, Parse3NFA); } +void Parse_DigitDs_OnePass(int i) { Parse3DigitDs(i, Parse3OnePass); } +void Parse_DigitDs_PCRE(int i) { Parse3DigitDs(i, Parse3PCRE); } +void Parse_DigitDs_RE2(int i) { Parse3DigitDs(i, Parse3RE2); } +void Parse_DigitDs_Backtrack(int i) { Parse3DigitDs(i, Parse3CachedBacktrack); } +void Parse_DigitDs_BitState(int i) { Parse3DigitDs(i, Parse3CachedBitState); } + +BENCHMARK(Parse_DigitDs_NFA)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_DigitDs_OnePass)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK(Parse_DigitDs_PCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(Parse_DigitDs_RE2)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_DigitDs_Backtrack)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_DigitDs_BitState)->ThreadRange(1, NumCPUs()); + +void Parse_CachedDigitDs_NFA(int i) { Parse3DigitDs(i, Parse3CachedNFA); } +void Parse_CachedDigitDs_OnePass(int i) { Parse3DigitDs(i, Parse3CachedOnePass); } +void Parse_CachedDigitDs_PCRE(int i) { Parse3DigitDs(i, Parse3CachedPCRE); } +void Parse_CachedDigitDs_RE2(int i) { Parse3DigitDs(i, Parse3CachedRE2); } +void Parse_CachedDigitDs_Backtrack(int i) { Parse3DigitDs(i, Parse3CachedBacktrack); } +void Parse_CachedDigitDs_BitState(int i) { Parse3DigitDs(i, Parse3CachedBitState); } + +BENCHMARK(Parse_CachedDigitDs_NFA)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_CachedDigitDs_OnePass)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK(Parse_CachedDigitDs_PCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(Parse_CachedDigitDs_Backtrack)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_CachedDigitDs_RE2)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_CachedDigitDs_BitState)->ThreadRange(1, NumCPUs()); + +// Benchmark: splitting off leading number field. 
+ +void Parse1Split(int iters, + void (*parse1)(int, const char*, const StringPiece&)) { + BenchmarkMemoryUsage(); + parse1(iters, "[0-9]+-(.*)", "650-253-0001"); + SetBenchmarkItemsProcessed(iters); +} + +void Parse_Split_NFA(int i) { Parse1Split(i, Parse1NFA); } +void Parse_Split_OnePass(int i) { Parse1Split(i, Parse1OnePass); } +void Parse_Split_PCRE(int i) { Parse1Split(i, Parse1PCRE); } +void Parse_Split_RE2(int i) { Parse1Split(i, Parse1RE2); } +void Parse_Split_BitState(int i) { Parse1Split(i, Parse1BitState); } + +BENCHMARK(Parse_Split_NFA)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_Split_OnePass)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK(Parse_Split_PCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(Parse_Split_RE2)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_Split_BitState)->ThreadRange(1, NumCPUs()); + +void Parse_CachedSplit_NFA(int i) { Parse1Split(i, Parse1CachedNFA); } +void Parse_CachedSplit_OnePass(int i) { Parse1Split(i, Parse1CachedOnePass); } +void Parse_CachedSplit_PCRE(int i) { Parse1Split(i, Parse1CachedPCRE); } +void Parse_CachedSplit_RE2(int i) { Parse1Split(i, Parse1CachedRE2); } +void Parse_CachedSplit_BitState(int i) { Parse1Split(i, Parse1CachedBitState); } + +BENCHMARK(Parse_CachedSplit_NFA)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_CachedSplit_OnePass)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK(Parse_CachedSplit_PCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(Parse_CachedSplit_RE2)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_CachedSplit_BitState)->ThreadRange(1, NumCPUs()); + +// Benchmark: splitting off leading number field but harder (ambiguous regexp). 
+ +void Parse1SplitHard(int iters, + void (*run)(int, const char*, const StringPiece&)) { + BenchmarkMemoryUsage(); + run(iters, "[0-9]+.(.*)", "650-253-0001"); + SetBenchmarkItemsProcessed(iters); +} + +void Parse_SplitHard_NFA(int i) { Parse1SplitHard(i, Parse1NFA); } +void Parse_SplitHard_PCRE(int i) { Parse1SplitHard(i, Parse1PCRE); } +void Parse_SplitHard_RE2(int i) { Parse1SplitHard(i, Parse1RE2); } +void Parse_SplitHard_BitState(int i) { Parse1SplitHard(i, Parse1BitState); } + +#ifdef USEPCRE +BENCHMARK(Parse_SplitHard_PCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(Parse_SplitHard_RE2)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_SplitHard_BitState)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_SplitHard_NFA)->ThreadRange(1, NumCPUs()); + +void Parse_CachedSplitHard_NFA(int i) { Parse1SplitHard(i, Parse1CachedNFA); } +void Parse_CachedSplitHard_PCRE(int i) { Parse1SplitHard(i, Parse1CachedPCRE); } +void Parse_CachedSplitHard_RE2(int i) { Parse1SplitHard(i, Parse1CachedRE2); } +void Parse_CachedSplitHard_BitState(int i) { Parse1SplitHard(i, Parse1CachedBitState); } +void Parse_CachedSplitHard_Backtrack(int i) { Parse1SplitHard(i, Parse1CachedBacktrack); } + +#ifdef USEPCRE +BENCHMARK(Parse_CachedSplitHard_PCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(Parse_CachedSplitHard_RE2)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_CachedSplitHard_BitState)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_CachedSplitHard_NFA)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_CachedSplitHard_Backtrack)->ThreadRange(1, NumCPUs()); + +// Benchmark: Parse1SplitHard, big text, small match. 
+ +void Parse1SplitBig1(int iters, + void (*run)(int, const char*, const StringPiece&)) { + string s; + s.append(100000, 'x'); + s.append("650-253-0001"); + BenchmarkMemoryUsage(); + run(iters, "[0-9]+.(.*)", s); + SetBenchmarkItemsProcessed(iters); +} + +void Parse_CachedSplitBig1_PCRE(int i) { Parse1SplitBig1(i, SearchParse1CachedPCRE); } +void Parse_CachedSplitBig1_RE2(int i) { Parse1SplitBig1(i, SearchParse1CachedRE2); } + +#ifdef USEPCRE +BENCHMARK(Parse_CachedSplitBig1_PCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(Parse_CachedSplitBig1_RE2)->ThreadRange(1, NumCPUs()); + +// Benchmark: Parse1SplitHard, big text, big match. + +void Parse1SplitBig2(int iters, + void (*run)(int, const char*, const StringPiece&)) { + string s; + s.append("650-253-"); + s.append(100000, '0'); + BenchmarkMemoryUsage(); + run(iters, "[0-9]+.(.*)", s); + SetBenchmarkItemsProcessed(iters); +} + +void Parse_CachedSplitBig2_PCRE(int i) { Parse1SplitBig2(i, SearchParse1CachedPCRE); } +void Parse_CachedSplitBig2_RE2(int i) { Parse1SplitBig2(i, SearchParse1CachedRE2); } + +#ifdef USEPCRE +BENCHMARK(Parse_CachedSplitBig2_PCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(Parse_CachedSplitBig2_RE2)->ThreadRange(1, NumCPUs()); + +// Benchmark: measure time required to parse (but not execute) +// a simple regular expression. 
+ +void ParseRegexp(int iters, const string& regexp) { + for (int i = 0; i < iters; i++) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + re->Decref(); + } +} + +void SimplifyRegexp(int iters, const string& regexp) { + for (int i = 0; i < iters; i++) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Regexp* sre = re->Simplify(); + CHECK(sre); + sre->Decref(); + re->Decref(); + } +} + +void NullWalkRegexp(int iters, const string& regexp) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + for (int i = 0; i < iters; i++) { + re->NullWalk(); + } + re->Decref(); +} + +void SimplifyCompileRegexp(int iters, const string& regexp) { + for (int i = 0; i < iters; i++) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Regexp* sre = re->Simplify(); + CHECK(sre); + Prog* prog = sre->CompileToProg(0); + CHECK(prog); + delete prog; + sre->Decref(); + re->Decref(); + } +} + +void CompileRegexp(int iters, const string& regexp) { + for (int i = 0; i < iters; i++) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + delete prog; + re->Decref(); + } +} + +void CompileToProg(int iters, const string& regexp) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + for (int i = 0; i < iters; i++) { + Prog* prog = re->CompileToProg(0); + CHECK(prog); + delete prog; + } + re->Decref(); +} + +void CompileByteMap(int iters, const string& regexp) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + for (int i = 0; i < iters; i++) { + prog->ComputeByteMap(); + } + delete prog; + re->Decref(); +} + +void CompilePCRE(int iters, const string& regexp) { + for (int i = 0; i < iters; i++) { + PCRE re(regexp, PCRE::UTF8); + CHECK_EQ(re.error(), ""); + } +} + +void CompileRE2(int iters, const string& regexp) { + 
for (int i = 0; i < iters; i++) { + RE2 re(regexp); + CHECK_EQ(re.error(), ""); + } +} + +void RunBuild(int iters, const string& regexp, void (*run)(int, const string&)) { + run(iters, regexp); + SetBenchmarkItemsProcessed(iters); +} + +} // namespace re2 + +DEFINE_string(compile_regexp, "(.*)-(\\d+)-of-(\\d+)", "regexp for compile benchmarks"); + +namespace re2 { + +void BM_PCRE_Compile(int i) { RunBuild(i, FLAGS_compile_regexp, CompilePCRE); } +void BM_Regexp_Parse(int i) { RunBuild(i, FLAGS_compile_regexp, ParseRegexp); } +void BM_Regexp_Simplify(int i) { RunBuild(i, FLAGS_compile_regexp, SimplifyRegexp); } +void BM_CompileToProg(int i) { RunBuild(i, FLAGS_compile_regexp, CompileToProg); } +void BM_CompileByteMap(int i) { RunBuild(i, FLAGS_compile_regexp, CompileByteMap); } +void BM_Regexp_Compile(int i) { RunBuild(i, FLAGS_compile_regexp, CompileRegexp); } +void BM_Regexp_SimplifyCompile(int i) { RunBuild(i, FLAGS_compile_regexp, SimplifyCompileRegexp); } +void BM_Regexp_NullWalk(int i) { RunBuild(i, FLAGS_compile_regexp, NullWalkRegexp); } +void BM_RE2_Compile(int i) { RunBuild(i, FLAGS_compile_regexp, CompileRE2); } + +#ifdef USEPCRE +BENCHMARK(BM_PCRE_Compile)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(BM_Regexp_Parse)->ThreadRange(1, NumCPUs()); +BENCHMARK(BM_Regexp_Simplify)->ThreadRange(1, NumCPUs()); +BENCHMARK(BM_CompileToProg)->ThreadRange(1, NumCPUs()); +BENCHMARK(BM_CompileByteMap)->ThreadRange(1, NumCPUs()); +BENCHMARK(BM_Regexp_Compile)->ThreadRange(1, NumCPUs()); +BENCHMARK(BM_Regexp_SimplifyCompile)->ThreadRange(1, NumCPUs()); +BENCHMARK(BM_Regexp_NullWalk)->ThreadRange(1, NumCPUs()); +BENCHMARK(BM_RE2_Compile)->ThreadRange(1, NumCPUs()); + + +// Makes text of size nbytes, then calls run to search +// the text for regexp iters times. 
+void SearchPhone(int iters, int nbytes, ParseImpl* search) { + StopBenchmarkTiming(); + string s; + MakeText(&s, nbytes); + s.append("(650) 253-0001"); + BenchmarkMemoryUsage(); + StartBenchmarkTiming(); + search(iters, "(\\d{3}-|\\(\\d{3}\\)\\s+)(\\d{3}-\\d{4})", s); + SetBenchmarkBytesProcessed(static_cast(iters)*nbytes); +} + +void SearchPhone_CachedPCRE(int i, int n) { + SearchPhone(i, n, SearchParse2CachedPCRE); +} +void SearchPhone_CachedRE2(int i, int n) { + SearchPhone(i, n, SearchParse2CachedRE2); +} + +#ifdef USEPCRE +BENCHMARK_RANGE(SearchPhone_CachedPCRE, 8, 16<<20)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK_RANGE(SearchPhone_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs()); + +/* +TODO(rsc): Make this work again. + +// Generates and returns a string over binary alphabet {0,1} that contains +// all possible binary sequences of length n as subsequences. The obvious +// brute force method would generate a string of length n * 2^n, but this +// generates a string of length n + 2^n - 1 called a De Bruijn cycle. +// See Knuth, The Art of Computer Programming, Vol 2, Exercise 3.2.2 #17. +static string DeBruijnString(int n) { + CHECK_LT(n, 8*sizeof(int)); + CHECK_GT(n, 0); + + vector did(1<(iters)*s.size()); +} + +void CacheFillPCRE(int i, int n) { CacheFill(i, n, SearchCachedPCRE); } +void CacheFillRE2(int i, int n) { CacheFill(i, n, SearchCachedRE2); } +void CacheFillNFA(int i, int n) { CacheFill(i, n, SearchCachedNFA); } +void CacheFillDFA(int i, int n) { CacheFill(i, n, SearchCachedDFA); } + +// BENCHMARK_WITH_ARG uses __LINE__ to generate distinct identifiers +// for the static BenchmarkRegisterer, which makes it unusable inside +// a macro like DO24 below. MY_BENCHMARK_WITH_ARG uses the argument a +// to make the identifiers distinct (only possible when 'a' is a simple +// expression like 2, not like 1+1). 
+#define MY_BENCHMARK_WITH_ARG(n, a) \ + bool __benchmark_ ## n ## a = \ + (new ::testing::Benchmark(#n, NewPermanentCallback(&n)))->ThreadRange(1, NumCPUs()); + +#define DO24(A, B) \ + A(B, 1); A(B, 2); A(B, 3); A(B, 4); A(B, 5); A(B, 6); \ + A(B, 7); A(B, 8); A(B, 9); A(B, 10); A(B, 11); A(B, 12); \ + A(B, 13); A(B, 14); A(B, 15); A(B, 16); A(B, 17); A(B, 18); \ + A(B, 19); A(B, 20); A(B, 21); A(B, 22); A(B, 23); A(B, 24); + +DO24(MY_BENCHMARK_WITH_ARG, CacheFillPCRE) +DO24(MY_BENCHMARK_WITH_ARG, CacheFillNFA) +DO24(MY_BENCHMARK_WITH_ARG, CacheFillRE2) +DO24(MY_BENCHMARK_WITH_ARG, CacheFillDFA) + +#undef DO24 +#undef MY_BENCHMARK_WITH_ARG +*/ + +//////////////////////////////////////////////////////////////////////// +// +// Implementation routines. Sad that there are so many, +// but all the interfaces are slightly different. + +// Runs implementation to search for regexp in text, iters times. +// Expect_match says whether the regexp should be found. +// Anchored says whether to run an anchored search. 
+ +void SearchDFA(int iters, const char* regexp, const StringPiece& text, + Prog::Anchor anchor, bool expect_match) { + for (int i = 0; i < iters; i++) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + bool failed = false; + CHECK_EQ(prog->SearchDFA(text, NULL, anchor, Prog::kFirstMatch, + NULL, &failed, NULL), + expect_match); + CHECK(!failed); + delete prog; + re->Decref(); + } +} + +void SearchNFA(int iters, const char* regexp, const StringPiece& text, + Prog::Anchor anchor, bool expect_match) { + for (int i = 0; i < iters; i++) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + CHECK_EQ(prog->SearchNFA(text, NULL, anchor, Prog::kFirstMatch, NULL, 0), + expect_match); + delete prog; + re->Decref(); + } +} + +void SearchOnePass(int iters, const char* regexp, const StringPiece& text, + Prog::Anchor anchor, bool expect_match) { + for (int i = 0; i < iters; i++) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + CHECK(prog->IsOnePass()); + CHECK_EQ(prog->SearchOnePass(text, text, anchor, Prog::kFirstMatch, NULL, 0), + expect_match); + delete prog; + re->Decref(); + } +} + +void SearchBitState(int iters, const char* regexp, const StringPiece& text, + Prog::Anchor anchor, bool expect_match) { + for (int i = 0; i < iters; i++) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + CHECK_EQ(prog->SearchBitState(text, text, anchor, Prog::kFirstMatch, NULL, 0), + expect_match); + delete prog; + re->Decref(); + } +} + +void SearchPCRE(int iters, const char* regexp, const StringPiece& text, + Prog::Anchor anchor, bool expect_match) { + for (int i = 0; i < iters; i++) { + PCRE re(regexp, PCRE::UTF8); + CHECK_EQ(re.error(), ""); + if (anchor == Prog::kAnchored) + 
CHECK_EQ(PCRE::FullMatch(text, re), expect_match); + else + CHECK_EQ(PCRE::PartialMatch(text, re), expect_match); + } +} + +void SearchRE2(int iters, const char* regexp, const StringPiece& text, + Prog::Anchor anchor, bool expect_match) { + for (int i = 0; i < iters; i++) { + RE2 re(regexp); + CHECK_EQ(re.error(), ""); + if (anchor == Prog::kAnchored) + CHECK_EQ(RE2::FullMatch(text, re), expect_match); + else + CHECK_EQ(RE2::PartialMatch(text, re), expect_match); + } +} + +// SearchCachedXXX is like SearchXXX but only does the +// regexp parsing and compiling once. This lets us measure +// search time without the per-regexp overhead. + +void SearchCachedDFA(int iters, const char* regexp, const StringPiece& text, + Prog::Anchor anchor, bool expect_match) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(1LL<<31); + CHECK(prog); + for (int i = 0; i < iters; i++) { + bool failed = false; + CHECK_EQ(prog->SearchDFA(text, NULL, anchor, + Prog::kFirstMatch, NULL, &failed, NULL), + expect_match); + CHECK(!failed); + } + delete prog; + re->Decref(); +} + +void SearchCachedNFA(int iters, const char* regexp, const StringPiece& text, + Prog::Anchor anchor, bool expect_match) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + for (int i = 0; i < iters; i++) { + CHECK_EQ(prog->SearchNFA(text, NULL, anchor, Prog::kFirstMatch, NULL, 0), + expect_match); + } + delete prog; + re->Decref(); +} + +void SearchCachedOnePass(int iters, const char* regexp, const StringPiece& text, + Prog::Anchor anchor, bool expect_match) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + CHECK(prog->IsOnePass()); + for (int i = 0; i < iters; i++) + CHECK_EQ(prog->SearchOnePass(text, text, anchor, Prog::kFirstMatch, NULL, 0), + expect_match); + delete prog; + re->Decref(); +} + +void 
SearchCachedBitState(int iters, const char* regexp, const StringPiece& text, + Prog::Anchor anchor, bool expect_match) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + for (int i = 0; i < iters; i++) + CHECK_EQ(prog->SearchBitState(text, text, anchor, Prog::kFirstMatch, NULL, 0), + expect_match); + delete prog; + re->Decref(); +} + +void SearchCachedPCRE(int iters, const char* regexp, const StringPiece& text, + Prog::Anchor anchor, bool expect_match) { + PCRE re(regexp, PCRE::UTF8); + CHECK_EQ(re.error(), ""); + for (int i = 0; i < iters; i++) { + if (anchor == Prog::kAnchored) + CHECK_EQ(PCRE::FullMatch(text, re), expect_match); + else + CHECK_EQ(PCRE::PartialMatch(text, re), expect_match); + } +} + +void SearchCachedRE2(int iters, const char* regexp, const StringPiece& text, + Prog::Anchor anchor, bool expect_match) { + RE2 re(regexp); + CHECK_EQ(re.error(), ""); + for (int i = 0; i < iters; i++) { + if (anchor == Prog::kAnchored) + CHECK_EQ(RE2::FullMatch(text, re), expect_match); + else + CHECK_EQ(RE2::PartialMatch(text, re), expect_match); + } +} + + +// Runs implementation to full match regexp against text, +// extracting three submatches. Expects match always. + +void Parse3NFA(int iters, const char* regexp, const StringPiece& text) { + for (int i = 0; i < iters; i++) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + StringPiece sp[4]; // 4 because sp[0] is whole match. 
+ CHECK(prog->SearchNFA(text, NULL, Prog::kAnchored, Prog::kFullMatch, sp, 4)); + delete prog; + re->Decref(); + } +} + +void Parse3OnePass(int iters, const char* regexp, const StringPiece& text) { + for (int i = 0; i < iters; i++) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + CHECK(prog->IsOnePass()); + StringPiece sp[4]; // 4 because sp[0] is whole match. + CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); + delete prog; + re->Decref(); + } +} + +void Parse3BitState(int iters, const char* regexp, const StringPiece& text) { + for (int i = 0; i < iters; i++) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + StringPiece sp[4]; // 4 because sp[0] is whole match. + CHECK(prog->SearchBitState(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); + delete prog; + re->Decref(); + } +} + +void Parse3Backtrack(int iters, const char* regexp, const StringPiece& text) { + for (int i = 0; i < iters; i++) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + StringPiece sp[4]; // 4 because sp[0] is whole match. 
+ CHECK(prog->UnsafeSearchBacktrack(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); + delete prog; + re->Decref(); + } +} + +void Parse3PCRE(int iters, const char* regexp, const StringPiece& text) { + for (int i = 0; i < iters; i++) { + PCRE re(regexp, PCRE::UTF8); + CHECK_EQ(re.error(), ""); + StringPiece sp1, sp2, sp3; + CHECK(PCRE::FullMatch(text, re, &sp1, &sp2, &sp3)); + } +} + +void Parse3RE2(int iters, const char* regexp, const StringPiece& text) { + for (int i = 0; i < iters; i++) { + RE2 re(regexp); + CHECK_EQ(re.error(), ""); + StringPiece sp1, sp2, sp3; + CHECK(RE2::FullMatch(text, re, &sp1, &sp2, &sp3)); + } +} + +void Parse3CachedNFA(int iters, const char* regexp, const StringPiece& text) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + StringPiece sp[4]; // 4 because sp[0] is whole match. + for (int i = 0; i < iters; i++) { + CHECK(prog->SearchNFA(text, NULL, Prog::kAnchored, Prog::kFullMatch, sp, 4)); + } + delete prog; + re->Decref(); +} + +void Parse3CachedOnePass(int iters, const char* regexp, const StringPiece& text) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + CHECK(prog->IsOnePass()); + StringPiece sp[4]; // 4 because sp[0] is whole match. + for (int i = 0; i < iters; i++) + CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); + delete prog; + re->Decref(); +} + +void Parse3CachedBitState(int iters, const char* regexp, const StringPiece& text) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + StringPiece sp[4]; // 4 because sp[0] is whole match. 
+ for (int i = 0; i < iters; i++) + CHECK(prog->SearchBitState(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); + delete prog; + re->Decref(); +} + +void Parse3CachedBacktrack(int iters, const char* regexp, const StringPiece& text) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + StringPiece sp[4]; // 4 because sp[0] is whole match. + for (int i = 0; i < iters; i++) + CHECK(prog->UnsafeSearchBacktrack(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); + delete prog; + re->Decref(); +} + +void Parse3CachedPCRE(int iters, const char* regexp, const StringPiece& text) { + PCRE re(regexp, PCRE::UTF8); + CHECK_EQ(re.error(), ""); + StringPiece sp1, sp2, sp3; + for (int i = 0; i < iters; i++) { + CHECK(PCRE::FullMatch(text, re, &sp1, &sp2, &sp3)); + } +} + +void Parse3CachedRE2(int iters, const char* regexp, const StringPiece& text) { + RE2 re(regexp); + CHECK_EQ(re.error(), ""); + StringPiece sp1, sp2, sp3; + for (int i = 0; i < iters; i++) { + CHECK(RE2::FullMatch(text, re, &sp1, &sp2, &sp3)); + } +} + + +// Runs implementation to full match regexp against text, +// extracting one submatch. Expects match always. + +void Parse1NFA(int iters, const char* regexp, const StringPiece& text) { + for (int i = 0; i < iters; i++) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + StringPiece sp[2]; // 2 because sp[0] is whole match. + CHECK(prog->SearchNFA(text, NULL, Prog::kAnchored, Prog::kFullMatch, sp, 2)); + delete prog; + re->Decref(); + } +} + +void Parse1OnePass(int iters, const char* regexp, const StringPiece& text) { + for (int i = 0; i < iters; i++) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + CHECK(prog->IsOnePass()); + StringPiece sp[2]; // 2 because sp[0] is whole match. 
+ CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2)); + delete prog; + re->Decref(); + } +} + +void Parse1BitState(int iters, const char* regexp, const StringPiece& text) { + for (int i = 0; i < iters; i++) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + StringPiece sp[2]; // 2 because sp[0] is whole match. + CHECK(prog->SearchBitState(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2)); + delete prog; + re->Decref(); + } +} + +void Parse1PCRE(int iters, const char* regexp, const StringPiece& text) { + for (int i = 0; i < iters; i++) { + PCRE re(regexp, PCRE::UTF8); + CHECK_EQ(re.error(), ""); + StringPiece sp1; + CHECK(PCRE::FullMatch(text, re, &sp1)); + } +} + +void Parse1RE2(int iters, const char* regexp, const StringPiece& text) { + for (int i = 0; i < iters; i++) { + RE2 re(regexp); + CHECK_EQ(re.error(), ""); + StringPiece sp1; + CHECK(RE2::FullMatch(text, re, &sp1)); + } +} + +void Parse1CachedNFA(int iters, const char* regexp, const StringPiece& text) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + StringPiece sp[2]; // 2 because sp[0] is whole match. + for (int i = 0; i < iters; i++) { + CHECK(prog->SearchNFA(text, NULL, Prog::kAnchored, Prog::kFullMatch, sp, 2)); + } + delete prog; + re->Decref(); +} + +void Parse1CachedOnePass(int iters, const char* regexp, const StringPiece& text) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + CHECK(prog->IsOnePass()); + StringPiece sp[2]; // 2 because sp[0] is whole match. 
+ for (int i = 0; i < iters; i++) + CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2)); + delete prog; + re->Decref(); +} + +void Parse1CachedBitState(int iters, const char* regexp, const StringPiece& text) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + StringPiece sp[2]; // 2 because sp[0] is whole match. + for (int i = 0; i < iters; i++) + CHECK(prog->SearchBitState(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2)); + delete prog; + re->Decref(); +} + +void Parse1CachedBacktrack(int iters, const char* regexp, const StringPiece& text) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + StringPiece sp[2]; // 2 because sp[0] is whole match. + for (int i = 0; i < iters; i++) + CHECK(prog->UnsafeSearchBacktrack(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2)); + delete prog; + re->Decref(); +} + +void Parse1CachedPCRE(int iters, const char* regexp, const StringPiece& text) { + PCRE re(regexp, PCRE::UTF8); + CHECK_EQ(re.error(), ""); + StringPiece sp1; + for (int i = 0; i < iters; i++) { + CHECK(PCRE::FullMatch(text, re, &sp1)); + } +} + +void Parse1CachedRE2(int iters, const char* regexp, const StringPiece& text) { + RE2 re(regexp); + CHECK_EQ(re.error(), ""); + StringPiece sp1; + for (int i = 0; i < iters; i++) { + CHECK(RE2::FullMatch(text, re, &sp1)); + } +} + +void SearchParse2CachedPCRE(int iters, const char* regexp, + const StringPiece& text) { + PCRE re(regexp, PCRE::UTF8); + CHECK_EQ(re.error(), ""); + for (int i = 0; i < iters; i++) { + StringPiece sp1, sp2; + CHECK(PCRE::PartialMatch(text, re, &sp1, &sp2)); + } +} + +void SearchParse2CachedRE2(int iters, const char* regexp, + const StringPiece& text) { + RE2 re(regexp); + CHECK_EQ(re.error(), ""); + for (int i = 0; i < iters; i++) { + StringPiece sp1, sp2; + CHECK(RE2::PartialMatch(text, re, &sp1, &sp2)); + 
} +} + +void SearchParse1CachedPCRE(int iters, const char* regexp, + const StringPiece& text) { + PCRE re(regexp, PCRE::UTF8); + CHECK_EQ(re.error(), ""); + for (int i = 0; i < iters; i++) { + StringPiece sp1; + CHECK(PCRE::PartialMatch(text, re, &sp1)); + } +} + +void SearchParse1CachedRE2(int iters, const char* regexp, + const StringPiece& text) { + RE2 re(regexp); + CHECK_EQ(re.error(), ""); + for (int i = 0; i < iters; i++) { + StringPiece sp1; + CHECK(RE2::PartialMatch(text, re, &sp1)); + } +} + +void EmptyPartialMatchPCRE(int n) { + PCRE re(""); + for (int i = 0; i < n; i++) { + PCRE::PartialMatch("", re); + } +} + +void EmptyPartialMatchRE2(int n) { + RE2 re(""); + for (int i = 0; i < n; i++) { + RE2::PartialMatch("", re); + } +} +#ifdef USEPCRE +BENCHMARK(EmptyPartialMatchPCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(EmptyPartialMatchRE2)->ThreadRange(1, NumCPUs()); + +void SimplePartialMatchPCRE(int n) { + PCRE re("abcdefg"); + for (int i = 0; i < n; i++) { + PCRE::PartialMatch("abcdefg", re); + } +} + +void SimplePartialMatchRE2(int n) { + RE2 re("abcdefg"); + for (int i = 0; i < n; i++) { + RE2::PartialMatch("abcdefg", re); + } +} +#ifdef USEPCRE +BENCHMARK(SimplePartialMatchPCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(SimplePartialMatchRE2)->ThreadRange(1, NumCPUs()); + +static string http_text = + "GET /asdfhjasdhfasdlfhasdflkjasdfkljasdhflaskdjhf" + "alksdjfhasdlkfhasdlkjfhasdljkfhadsjklf HTTP/1.1"; + +void HTTPPartialMatchPCRE(int n) { + StringPiece a; + PCRE re("(?-s)^(?:GET|POST) +([^ ]+) HTTP"); + for (int i = 0; i < n; i++) { + PCRE::PartialMatch(http_text, re, &a); + } +} + +void HTTPPartialMatchRE2(int n) { + StringPiece a; + RE2 re("(?-s)^(?:GET|POST) +([^ ]+) HTTP"); + for (int i = 0; i < n; i++) { + RE2::PartialMatch(http_text, re, &a); + } +} + +#ifdef USEPCRE +BENCHMARK(HTTPPartialMatchPCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(HTTPPartialMatchRE2)->ThreadRange(1, NumCPUs()); + +static string http_smalltext = + 
"GET /abc HTTP/1.1"; + +void SmallHTTPPartialMatchPCRE(int n) { + StringPiece a; + PCRE re("(?-s)^(?:GET|POST) +([^ ]+) HTTP"); + for (int i = 0; i < n; i++) { + PCRE::PartialMatch(http_smalltext, re, &a); + } +} + +void SmallHTTPPartialMatchRE2(int n) { + StringPiece a; + RE2 re("(?-s)^(?:GET|POST) +([^ ]+) HTTP"); + for (int i = 0; i < n; i++) { + RE2::PartialMatch(http_smalltext, re, &a); + } +} + +#ifdef USEPCRE +BENCHMARK(SmallHTTPPartialMatchPCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(SmallHTTPPartialMatchRE2)->ThreadRange(1, NumCPUs()); + +void DotMatchPCRE(int n) { + StringPiece a; + PCRE re("(?-s)^(.+)"); + for (int i = 0; i < n; i++) { + PCRE::PartialMatch(http_text, re, &a); + } +} + +void DotMatchRE2(int n) { + StringPiece a; + RE2 re("(?-s)^(.+)"); + for (int i = 0; i < n; i++) { + RE2::PartialMatch(http_text, re, &a); + } +} + +#ifdef USEPCRE +BENCHMARK(DotMatchPCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(DotMatchRE2)->ThreadRange(1, NumCPUs()); + +void ASCIIMatchPCRE(int n) { + StringPiece a; + PCRE re("(?-s)^([ -~]+)"); + for (int i = 0; i < n; i++) { + PCRE::PartialMatch(http_text, re, &a); + } +} + +void ASCIIMatchRE2(int n) { + StringPiece a; + RE2 re("(?-s)^([ -~]+)"); + for (int i = 0; i < n; i++) { + RE2::PartialMatch(http_text, re, &a); + } +} + +#ifdef USEPCRE +BENCHMARK(ASCIIMatchPCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(ASCIIMatchRE2)->ThreadRange(1, NumCPUs()); + +void FullMatchPCRE(int iter, int n, const char *regexp) { + StopBenchmarkTiming(); + string s; + MakeText(&s, n); + s += "ABCDEFGHIJ"; + BenchmarkMemoryUsage(); + PCRE re(regexp); + StartBenchmarkTiming(); + for (int i = 0; i < iter; i++) + CHECK(PCRE::FullMatch(s, re)); + SetBenchmarkBytesProcessed(static_cast<int64>(iter)*n); +} + +void FullMatchRE2(int iter, int n, const char *regexp) { + StopBenchmarkTiming(); + string s; + MakeText(&s, n); + s += "ABCDEFGHIJ"; + BenchmarkMemoryUsage(); + RE2 re(regexp, RE2::Latin1); + StartBenchmarkTiming(); + for (int i = 0; i < 
iter; i++) + CHECK(RE2::FullMatch(s, re)); + SetBenchmarkBytesProcessed(static_cast<int64>(iter)*n); +} + +void FullMatch_DotStar_CachedPCRE(int i, int n) { FullMatchPCRE(i, n, "(?s).*"); } +void FullMatch_DotStar_CachedRE2(int i, int n) { FullMatchRE2(i, n, "(?s).*"); } + +void FullMatch_DotStarDollar_CachedPCRE(int i, int n) { FullMatchPCRE(i, n, "(?s).*$"); } +void FullMatch_DotStarDollar_CachedRE2(int i, int n) { FullMatchRE2(i, n, "(?s).*$"); } + +void FullMatch_DotStarCapture_CachedPCRE(int i, int n) { FullMatchPCRE(i, n, "(?s)((.*)()()($))"); } +void FullMatch_DotStarCapture_CachedRE2(int i, int n) { FullMatchRE2(i, n, "(?s)((.*)()()($))"); } + +#ifdef USEPCRE +BENCHMARK_RANGE(FullMatch_DotStar_CachedPCRE, 8, 2<<20); +#endif +BENCHMARK_RANGE(FullMatch_DotStar_CachedRE2, 8, 2<<20); + +#ifdef USEPCRE +BENCHMARK_RANGE(FullMatch_DotStarDollar_CachedPCRE, 8, 2<<20); +#endif +BENCHMARK_RANGE(FullMatch_DotStarDollar_CachedRE2, 8, 2<<20); + +#ifdef USEPCRE +BENCHMARK_RANGE(FullMatch_DotStarCapture_CachedPCRE, 8, 2<<20); +#endif +BENCHMARK_RANGE(FullMatch_DotStarCapture_CachedRE2, 8, 2<<20); + +} // namespace re2 diff --git a/re2/testing/regexp_generator.cc b/re2/testing/regexp_generator.cc new file mode 100644 index 0000000..cf2db11 --- /dev/null +++ b/re2/testing/regexp_generator.cc @@ -0,0 +1,264 @@ +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Regular expression generator: generates all possible +// regular expressions within parameters (see regexp_generator.h for details). + +// The regexp generator first generates a sequence of commands in a simple +// postfix language. Each command in the language is a string, +// like "a" or "%s*" or "%s|%s". +// +// To evaluate a command, enough arguments are popped from the value stack to +// plug into the %s slots. Then the result is pushed onto the stack. 
+// For example, the command sequence +// a b %s%s c +// results in the stack +// ab c +// +// GeneratePostfix generates all possible command sequences. +// Then RunPostfix turns each sequence into a regular expression +// and passes the regexp to HandleRegexp. + +#include +#include +#include +#include +#include "util/test.h" +#include "re2/testing/regexp_generator.h" + +namespace re2 { + +// Returns a vector of the egrep regexp operators. +const vector& RegexpGenerator::EgrepOps() { + static const char *ops[] = { + "%s%s", + "%s|%s", + "%s*", + "%s+", + "%s?", + "%s\\C*", + }; + static vector v(ops, ops + arraysize(ops)); + return v; +} + +RegexpGenerator::RegexpGenerator(int maxatoms, int maxops, + const vector& atoms, + const vector& ops) + : maxatoms_(maxatoms), maxops_(maxops), atoms_(atoms), ops_(ops) { + // Degenerate case. + if (atoms_.size() == 0) + maxatoms_ = 0; + if (ops_.size() == 0) + maxops_ = 0; +} + +// Generates all possible regular expressions (within the parameters), +// calling HandleRegexp for each one. +void RegexpGenerator::Generate() { + vector postfix; + GeneratePostfix(&postfix, 0, 0, 0); +} + +// Generates random regular expressions, calling HandleRegexp for each one. +void RegexpGenerator::GenerateRandom(int32 seed, int n) { + ACMRandom acm(seed); + acm_ = &acm; + + for (int i = 0; i < n; i++) { + vector postfix; + GenerateRandomPostfix(&postfix, 0, 0, 0); + } + + acm_ = NULL; +} + +// Counts and returns the number of occurrences of "%s" in s. +static int CountArgs(const string& s) { + const char *p = s.c_str(); + int n = 0; + while ((p = strstr(p, "%s")) != NULL) { + p += 2; + n++; + } + return n; +} + +// Generates all possible postfix command sequences. +// Each sequence is handed off to RunPostfix to generate a regular expression. 
+// The arguments are: +// post: the current postfix sequence +// nstk: the number of elements that would be on the stack after executing +// the sequence +// ops: the number of operators used in the sequence +// atoms: the number of atoms used in the sequence +// For example, if post were ["a", "b", "%s%s", "c"], +// then nstk = 2, ops = 1, atoms = 3. +// +// The initial call should be GeneratePostfix([empty vector], 0, 0, 0). +// +void RegexpGenerator::GeneratePostfix(vector* post, int nstk, + int ops, int atoms) { + if (nstk == 1) + RunPostfix(*post); + + // Early out: if used too many operators or can't + // get back down to a single expression on the stack + // using binary operators, give up. + if (ops + nstk - 1 > maxops_) + return; + + // Add atoms if there is room. + if (atoms < maxatoms_) { + for (int i = 0; i < atoms_.size(); i++) { + post->push_back(atoms_[i]); + GeneratePostfix(post, nstk + 1, ops, atoms + 1); + post->pop_back(); + } + } + + // Add operators if there are enough arguments. + if (ops < maxops_) { + for (int i = 0; i < ops_.size(); i++) { + const string& fmt = ops_[i]; + int nargs = CountArgs(fmt); + if (nargs <= nstk) { + post->push_back(fmt); + GeneratePostfix(post, nstk - nargs + 1, ops + 1, atoms); + post->pop_back(); + } + } + } +} + +// Generates a random postfix command sequence. +// Stops and returns true once a single sequence has been generated. +bool RegexpGenerator::GenerateRandomPostfix(vector *post, int nstk, + int ops, int atoms) { + for (;;) { + // Stop if we get to a single element, but only sometimes. + if (nstk == 1 && acm_->Uniform(maxatoms_ + 1 - atoms) == 0) { + RunPostfix(*post); + return true; + } + + // Early out: if used too many operators or can't + // get back down to a single expression on the stack + // using binary operators, give up. + if (ops + nstk - 1 > maxops_) + return false; + + // Add operators if there are enough arguments. 
+ if (ops < maxops_ && acm_->Uniform(2) == 0) { + const string& fmt = ops_[acm_->Uniform(ops_.size())]; + int nargs = CountArgs(fmt); + if (nargs <= nstk) { + post->push_back(fmt); + bool ret = GenerateRandomPostfix(post, nstk - nargs + 1, + ops + 1, atoms); + post->pop_back(); + if (ret) + return true; + } + } + + // Add atoms if there is room. + if (atoms < maxatoms_ && acm_->Uniform(2) == 0) { + post->push_back(atoms_[acm_->Uniform(atoms_.size())]); + bool ret = GenerateRandomPostfix(post, nstk + 1, ops, atoms + 1); + post->pop_back(); + if (ret) + return true; + } + } +} + +// Interprets the postfix command sequence to create a regular expression +// passed to HandleRegexp. The results of operators like %s|%s are wrapped +// in (?: ) to avoid needing to maintain a precedence table. +void RegexpGenerator::RunPostfix(const vector& post) { + stack regexps; + for (int i = 0; i < post.size(); i++) { + switch (CountArgs(post[i])) { + default: + LOG(FATAL) << "Bad operator: " << post[i]; + case 0: + regexps.push(post[i]); + break; + case 1: { + string a = regexps.top(); + regexps.pop(); + regexps.push("(?:" + StringPrintf(post[i].c_str(), a.c_str()) + ")"); + break; + } + case 2: { + string b = regexps.top(); + regexps.pop(); + string a = regexps.top(); + regexps.pop(); + regexps.push("(?:" + + StringPrintf(post[i].c_str(), a.c_str(), b.c_str()) + + ")"); + break; + } + } + } + + if (regexps.size() != 1) { + // Internal error - should never happen. 
+ printf("Bad regexp program:\n"); + for (int i = 0; i < post.size(); i++) { + printf(" %s\n", CEscape(post[i]).c_str()); + } + printf("Stack after running program:\n"); + while (!regexps.empty()) { + printf(" %s\n", CEscape(regexps.top()).c_str()); + regexps.pop(); + } + LOG(FATAL) << "Bad regexp program."; + } + + HandleRegexp(regexps.top()); + HandleRegexp("^(?:" + regexps.top() + ")$"); + HandleRegexp("^(?:" + regexps.top() + ")"); + HandleRegexp("(?:" + regexps.top() + ")$"); +} + +// Split s into an vector of strings, one for each UTF-8 character. +vector Explode(const StringPiece& s) { + vector v; + + for (const char *q = s.begin(); q < s.end(); ) { + const char* p = q; + Rune r; + q += chartorune(&r, q); + v.push_back(string(p, q - p)); + } + + return v; +} + +// Split string everywhere a substring is found, returning +// vector of pieces. +vector Split(const StringPiece& sep, const StringPiece& s) { + vector v; + + if (sep.size() == 0) + return Explode(s); + + const char *p = s.begin(); + for (const char *q = s.begin(); q + sep.size() <= s.end(); q++) { + if (StringPiece(q, sep.size()) == sep) { + v.push_back(string(p, q - p)); + p = q + sep.size(); + q = p - 1; // -1 for ++ in loop + continue; + } + } + if (p < s.end()) + v.push_back(string(p, s.end() - p)); + return v; +} + +} // namespace re2 diff --git a/re2/testing/regexp_generator.h b/re2/testing/regexp_generator.h new file mode 100644 index 0000000..b4506f2 --- /dev/null +++ b/re2/testing/regexp_generator.h @@ -0,0 +1,70 @@ +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Regular expression generator: generates all possible +// regular expressions within given parameters (see below for details). 
+ +#ifndef RE2_TESTING_REGEXP_GENERATOR_H__ +#define RE2_TESTING_REGEXP_GENERATOR_H__ + +#include +#include +#include "util/random.h" +#include "util/util.h" +#include "re2/stringpiece.h" + +namespace re2 { + +// Regular expression generator. +// +// Given a set of atom expressions like "a", "b", or "." +// and operators like "%s*", generates all possible regular expressions +// using at most maxbases base expressions and maxops operators. +// For each such expression re, calls HandleRegexp(re). +// +// Callers are expected to subclass RegexpGenerator and provide HandleRegexp. +// +class RegexpGenerator { + public: + RegexpGenerator(int maxatoms, int maxops, const vector& atoms, + const vector& ops); + virtual ~RegexpGenerator() {} + + // Generates all the regular expressions, calling HandleRegexp(re) for each. + void Generate(); + + // Generates n random regular expressions, calling HandleRegexp(re) for each. + void GenerateRandom(int32 seed, int n); + + // Handles a regular expression. Must be provided by subclass. + virtual void HandleRegexp(const string& regexp) = 0; + + // The egrep regexp operators: * + ? | and concatenation. + static const vector& EgrepOps(); + + private: + void RunPostfix(const vector& post); + void GeneratePostfix(vector* post, int nstk, int ops, int lits); + bool GenerateRandomPostfix(vector* post, int nstk, int ops, int lits); + + int maxatoms_; // Maximum number of atoms allowed in expr. + int maxops_; // Maximum number of ops allowed in expr. + vector atoms_; // Possible atoms. + vector ops_; // Possible ops. + ACMRandom* acm_; // Random generator. + DISALLOW_EVIL_CONSTRUCTORS(RegexpGenerator); +}; + +// Helpers for preparing arguments to RegexpGenerator constructor. + +// Returns one string for each character in s. +vector Explode(const StringPiece& s); + +// Splits string everywhere sep is found, returning +// vector of pieces. 
+vector<string> Split(const StringPiece& sep, const StringPiece& s); + +} // namespace re2 + +#endif // RE2_TESTING_REGEXP_GENERATOR_H__ diff --git a/re2/testing/regexp_test.cc b/re2/testing/regexp_test.cc new file mode 100644 index 0000000..f317cbc --- /dev/null +++ b/re2/testing/regexp_test.cc @@ -0,0 +1,81 @@ +// Copyright 2006 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Test parse.cc, dump.cc, and tostring.cc. + +#include <string> +#include <vector> +#include "util/test.h" +#include "re2/regexp.h" + +namespace re2 { + +// Test that overflowed ref counts work. +TEST(Regexp, BigRef) { + Regexp* re; + re = Regexp::Parse("x", Regexp::NoParseFlags, NULL); + for (int i = 0; i < 100000; i++) + re->Incref(); + for (int i = 0; i < 100000; i++) + re->Decref(); + CHECK_EQ(re->Ref(), 1); + re->Decref(); +} + +// Test that very large Concats work. +// Depends on overflowed ref counts working. +TEST(Regexp, BigConcat) { + Regexp* x; + x = Regexp::Parse("x", Regexp::NoParseFlags, NULL); + vector<Regexp*> v(90000, x); // ToString bails out at 100000 + for (int i = 0; i < v.size(); i++) + x->Incref(); + CHECK_EQ(x->Ref(), 1 + v.size()) << x->Ref(); + Regexp* re = Regexp::Concat(&v[0], v.size(), Regexp::NoParseFlags); + CHECK_EQ(re->ToString(), string(v.size(), 'x')); + re->Decref(); + CHECK_EQ(x->Ref(), 1) << x->Ref(); + x->Decref(); +} + +TEST(Regexp, NamedCaptures) { + Regexp* x; + RegexpStatus status; + x = Regexp::Parse( + "(?P<g1>a+)|(e)(?P<g2>w*)+(?P<g1>b+)", Regexp::PerlX, &status); + EXPECT_TRUE(status.ok()); + EXPECT_EQ(4, x->NumCaptures()); + const map<string, int>* have = x->NamedCaptures(); + EXPECT_TRUE(have != NULL); + EXPECT_EQ(2, have->size()); // there are only two named groups in + // the regexp: 'g1' and 'g2'. 
+ map<string, int> want; + want["g1"] = 1; + want["g2"] = 3; + EXPECT_EQ(want, *have); + x->Decref(); + delete have; +} + +TEST(Regexp, CaptureNames) { + Regexp* x; + RegexpStatus status; + x = Regexp::Parse( + "(?P<g1>a+)|(e)(?P<g2>w*)+(?P<g1>b+)", Regexp::PerlX, &status); + EXPECT_TRUE(status.ok()); + EXPECT_EQ(4, x->NumCaptures()); + const map<int, string>* have = x->CaptureNames(); + EXPECT_TRUE(have != NULL); + EXPECT_EQ(3, have->size()); + map<int, string> want; + want[1] = "g1"; + want[3] = "g2"; + want[4] = "g1"; + + EXPECT_EQ(want, *have); + x->Decref(); + delete have; +} + +} // namespace re2 diff --git a/re2/testing/required_prefix_test.cc b/re2/testing/required_prefix_test.cc new file mode 100644 index 0000000..1f0b216 --- /dev/null +++ b/re2/testing/required_prefix_test.cc @@ -0,0 +1,67 @@ +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "util/test.h" +#include "re2/regexp.h" + +namespace re2 { + +struct PrefixTest { + const char* regexp; + bool return_value; + const char* prefix; + bool foldcase; + const char* suffix; +}; + +static PrefixTest tests[] = { + // If the regexp is missing a ^, there's no required prefix. + { "abc", false }, + { "", false }, + { "(?m)^", false }, + + // If the regexp immediately goes into + // something not a literal match, there's no required prefix. + { "^(abc)", false }, + { "^a*", false }, + + // Otherwise, it should work. 
+ { "^abc$", true, "abc", false, "(?-m:$)" }, + { "^abc", true, "abc", false, "" }, + { "^(?i)abc", true, "abc", true, "" }, + { "^abcd*", true, "abc", false, "d*" }, + { "^[Aa][Bb]cd*", true, "ab", true, "cd*" }, + { "^ab[Cc]d*", true, "ab", false, "[Cc]d*" }, + { "^☺abc", true, "☺abc", false, "" }, +}; + +TEST(RequiredPrefix, SimpleTests) { + for (int i = 0; i < arraysize(tests); i++) { + const PrefixTest& t = tests[i]; + for (int j = 0; j < 2; j++) { + Regexp::ParseFlags flags = Regexp::LikePerl; + if (j == 0) + flags = flags | Regexp::Latin1; + Regexp* re = Regexp::Parse(t.regexp, flags, NULL); + CHECK(re) << " " << t.regexp; + string p; + bool f = false; + Regexp* s = NULL; + CHECK_EQ(t.return_value, re->RequiredPrefix(&p, &f, &s)) + << " " << t.regexp << " " << (j==0 ? "latin1" : "utf") << " " << re->Dump(); + if (t.return_value) { + CHECK_EQ(p, string(t.prefix)) + << " " << t.regexp << " " << (j==0 ? "latin1" : "utf"); + CHECK_EQ(f, t.foldcase) + << " " << t.regexp << " " << (j==0 ? "latin1" : "utf"); + CHECK_EQ(s->ToString(), string(t.suffix)) + << " " << t.regexp << " " << (j==0 ? "latin1" : "utf"); + s->Decref(); + } + re->Decref(); + } + } +} + +} // namespace re2 diff --git a/re2/testing/search_test.cc b/re2/testing/search_test.cc new file mode 100644 index 0000000..3ab2ae3 --- /dev/null +++ b/re2/testing/search_test.cc @@ -0,0 +1,325 @@ +// Copyright 2006-2007 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include +#include +#include "util/test.h" +#include "re2/prog.h" +#include "re2/regexp.h" +#include "re2/testing/tester.h" +#include "re2/testing/exhaustive_tester.h" + +namespace re2 { + +struct RegexpTest { + const char* regexp; + const char* text; +}; + +RegexpTest simple_tests[] = { + { "a", "a" }, + { "a", "zyzzyva" }, + { "a+", "aa" }, + { "(a+|b)+", "ab" }, + { "ab|cd", "xabcdx" }, + { "h.*od?", "hello\ngoodbye\n" }, + { "h.*o", "hello\ngoodbye\n" }, + { "h.*o", "goodbye\nhello\n" }, + { "h.*o", "hello world" }, + { "h.*o", "othello, world" }, + { "[^\\s\\S]", "aaaaaaa" }, + { "a", "aaaaaaa" }, + { "a*", "aaaaaaa" }, + { "a*", "" }, + { "a*", NULL }, + { "ab|cd", "xabcdx" }, + { "a", "cab" }, + { "a*b", "cab" }, + { "((((((((((((((((((((x))))))))))))))))))))", "x" }, + { "[abcd]", "xxxabcdxxx" }, + { "[^x]", "xxxabcdxxx" }, + { "[abcd]+", "xxxabcdxxx" }, + { "[^x]+", "xxxabcdxxx" }, + { "(fo|foo)", "fo" }, + { "(foo|fo)", "foo" }, + + { "aa", "aA" }, + { "a", "Aa" }, + { "a", "A" }, + { "ABC", "abc" }, + { "abc", "XABCY" }, + { "ABC", "xabcy" }, + + // Make sure ^ and $ work. + // The pathological cases didn't work + // in the original grep code. + { "foo|bar|[A-Z]", "foo" }, + { "^(foo|bar|[A-Z])", "foo" }, + { "(foo|bar|[A-Z])$", "foo\n" }, + { "(foo|bar|[A-Z])$", "foo" }, + { "^(foo|bar|[A-Z])$", "foo\n" }, + { "^(foo|bar|[A-Z])$", "foo" }, + { "^(foo|bar|[A-Z])$", "bar" }, + { "^(foo|bar|[A-Z])$", "X" }, + { "^(foo|bar|[A-Z])$", "XY" }, + { "^(fo|foo)$", "fo" }, + { "^(fo|foo)$", "foo" }, + { "^^(fo|foo)$", "fo" }, + { "^^(fo|foo)$", "foo" }, + { "^$", "" }, + { "^$", "x" }, + { "^^$", "" }, + { "^$$", "" }, + { "^^$", "x" }, + { "^$$", "x" }, + { "^^$$", "" }, + { "^^$$", "x" }, + { "^^^^^^^^$$$$$$$$", "" }, + { "^", "x" }, + { "$", "x" }, + + // Word boundaries. 
+ { "\\bfoo\\b", "nofoo foo that" }, + { "a\\b", "faoa x" }, + { "\\bbar", "bar x" }, + { "\\bbar", "foo\nbar x" }, + { "bar\\b", "foobar" }, + { "bar\\b", "foobar\nxxx" }, + { "(foo|bar|[A-Z])\\b", "foo" }, + { "(foo|bar|[A-Z])\\b", "foo\n" }, + { "\\b", "" }, + { "\\b", "x" }, + { "\\b(foo|bar|[A-Z])", "foo" }, + { "\\b(foo|bar|[A-Z])\\b", "X" }, + { "\\b(foo|bar|[A-Z])\\b", "XY" }, + { "\\b(foo|bar|[A-Z])\\b", "bar" }, + { "\\b(foo|bar|[A-Z])\\b", "foo" }, + { "\\b(foo|bar|[A-Z])\\b", "foo\n" }, + { "\\b(foo|bar|[A-Z])\\b", "ffoo bbar N x" }, + { "\\b(fo|foo)\\b", "fo" }, + { "\\b(fo|foo)\\b", "foo" }, + { "\\b\\b", "" }, + { "\\b\\b", "x" }, + { "\\b$", "" }, + { "\\b$", "x" }, + { "\\b$", "y x" }, + { "\\b.$", "x" }, + { "^\\b(fo|foo)\\b", "fo" }, + { "^\\b(fo|foo)\\b", "foo" }, + { "^\\b", "" }, + { "^\\b", "x" }, + { "^\\b\\b", "" }, + { "^\\b\\b", "x" }, + { "^\\b$", "" }, + { "^\\b$", "x" }, + { "^\\b.$", "x" }, + { "^\\b.\\b$", "x" }, + { "^^^^^^^^\\b$$$$$$$", "" }, + { "^^^^^^^^\\b.$$$$$$", "x" }, + { "^^^^^^^^\\b$$$$$$$", "x" }, + + // Non-word boundaries. 
+ { "\\Bfoo\\B", "n foo xfoox that" }, + { "a\\B", "faoa x" }, + { "\\Bbar", "bar x" }, + { "\\Bbar", "foo\nbar x" }, + { "bar\\B", "foobar" }, + { "bar\\B", "foobar\nxxx" }, + { "(foo|bar|[A-Z])\\B", "foox" }, + { "(foo|bar|[A-Z])\\B", "foo\n" }, + { "\\B", "" }, + { "\\B", "x" }, + { "\\B(foo|bar|[A-Z])", "foo" }, + { "\\B(foo|bar|[A-Z])\\B", "xXy" }, + { "\\B(foo|bar|[A-Z])\\B", "XY" }, + { "\\B(foo|bar|[A-Z])\\B", "XYZ" }, + { "\\B(foo|bar|[A-Z])\\B", "abara" }, + { "\\B(foo|bar|[A-Z])\\B", "xfoo_" }, + { "\\B(foo|bar|[A-Z])\\B", "xfoo\n" }, + { "\\B(foo|bar|[A-Z])\\B", "foo bar vNx" }, + { "\\B(fo|foo)\\B", "xfoo" }, + { "\\B(foo|fo)\\B", "xfooo" }, + { "\\B\\B", "" }, + { "\\B\\B", "x" }, + { "\\B$", "" }, + { "\\B$", "x" }, + { "\\B$", "y x" }, + { "\\B.$", "x" }, + { "^\\B(fo|foo)\\B", "fo" }, + { "^\\B(fo|foo)\\B", "foo" }, + { "^\\B", "" }, + { "^\\B", "x" }, + { "^\\B\\B", "" }, + { "^\\B\\B", "x" }, + { "^\\B$", "" }, + { "^\\B$", "x" }, + { "^\\B.$", "x" }, + { "^\\B.\\B$", "x" }, + { "^^^^^^^^\\B$$$$$$$", "" }, + { "^^^^^^^^\\B.$$$$$$", "x" }, + { "^^^^^^^^\\B$$$$$$$", "x" }, + + // PCRE uses only ASCII for \b computation. + // All non-ASCII are *not* word characters. + { "\\bx\\b", "x" }, + { "\\bx\\b", "x>" }, + { "\\bx\\b", "" }, + { "\\bx\\b", "ax" }, + { "\\bx\\b", "xb" }, + { "\\bx\\b", "axb" }, + { "\\bx\\b", "«x" }, + { "\\bx\\b", "x»" }, + { "\\bx\\b", "«x»" }, + { "\\bx\\b", "axb" }, + { "\\bx\\b", "áxβ" }, + { "\\Bx\\B", "axb" }, + { "\\Bx\\B", "áxβ" }, + + // Weird boundary cases. 
+ { "^$^$", "" }, + { "^$^", "" }, + { "$^$", "" }, + + { "^$^$", "x" }, + { "^$^", "x" }, + { "$^$", "x" }, + + { "^$^$", "x\ny" }, + { "^$^", "x\ny" }, + { "$^$", "x\ny" }, + + { "^$^$", "x\n\ny" }, + { "^$^", "x\n\ny" }, + { "$^$", "x\n\ny" }, + + { "^(foo\\$)$", "foo$bar" }, + { "(foo\\$)", "foo$bar" }, + { "^...$", "abc" }, + + // UTF-8 + { "^\xe6\x9c\xac$", "\xe6\x9c\xac" }, + { "^...$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" }, + { "^...$", ".\xe6\x9c\xac." }, + + { "^\\C\\C\\C$", "\xe6\x9c\xac" }, + { "^\\C$", "\xe6\x9c\xac" }, + { "^\\C\\C\\C$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" }, + + // Latin1 + { "^...$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" }, + { "^.........$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" }, + { "^...$", ".\xe6\x9c\xac." }, + { "^.....$", ".\xe6\x9c\xac." }, + + // Perl v Posix + { "\\B(fo|foo)\\B", "xfooo" }, + { "(fo|foo)", "foo" }, + + // Octal escapes. + { "\\141", "a" }, + { "\\060", "0" }, + { "\\0600", "00" }, + { "\\608", "08" }, + { "\\01", "\01" }, + { "\\018", "\01" "8" }, + + // Hexadecimal escapes + { "\\x{61}", "a" }, + { "\\x61", "a" }, + { "\\x{00000061}", "a" }, + + // Unicode scripts. + { "\\p{Greek}+", "aαβb" }, + { "\\P{Greek}+", "aαβb" }, + { "\\p{^Greek}+", "aαβb" }, + { "\\P{^Greek}+", "aαβb" }, + + // Unicode properties. Nd is decimal number. N is any number. + { "[^0-9]+", "abc123" }, + { "\\p{Nd}+", "abc123²³¼½¾₀₉" }, + { "\\p{^Nd}+", "abc123²³¼½¾₀₉" }, + { "\\P{Nd}+", "abc123²³¼½¾₀₉" }, + { "\\P{^Nd}+", "abc123²³¼½¾₀₉" }, + { "\\pN+", "abc123²³¼½¾₀₉" }, + { "\\p{N}+", "abc123²³¼½¾₀₉" }, + { "\\p{^N}+", "abc123²³¼½¾₀₉" }, + + { "\\p{Any}+", "abc123" }, + + // Character classes & case folding. + { "(?i)[@-A]+", "@AaB" }, // matches @Aa but not B + { "(?i)[A-Z]+", "aAzZ" }, + { "(?i)[^\\\\]+", "Aa\\" }, // \\ is between A-Z and a-z - + // splits the ranges in an interesting way. 
+ + // would like to use, but PCRE mishandles in full-match, non-greedy mode + // { "(?i)[\\\\]+", "Aa" }, + + { "(?i)[acegikmoqsuwy]+", "acegikmoqsuwyACEGIKMOQSUWY" }, + + // Character classes & case folding. + { "[@-A]+", "@AaB" }, + { "[A-Z]+", "aAzZ" }, + { "[^\\\\]+", "Aa\\" }, + { "[acegikmoqsuwy]+", "acegikmoqsuwyACEGIKMOQSUWY" }, + + // Anchoring. (^abc in aabcdef was a former bug) + // The tester checks for a match in the text and + // subpieces of the text with a byte removed on either side. + { "^abc", "abcdef" }, + { "^abc", "aabcdef" }, + { "^[ay]*[bx]+c", "abcdef" }, + { "^[ay]*[bx]+c", "aabcdef" }, + { "def$", "abcdef" }, + { "def$", "abcdeff" }, + { "d[ex][fy]$", "abcdef" }, + { "d[ex][fy]$", "abcdeff" }, + { "[dz][ex][fy]$", "abcdef" }, + { "[dz][ex][fy]$", "abcdeff" }, + { "(?m)^abc", "abcdef" }, + { "(?m)^abc", "aabcdef" }, + { "(?m)^[ay]*[bx]+c", "abcdef" }, + { "(?m)^[ay]*[bx]+c", "aabcdef" }, + { "(?m)def$", "abcdef" }, + { "(?m)def$", "abcdeff" }, + { "(?m)d[ex][fy]$", "abcdef" }, + { "(?m)d[ex][fy]$", "abcdeff" }, + { "(?m)[dz][ex][fy]$", "abcdef" }, + { "(?m)[dz][ex][fy]$", "abcdeff" }, + { "^", "a" }, + { "^^", "a" }, + + // Context. + // The tester checks for a match in the text and + // subpieces of the text with a byte removed on either side. + { "a", "a" }, + { "ab*", "a" }, + { "a\\C*", "a" }, + + // Former bugs. + { "a\\C*|ba\\C", "baba" }, +}; + +TEST(Regexp, SearchTests) { + int failures = 0; + for (int i = 0; i < arraysize(simple_tests); i++) { + const RegexpTest& t = simple_tests[i]; + if (!TestRegexpOnText(t.regexp, t.text)) + failures++; + +#ifdef LOGGING + // Build a dummy ExhaustiveTest call that will trigger just + // this one test, so that we log the test case. 
+ vector atom, alpha, ops; + atom.push_back(StringPiece(t.regexp).as_string()); + alpha.push_back(StringPiece(t.text).as_string()); + ExhaustiveTest(1, 0, atom, ops, 1, alpha, "", ""); +#endif + + } + EXPECT_EQ(failures, 0); +} + +} // namespace re2 diff --git a/re2/testing/set_test.cc b/re2/testing/set_test.cc new file mode 100644 index 0000000..74058a4 --- /dev/null +++ b/re2/testing/set_test.cc @@ -0,0 +1,114 @@ +// Copyright 2010 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include +#include +#include + +#include "util/test.h" +#include "re2/re2.h" +#include "re2/set.h" + +namespace re2 { + +TEST(Set, Unanchored) { + RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED); + + CHECK_EQ(s.Add("foo", NULL), 0); + CHECK_EQ(s.Add("(", NULL), -1); + CHECK_EQ(s.Add("bar", NULL), 1); + + CHECK_EQ(s.Compile(), true); + + vector v; + CHECK_EQ(s.Match("foobar", &v), true); + CHECK_EQ(v.size(), 2); + CHECK_EQ(v[0], 0); + CHECK_EQ(v[1], 1); + + v.clear(); + CHECK_EQ(s.Match("fooba", &v), true); + CHECK_EQ(v.size(), 1); + CHECK_EQ(v[0], 0); + + v.clear(); + CHECK_EQ(s.Match("oobar", &v), true); + CHECK_EQ(v.size(), 1); + CHECK_EQ(v[0], 1); +} + +TEST(Set, UnanchoredFactored) { + RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED); + + CHECK_EQ(s.Add("foo", NULL), 0); + CHECK_EQ(s.Add("(", NULL), -1); + CHECK_EQ(s.Add("foobar", NULL), 1); + + CHECK_EQ(s.Compile(), true); + + vector v; + CHECK_EQ(s.Match("foobar", &v), true); + CHECK_EQ(v.size(), 2); + CHECK_EQ(v[0], 0); + CHECK_EQ(v[1], 1); + + v.clear(); + CHECK_EQ(s.Match("obarfoobaroo", &v), true); + CHECK_EQ(v.size(), 2); + CHECK_EQ(v[0], 0); + CHECK_EQ(v[1], 1); + + v.clear(); + CHECK_EQ(s.Match("fooba", &v), true); + CHECK_EQ(v.size(), 1); + CHECK_EQ(v[0], 0); + + v.clear(); + CHECK_EQ(s.Match("oobar", &v), false); + CHECK_EQ(v.size(), 0); +} + +TEST(Set, UnanchoredDollar) { + RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED); + 
+ CHECK_EQ(s.Add("foo$", NULL), 0); + CHECK_EQ(s.Compile(), true); + + vector v; + CHECK_EQ(s.Match("foo", &v), true); + CHECK_EQ(v.size(), 1); + CHECK_EQ(v[0], 0); +} + +TEST(Set, Anchored) { + RE2::Set s(RE2::DefaultOptions, RE2::ANCHOR_BOTH); + + CHECK_EQ(s.Add("foo", NULL), 0); + CHECK_EQ(s.Add("(", NULL), -1); + CHECK_EQ(s.Add("bar", NULL), 1); + + CHECK_EQ(s.Compile(), true); + + vector v; + CHECK_EQ(s.Match("foobar", &v), false); + CHECK_EQ(v.size(), 0); + + CHECK_EQ(s.Match("fooba", &v), false); + CHECK_EQ(v.size(), 0); + + CHECK_EQ(s.Match("oobar", &v), false); + CHECK_EQ(v.size(), 0); + + CHECK_EQ(s.Match("foo", &v), true); + CHECK_EQ(v.size(), 1); + CHECK_EQ(v[0], 0); + + CHECK_EQ(s.Match("bar", &v), true); + CHECK_EQ(v.size(), 1); + CHECK_EQ(v[0], 1); + +} + +} // namespace re2 + diff --git a/re2/testing/simplify_test.cc b/re2/testing/simplify_test.cc new file mode 100644 index 0000000..d54837c --- /dev/null +++ b/re2/testing/simplify_test.cc @@ -0,0 +1,167 @@ +// Copyright 2006 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Test simplify.cc. + +#include +#include +#include "util/test.h" +#include "re2/regexp.h" + +namespace re2 { + +struct Test { + const char* regexp; + const char* simplified; +}; + +static Test tests[] = { + // Already-simple constructs + { "a", "a" }, + { "ab", "ab" }, + { "a|b", "[a-b]" }, + { "ab|cd", "ab|cd" }, + { "(ab)*", "(ab)*" }, + { "(ab)+", "(ab)+" }, + { "(ab)?", "(ab)?" }, + { ".", "." 
}, + { "^", "^" }, + { "$", "$" }, + { "[ac]", "[ac]" }, + { "[^ac]", "[^ac]" }, + + // Posix character classes + { "[[:alnum:]]", "[0-9A-Za-z]" }, + { "[[:alpha:]]", "[A-Za-z]" }, + { "[[:blank:]]", "[\\t ]" }, + { "[[:cntrl:]]", "[\\x00-\\x1f\\x7f]" }, + { "[[:digit:]]", "[0-9]" }, + { "[[:graph:]]", "[!-~]" }, + { "[[:lower:]]", "[a-z]" }, + { "[[:print:]]", "[ -~]" }, + { "[[:punct:]]", "[!-/:-@\\[-`{-~]" }, + { "[[:space:]]" , "[\\t-\\r ]" }, + { "[[:upper:]]", "[A-Z]" }, + { "[[:xdigit:]]", "[0-9A-Fa-f]" }, + + // Perl character classes + { "\\d", "[0-9]" }, + { "\\s", "[\\t-\\n\\f-\\r ]" }, + { "\\w", "[0-9A-Z_a-z]" }, + { "\\D", "[^0-9]" }, + { "\\S", "[^\\t-\\n\\f-\\r ]" }, + { "\\W", "[^0-9A-Z_a-z]" }, + { "[\\d]", "[0-9]" }, + { "[\\s]", "[\\t-\\n\\f-\\r ]" }, + { "[\\w]", "[0-9A-Z_a-z]" }, + { "[\\D]", "[^0-9]" }, + { "[\\S]", "[^\\t-\\n\\f-\\r ]" }, + { "[\\W]", "[^0-9A-Z_a-z]" }, + + // Posix repetitions + { "a{1}", "a" }, + { "a{2}", "aa" }, + { "a{5}", "aaaaa" }, + { "a{0,1}", "a?" }, + // The next three are illegible because Simplify inserts (?:) + // parens instead of () parens to avoid creating extra + // captured subexpressions. The comments show a version with fewer parens. + { "(a){0,2}", "(?:(a)(a)?)?" }, // (aa?)? + { "(a){0,4}", "(?:(a)(?:(a)(?:(a)(a)?)?)?)?" }, // (a(a(aa?)?)?)? + { "(a){2,6}", "(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?" }, // aa(a(a(aa?)?)?)? + { "a{0,2}", "(?:aa?)?" }, // (aa?)? + { "a{0,4}", "(?:a(?:a(?:aa?)?)?)?" }, // (a(a(aa?)?)?)? + { "a{2,6}", "aa(?:a(?:a(?:aa?)?)?)?" }, // aa(a(a(aa?)?)?)? + { "a{0,}", "a*" }, + { "a{1,}", "a+" }, + { "a{2,}", "aa+" }, + { "a{5,}", "aaaaa+" }, + + // Test that operators simplify their arguments. + // (Simplify used to not simplify arguments to a {} repeat.) + { "(?:a{1,}){1,}", "a+" }, + { "(a{1,}b{1,})", "(a+b+)" }, + { "a{1,}|b{1,}", "a+|b+" }, + { "(?:a{1,})*", "(?:a+)*" }, + { "(?:a{1,})+", "a+" }, + { "(?:a{1,})?", "(?:a+)?"
}, + { "a{0}", "" }, + + // Character class simplification + { "[ab]", "[a-b]" }, + { "[a-za-za-z]", "[a-z]" }, + { "[A-Za-zA-Za-z]", "[A-Za-z]" }, + { "[ABCDEFGH]", "[A-H]" }, + { "[AB-CD-EF-GH]", "[A-H]" }, + { "[W-ZP-XE-R]", "[E-Z]" }, + { "[a-ee-gg-m]", "[a-m]" }, + { "[a-ea-ha-m]", "[a-m]" }, + { "[a-ma-ha-e]", "[a-m]" }, + { "[a-zA-Z0-9 -~]", "[ -~]" }, + + // Empty character classes + { "[^[:cntrl:][:^cntrl:]]", "[^\\x00-\\x{10ffff}]" }, + + // Full character classes + { "[[:cntrl:][:^cntrl:]]", "." }, + + // Unicode case folding. + { "(?i)A", "[Aa]" }, + { "(?i)a", "[Aa]" }, + { "(?i)K", "[Kk\\x{212a}]" }, + { "(?i)k", "[Kk\\x{212a}]" }, + { "(?i)\\x{212a}", "[Kk\\x{212a}]" }, + { "(?i)[a-z]", "[A-Za-z\\x{17f}\\x{212a}]" }, + { "(?i)[\\x00-\\x{FFFD}]", "[\\x00-\\x{fffd}]" }, + { "(?i)[\\x00-\\x{10ffff}]", "." }, + + // Empty string as a regular expression. + // Empty string must be preserved inside parens in order + // to make submatches work right, so these are less + // interesting than they used to be. ToString inserts + // explicit (?:) in place of non-parenthesized empty strings, + // to make them easier to spot for other parsers. + { "(a|b|)", "([a-b]|(?:))" }, + { "(|)", "()" }, + { "a()", "a()" }, + { "(()|())", "(()|())" }, + { "(a|)", "(a|(?:))" }, + { "ab()cd()", "ab()cd()" }, + { "()", "()" }, + { "()*", "()*" }, + { "()+", "()+" }, + { "()?" , "()?" }, + { "(){0}", "" }, + { "(){1}", "()" }, + { "(){1,}", "()+" }, + { "(){0,2}", "(?:()()?)?" }, +}; + +TEST(TestSimplify, SimpleRegexps) { + for (int i = 0; i < arraysize(tests); i++) { + RegexpStatus status; + VLOG(1) << "Testing " << tests[i].regexp; + Regexp* re = Regexp::Parse(tests[i].regexp, + Regexp::MatchNL | (Regexp::LikePerl & + ~Regexp::OneLine), + &status); + CHECK(re != NULL) << " " << tests[i].regexp << " " << status.Text(); + Regexp* sre = re->Simplify(); + CHECK(sre != NULL); + + // Check that already-simple regexps don't allocate new ones. 
+ if (strcmp(tests[i].regexp, tests[i].simplified) == 0) { + CHECK(re == sre) << " " << tests[i].regexp + << " " << re->ToString() << " " << sre->ToString(); + } + + EXPECT_EQ(tests[i].simplified, sre->ToString()) + << " " << tests[i].regexp << " " << sre->Dump(); + + re->Decref(); + sre->Decref(); + } +} + +} // namespace re2 diff --git a/re2/testing/string_generator.cc b/re2/testing/string_generator.cc new file mode 100644 index 0000000..5be6d3e --- /dev/null +++ b/re2/testing/string_generator.cc @@ -0,0 +1,113 @@ +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// String generator: generates all possible strings of up to +// maxlen letters using the set of letters in alpha. +// Fetch strings using a Java-like Next()/HasNext() interface. + +#include +#include +#include "util/test.h" +#include "re2/testing/string_generator.h" + +namespace re2 { + +StringGenerator::StringGenerator(int maxlen, const vector& alphabet) + : maxlen_(maxlen), alphabet_(alphabet), + generate_null_(false), + random_(false), nrandom_(0), acm_(NULL) { + + // Degenerate case: no letters, no non-empty strings. + if (alphabet_.size() == 0) + maxlen_ = 0; + + // Next() will return empty string (digits_ is empty). + hasnext_ = true; +} + +StringGenerator::~StringGenerator() { + delete acm_; +} + +// Resets the string generator state to the beginning. +void StringGenerator::Reset() { + digits_.clear(); + hasnext_ = true; + random_ = false; + nrandom_ = 0; + generate_null_ = false; +} + +// Increments the big number in digits_, returning true if successful. +// Returns false if all the numbers have been used. +bool StringGenerator::IncrementDigits() { + // First try to increment the current number. + for (int i = digits_.size() - 1; i >= 0; i--) { + if (++digits_[i] < alphabet_.size()) + return true; + digits_[i] = 0; + } + + // If that failed, make a longer number. 
+ if (digits_.size() < maxlen_) { + digits_.push_back(0); + return true; + } + + return false; +} + +// Generates random digits_, return true if successful. +// Returns false if the random sequence is over. +bool StringGenerator::RandomDigits() { + if (--nrandom_ <= 0) + return false; + + // Pick length. + int len = acm_->Uniform(maxlen_+1); + digits_.resize(len); + for (int i = 0; i < len; i++) + digits_[i] = acm_->Uniform(alphabet_.size()); + return true; +} + +// Returns the next string in the iteration, which is the one +// currently described by digits_. Calls IncrementDigits +// after computing the string, so that it knows the answer +// for subsequent HasNext() calls. +const StringPiece& StringGenerator::Next() { + CHECK(hasnext_); + if (generate_null_) { + generate_null_ = false; + sp_ = NULL; + return sp_; + } + s_.clear(); + for (int i = 0; i < digits_.size(); i++) { + s_ += alphabet_[digits_[i]]; + } + hasnext_ = random_ ? RandomDigits() : IncrementDigits(); + sp_ = s_; + return sp_; +} + +// Sets generator up to return n random strings. +void StringGenerator::Random(int32 seed, int n) { + if (acm_ == NULL) + acm_ = new ACMRandom(seed); + else + acm_->Reset(seed); + + random_ = true; + nrandom_ = n; + hasnext_ = nrandom_ > 0; +} + +void StringGenerator::GenerateNULL() { + generate_null_ = true; + hasnext_ = true; +} + +} // namespace re2 + diff --git a/re2/testing/string_generator.h b/re2/testing/string_generator.h new file mode 100644 index 0000000..6a9ef42 --- /dev/null +++ b/re2/testing/string_generator.h @@ -0,0 +1,58 @@ +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// String generator: generates all possible strings of up to +// maxlen letters using the set of letters in alpha. +// Fetch strings using a Java-like Next()/HasNext() interface. 
+ +#ifndef RE2_TESTING_STRING_GENERATOR_H__ +#define RE2_TESTING_STRING_GENERATOR_H__ + +#include +#include +#include "util/util.h" +#include "util/random.h" +#include "re2/stringpiece.h" + +namespace re2 { + +class StringGenerator { + public: + StringGenerator(int maxlen, const vector& alphabet); + ~StringGenerator(); + const StringPiece& Next(); + bool HasNext() { return hasnext_; } + + // Resets generator to start sequence over. + void Reset(); + + // Causes generator to emit random strings for next n calls to Next(). + void Random(int32 seed, int n); + + // Causes generator to emit a NULL as the next call. + void GenerateNULL(); + + private: + bool IncrementDigits(); + bool RandomDigits(); + + // Global state. + int maxlen_; // Maximum length string to generate. + vector alphabet_; // Alphabet, one string per letter. + + // Iteration state. + StringPiece sp_; // Last StringPiece returned by Next(). + string s_; // String data in last StringPiece returned by Next(). + bool hasnext_; // Whether Next() can be called again. + vector digits_; // Alphabet indices for next string. + bool generate_null_; // Whether to generate a NULL StringPiece next. + bool random_; // Whether generated strings are random. + int nrandom_; // Number of random strings left to generate. + ACMRandom* acm_; // Random number generator + DISALLOW_EVIL_CONSTRUCTORS(StringGenerator); +}; + +} // namespace re2 + +#endif // RE2_TESTING_STRING_GENERATOR_H__ diff --git a/re2/testing/string_generator_test.cc b/re2/testing/string_generator_test.cc new file mode 100644 index 0000000..d13401a --- /dev/null +++ b/re2/testing/string_generator_test.cc @@ -0,0 +1,109 @@ +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Test StringGenerator. 
+ +#include +#include +#include +#include "util/test.h" +#include "re2/testing/string_generator.h" +#include "re2/testing/regexp_generator.h" + +namespace re2 { + +// Returns i to the e. +static int64 IntegerPower(int i, int e) { + int64 p = 1; + while (e-- > 0) + p *= i; + return p; +} + +// Checks that for given settings of the string generator: +// * it generates strings that are non-decreasing in length. +// * strings of the same length are sorted in alphabet order. +// * it doesn't generate the same string twice. +// * it generates the right number of strings. +// +// If all of these hold, the StringGenerator is behaving. +// Assumes that the alphabet is sorted, so that the generated +// strings can just be compared lexicographically. +static void RunTest(int len, string alphabet, bool donull) { + StringGenerator g(len, Explode(alphabet)); + + int n = 0; + int last_l = -1; + string last_s; + + if (donull) { + g.GenerateNULL(); + EXPECT_TRUE(g.HasNext()); + StringPiece sp = g.Next(); + EXPECT_EQ(sp.data(), static_cast(NULL)); + EXPECT_EQ(sp.size(), 0); + } + + while (g.HasNext()) { + string s = g.Next().as_string(); + n++; + + // Check that all characters in s appear in alphabet. + for (const char *p = s.c_str(); *p != '\0'; ) { + Rune r; + p += chartorune(&r, p); + EXPECT_TRUE(utfrune(alphabet.c_str(), r) != NULL); + } + + // Check that string is properly ordered w.r.t. previous string. + int l = utflen(s.c_str()); + EXPECT_LE(l, len); + if (last_l < l) { + last_l = l; + } else { + EXPECT_EQ(last_l, l); + EXPECT_LT(last_s, s); + } + last_s = s; + } + + // Check total string count. + int64 m = 0; + int alpha = utflen(alphabet.c_str()); + if (alpha == 0) // Degenerate case. 
+ len = 0; + for (int i = 0; i <= len; i++) + m += IntegerPower(alpha, i); + EXPECT_EQ(n, m); +} + +TEST(StringGenerator, NoLength) { + RunTest(0, "abc", false); +} + +TEST(StringGenerator, NoLengthNoAlphabet) { + RunTest(0, "", false); +} + +TEST(StringGenerator, NoAlphabet) { + RunTest(5, "", false); +} + +TEST(StringGenerator, Simple) { + RunTest(3, "abc", false); +} + +TEST(StringGenerator, UTF8) { + RunTest(4, "abc\xE2\x98\xBA", false); +} + +TEST(StringGenerator, GenNULL) { + RunTest(0, "abc", true); + RunTest(0, "", true); + RunTest(5, "", true); + RunTest(3, "abc", true); + RunTest(4, "abc\xE2\x98\xBA", true); +} + +} // namespace re2 diff --git a/re2/testing/tester.cc b/re2/testing/tester.cc new file mode 100644 index 0000000..003dc5a --- /dev/null +++ b/re2/testing/tester.cc @@ -0,0 +1,640 @@ +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Regular expression engine tester -- test all the implementations against each other. + +#include "util/util.h" +#include "util/flags.h" +#include "re2/testing/tester.h" +#include "re2/prog.h" +#include "re2/re2.h" +#include "re2/regexp.h" + +DEFINE_bool(dump_prog, false, "dump regexp program"); +DEFINE_bool(log_okay, false, "log successful runs"); +DEFINE_bool(dump_rprog, false, "dump reversed regexp program"); + +DEFINE_int32(max_regexp_failures, 100, + "maximum number of regexp test failures (-1 = unlimited)"); + +DEFINE_string(regexp_engines, "", "pattern to select regexp engines to test"); + +namespace re2 { + +enum { + kMaxSubmatch = 1+16, // $0...$16 +}; + +const char* engine_types[kEngineMax] = { + "Backtrack", + "NFA", + "DFA", + "DFA1", + "OnePass", + "BitState", + "RE2", + "RE2a", + "RE2b", + "PCRE", +}; + +// Returns the name string for the type t. 
+static string EngineString(Engine t) { + if (t < 0 || t >= arraysize(engine_types) || engine_types[t] == NULL) { + return StringPrintf("type%d", static_cast(t)); + } + return engine_types[t]; +} + +// Returns bit mask of engines to use. +static uint32 Engines() { + static uint32 cached_engines; + static bool did_parse; + + if (did_parse) + return cached_engines; + + if (FLAGS_regexp_engines.empty()) { + cached_engines = ~0; + } else { + for (Engine i = static_cast(0); i < kEngineMax; i++) + if (strstr(EngineString(i).c_str(), FLAGS_regexp_engines.c_str())) + cached_engines |= 1<(0); i < kEngineMax; i++) { + if (cached_engines & (1<(s.begin() - text.begin()), + static_cast(s.end() - text.begin())); +} + +// Returns whether text contains non-ASCII (>= 0x80) bytes. +static bool NonASCII(const StringPiece& text) { + for (int i = 0; i < text.size(); i++) + if ((uint8)text[i] >= 0x80) + return true; + return false; +} + +// Returns string representation of match kind. +static string FormatKind(Prog::MatchKind kind) { + switch (kind) { + case Prog::kFullMatch: + return "full match"; + case Prog::kLongestMatch: + return "longest match"; + case Prog::kFirstMatch: + return "first match"; + case Prog::kManyMatch: + return "many match"; + } + return "???"; +} + +// Returns string representation of anchor kind. 
+static string FormatAnchor(Prog::Anchor anchor) { + switch (anchor) { + case Prog::kAnchored: + return "anchored"; + case Prog::kUnanchored: + return "unanchored"; + } + return "???"; +} + +struct ParseMode { + Regexp::ParseFlags parse_flags; + string desc; +}; + +static const Regexp::ParseFlags single_line = + Regexp::LikePerl; +static const Regexp::ParseFlags multi_line = + static_cast(Regexp::LikePerl & ~Regexp::OneLine); + +static ParseMode parse_modes[] = { + { single_line, "single-line" }, + { single_line|Regexp::Latin1, "single-line, latin1" }, + { multi_line, "multiline" }, + { multi_line|Regexp::NonGreedy, "multiline, nongreedy" }, + { multi_line|Regexp::Latin1, "multiline, latin1" }, +}; + +static string FormatMode(Regexp::ParseFlags flags) { + for (int i = 0; i < arraysize(parse_modes); i++) + if (parse_modes[i].parse_flags == flags) + return parse_modes[i].desc; + return StringPrintf("%#x", static_cast(flags)); +} + +// Constructs and saves all the matching engines that +// will be required for the given tests. +TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind, + Regexp::ParseFlags flags) + : regexp_str_(regexp_str), + kind_(kind), + flags_(flags), + error_(false), + regexp_(NULL), + num_captures_(0), + prog_(NULL), + rprog_(NULL), + re_(NULL), + re2_(NULL) { + + VLOG(1) << CEscape(regexp_str); + + // Compile regexp to prog. + // Always required - needed for backtracking (reference implementation). 
+ RegexpStatus status; + regexp_ = Regexp::Parse(regexp_str, flags, &status); + if (regexp_ == NULL) { + LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_) + << " mode: " << FormatMode(flags); + error_ = true; + return; + } + num_captures_ = regexp_->NumCaptures(); + prog_ = regexp_->CompileToProg(0); + if (prog_ == NULL) { + LOG(INFO) << "Cannot compile: " << CEscape(regexp_str_); + error_ = true; + return; + } + if (FLAGS_dump_prog) { + LOG(INFO) << "Prog for " + << " regexp " + << CEscape(regexp_str_) + << " (" << FormatKind(kind_) + << ", " << FormatMode(flags_) + << ")\n" + << prog_->Dump(); + } + + // Compile regexp to reversed prog. Only needed for DFA engines. + if (Engines() & ((1<CompileToReverseProg(0); + if (rprog_ == NULL) { + LOG(INFO) << "Cannot reverse compile: " << CEscape(regexp_str_); + error_ = true; + return; + } + if (FLAGS_dump_rprog) + LOG(INFO) << rprog_->Dump(); + } + + // Create re string that will be used for RE and RE2. + string re = regexp_str.as_string(); + // Accommodate flags. + // Regexp::Latin1 will be accommodated below. + if (!(flags & Regexp::OneLine)) + re = "(?m)" + re; + if (flags & Regexp::NonGreedy) + re = "(?U)" + re; + if (flags & Regexp::DotNL) + re = "(?s)" + re; + + // Compile regexp to RE2. + if (Engines() & ((1<error().empty()) { + LOG(INFO) << "Cannot RE2: " << CEscape(re); + error_ = true; + return; + } + } + + // Compile regexp to RE. + // PCRE as exposed by the RE interface isn't always usable. + // 1. It disagrees about handling of empty-string repetitions + // like matching (a*)* against "b". PCRE treats the (a*) as + // occurring once, while we treat it as occurring not at all. + // 2. It treats $ as this weird thing meaning end of string + // or before the \n at the end of the string. + // 3. It doesn't implement POSIX leftmost-longest matching. + // MimicsPCRE() detects 1 and 2.
+ if ((Engines() & (1<MimicsPCRE() && + kind_ != Prog::kLongestMatch) { + PCRE_Options o; + o.set_option(PCRE::UTF8); + if (flags & Regexp::Latin1) + o.set_option(PCRE::None); + // PCRE has interface bug keeping us from finding $0, so + // add one more layer of parens. + re_ = new PCRE("("+re+")", o); + if (!re_->error().empty()) { + LOG(INFO) << "Cannot PCRE: " << CEscape(re); + error_ = true; + return; + } + } +} + +TestInstance::~TestInstance() { + if (regexp_) + regexp_->Decref(); + delete prog_; + delete rprog_; + delete re_; + delete re2_; +} + +// Runs a single search using the named engine type. +// This interface hides all the irregularities of the various +// engine interfaces from the rest of this file. +void TestInstance::RunSearch(Engine type, + const StringPiece& orig_text, + const StringPiece& orig_context, + Prog::Anchor anchor, + Result *result) { + memset(result, 0, sizeof *result); + if (regexp_ == NULL) { + result->skipped = true; + return; + } + int nsubmatch = 1 + num_captures_; // NumCaptures doesn't count $0 + if (nsubmatch > kMaxSubmatch) + nsubmatch = kMaxSubmatch; + + StringPiece text = orig_text; + StringPiece context = orig_context; + + switch (type) { + default: + LOG(FATAL) << "Bad RunSearch type: " << (int)type; + + case kEngineBacktrack: + if (prog_ == NULL) { + result->skipped = true; + break; + } + result->matched = + prog_->UnsafeSearchBacktrack(text, context, anchor, kind_, + result->submatch, nsubmatch); + result->have_submatch = true; + break; + + case kEngineNFA: + if (prog_ == NULL) { + result->skipped = true; + break; + } + result->matched = + prog_->SearchNFA(text, context, anchor, kind_, + result->submatch, nsubmatch); + result->have_submatch = true; + break; + + case kEngineDFA: + if (prog_ == NULL) { + result->skipped = true; + break; + } + result->matched = prog_->SearchDFA(text, context, anchor, kind_, NULL, + &result->skipped, NULL); + break; + + case kEngineDFA1: + if (prog_ == NULL || rprog_ == NULL) { + 
result->skipped = true; + break; + } + result->matched = + prog_->SearchDFA(text, context, anchor, kind_, result->submatch, + &result->skipped, NULL); + // If anchored, no need for second run, + // but do it anyway to find more bugs. + if (result->matched) { + if (!rprog_->SearchDFA(result->submatch[0], context, + Prog::kAnchored, Prog::kLongestMatch, + result->submatch, + &result->skipped, NULL)) { + LOG(ERROR) << "Reverse DFA inconsistency: " << CEscape(regexp_str_) + << " on " << CEscape(text); + result->matched = false; + } + } + result->have_submatch0 = true; + break; + + case kEngineOnePass: + if (prog_ == NULL || + anchor == Prog::kUnanchored || + !prog_->IsOnePass() || + nsubmatch > Prog::kMaxOnePassCapture) { + result->skipped = true; + break; + } + result->matched = prog_->SearchOnePass(text, context, anchor, kind_, + result->submatch, nsubmatch); + result->have_submatch = true; + break; + + case kEngineBitState: + if (prog_ == NULL) { + result->skipped = true; + break; + } + result->matched = prog_->SearchBitState(text, context, anchor, kind_, + result->submatch, nsubmatch); + result->have_submatch = true; + break; + + case kEngineRE2: + case kEngineRE2a: + case kEngineRE2b: { + if (!re2_ || text.end() != context.end()) { + result->skipped = true; + break; + } + + RE2::Anchor re_anchor; + if (anchor == Prog::kAnchored) + re_anchor = RE2::ANCHOR_START; + else + re_anchor = RE2::UNANCHORED; + if (kind_ == Prog::kFullMatch) + re_anchor = RE2::ANCHOR_BOTH; + + result->matched = re2_->Match(context, + text.begin() - context.begin(), + text.end() - context.begin(), + re_anchor, result->submatch, nsubmatch); + result->have_submatch = nsubmatch > 0; + break; + } + + case kEnginePCRE: { + if (!re_ || text.begin() != context.begin() || + text.end() != context.end()) { + result->skipped = true; + break; + } + + const PCRE::Arg **argptr = new const PCRE::Arg*[nsubmatch]; + PCRE::Arg *a = new PCRE::Arg[nsubmatch]; + for (int i = 0; i < nsubmatch; i++) { + a[i] = 
PCRE::Arg(&result->submatch[i]); + argptr[i] = &a[i]; + } + int consumed; + PCRE::Anchor pcre_anchor; + if (anchor == Prog::kAnchored) + pcre_anchor = PCRE::ANCHOR_START; + else + pcre_anchor = PCRE::UNANCHORED; + if (kind_ == Prog::kFullMatch) + pcre_anchor = PCRE::ANCHOR_BOTH; + re_->ClearHitLimit(); + result->matched = + re_->DoMatch(text, + pcre_anchor, + &consumed, + argptr, nsubmatch); + if (re_->HitLimit()) { + result->untrusted = true; + delete[] argptr; + delete[] a; + break; + } + result->have_submatch = true; + + // Work around RE interface bug: PCRE returns -1 as the + // offsets for an unmatched subexpression, and RE should + // turn that into StringPiece(NULL) but in fact it uses + // StringPiece(text.begin() - 1, 0). Oops. + for (int i = 0; i < nsubmatch; i++) + if (result->submatch[i].begin() == text.begin() - 1) + result->submatch[i] = NULL; + delete[] argptr; + delete[] a; + break; + } + } + + if (!result->matched) + memset(result->submatch, 0, sizeof result->submatch); +} + +// Checks whether r is okay given that correct is the right answer. +// Specifically, r's answers have to match (but it doesn't have to +// claim to have all the answers). +static bool ResultOkay(const Result& r, const Result& correct) { + if (r.skipped) + return true; + if (r.matched != correct.matched) + return false; + if (r.have_submatch || r.have_submatch0) { + for (int i = 0; i < kMaxSubmatch; i++) { + if (correct.submatch[i].begin() != r.submatch[i].begin() || + correct.submatch[i].size() != r.submatch[i].size()) + return false; + if (!r.have_submatch) + break; + } + } + return true; +} + +// Runs a single test. +bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context, + Prog::Anchor anchor) { + // Backtracking is the gold standard. + Result correct; + RunSearch(kEngineBacktrack, text, context, anchor, &correct); + if (correct.skipped) { + if (regexp_ == NULL) + return true; + LOG(ERROR) << "Skipped backtracking! 
" << CEscape(regexp_str_) + << " " << FormatMode(flags_); + return false; + } + VLOG(1) << "Try: regexp " << CEscape(regexp_str_) + << " text " << CEscape(text) + << " (" << FormatKind(kind_) + << ", " << FormatAnchor(anchor) + << ", " << FormatMode(flags_) + << ")"; + + // Compare the others. + bool all_okay = true; + for (Engine i = kEngineBacktrack+1; i < kEngineMax; i++) { + if (!(Engines() & (1< 0 && --FLAGS_max_regexp_failures == 0) + LOG(QFATAL) << "Too many regexp failures."; + } + + return all_okay; +} + +void TestInstance::LogMatch(const char* prefix, Engine e, + const StringPiece& text, const StringPiece& context, + Prog::Anchor anchor) { + LOG(INFO) << prefix + << EngineString(e) + << " regexp " + << CEscape(regexp_str_) + << " " + << CEscape(regexp_->ToString()) + << " text " + << CEscape(text) + << " (" + << text.begin() - context.begin() + << "," + << text.end() - context.begin() + << ") of context " + << CEscape(context) + << " (" << FormatKind(kind_) + << ", " << FormatAnchor(anchor) + << ", " << FormatMode(flags_) + << ")"; +} + +static Prog::MatchKind kinds[] = { + Prog::kFirstMatch, + Prog::kLongestMatch, + Prog::kFullMatch, +}; + +// Test all possible match kinds and parse modes. 
+Tester::Tester(const StringPiece& regexp) { + error_ = false; + for (int i = 0; i < arraysize(kinds); i++) { + for (int j = 0; j < arraysize(parse_modes); j++) { + TestInstance* t = new TestInstance(regexp, kinds[i], + parse_modes[j].parse_flags); + error_ |= t->error(); + v_.push_back(t); + } + } +} + +Tester::~Tester() { + for (int i = 0; i < v_.size(); i++) + delete v_[i]; +} + +bool Tester::TestCase(const StringPiece& text, const StringPiece& context, + Prog::Anchor anchor) { + bool okay = true; + for (int i = 0; i < v_.size(); i++) + okay &= (!v_[i]->error() && v_[i]->RunCase(text, context, anchor)); + return okay; +} + +static Prog::Anchor anchors[] = { + Prog::kAnchored, + Prog::kUnanchored +}; + +bool Tester::TestInput(const StringPiece& text) { + bool okay = TestInputInContext(text, text); + if (text.size() > 0) { + StringPiece sp; + sp = text; + sp.remove_prefix(1); + okay &= TestInputInContext(sp, text); + sp = text; + sp.remove_suffix(1); + okay &= TestInputInContext(sp, text); + } + return okay; +} + +bool Tester::TestInputInContext(const StringPiece& text, + const StringPiece& context) { + bool okay = true; + for (int i = 0; i < arraysize(anchors); i++) + okay &= TestCase(text, context, anchors[i]); + return okay; +} + +bool TestRegexpOnText(const StringPiece& regexp, + const StringPiece& text) { + Tester t(regexp); + return t.TestInput(text); +} + +} // namespace re2 diff --git a/re2/testing/tester.h b/re2/testing/tester.h new file mode 100644 index 0000000..6e16e77 --- /dev/null +++ b/re2/testing/tester.h @@ -0,0 +1,121 @@ +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Comparative tester for regular expression matching. +// Checks all implementations against each other. 
+ +#ifndef RE2_TESTING_TESTER_H__ +#define RE2_TESTING_TESTER_H__ + +#include "re2/stringpiece.h" +#include "re2/prog.h" +#include "re2/regexp.h" +#include "re2/re2.h" +#include "util/pcre.h" + +namespace re2 { + +class Regexp; + +// All the supported regexp engines. +enum Engine { + kEngineBacktrack = 0, // Prog::BadSearchBacktrack + kEngineNFA, // Prog::SearchNFA + kEngineDFA, // Prog::SearchDFA, only ask whether it matched + kEngineDFA1, // Prog::SearchDFA, ask for match[0] + kEngineOnePass, // Prog::SearchOnePass, if applicable + kEngineBitState, // Prog::SearchBitState + kEngineRE2, // RE2, all submatches + kEngineRE2a, // RE2, only ask for match[0] + kEngineRE2b, // RE2, only ask whether it matched + kEnginePCRE, // PCRE (util/pcre.h) + + kEngineMax, +}; + +// Make normal math on the enum preserve the type. +// By default, C++ doesn't define ++ on enum, and e+1 has type int. +static inline void operator++(Engine& e, int unused) { + e = static_cast<Engine>(e+1); +} + +static inline Engine operator+(Engine e, int i) { + return static_cast<Engine>(static_cast<int>(e)+i); +} + +// A TestInstance caches per-regexp state for a given +// regular expression in a given configuration +// (UTF-8 vs Latin1, longest vs first match, etc.). +class TestInstance { + public: + struct Result; + + TestInstance(const StringPiece& regexp, Prog::MatchKind kind, + Regexp::ParseFlags flags); + ~TestInstance(); + Regexp::ParseFlags flags() { return flags_; } + bool error() { return error_; } + + // Runs a single test case: search in text, which is in context, + // using the given anchoring. + bool RunCase(const StringPiece& text, const StringPiece& context, + Prog::Anchor anchor); + + private: + // Runs a single search using the named engine type. 
+ void RunSearch(Engine type, + const StringPiece& text, const StringPiece& context, + Prog::Anchor anchor, + Result *result); + + void LogMatch(const char* prefix, Engine e, const StringPiece& text, + const StringPiece& context, Prog::Anchor anchor); + + const StringPiece& regexp_str_; // regexp being tested + Prog::MatchKind kind_; // kind of match + Regexp::ParseFlags flags_; // flags for parsing regexp_str_ + bool error_; // error during constructor? + + Regexp* regexp_; // parsed regexp + int num_captures_; // regexp_->NumCaptures() cached + Prog* prog_; // compiled program + Prog* rprog_; // compiled reverse program + PCRE* re_; // PCRE implementation + RE2* re2_; // RE2 implementation + + DISALLOW_EVIL_CONSTRUCTORS(TestInstance); +}; + +// A group of TestInstances for all possible configurations. +class Tester { + public: + explicit Tester(const StringPiece& regexp); + ~Tester(); + + bool error() { return error_; } + + // Runs a single test case: search in text, which is in context, + // using the given anchoring. + bool TestCase(const StringPiece& text, const StringPiece& context, + Prog::Anchor anchor); + + // Run TestCase(text, text, anchor) for all anchoring modes. + bool TestInput(const StringPiece& text); + + // Run TestCase(text, context, anchor) for all anchoring modes. + bool TestInputInContext(const StringPiece& text, const StringPiece& context); + + private: + bool error_; + vector<TestInstance*> v_; + + DISALLOW_EVIL_CONSTRUCTORS(Tester); +}; + +// Run all possible tests using regexp and text. +bool TestRegexpOnText(const StringPiece& regexp, const StringPiece& text); + +} // namespace re2 + +#endif // RE2_TESTING_TESTER_H__ diff --git a/re2/testing/unicode_test.py b/re2/testing/unicode_test.py new file mode 100755 index 0000000..a88a3ad --- /dev/null +++ b/re2/testing/unicode_test.py @@ -0,0 +1,207 @@ +#!/usr/bin/python2.4 +# +# Copyright 2008 The RE2 Authors. All Rights Reserved. 
+# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +"""Unittest for the util/regexp/re2/unicode.py module.""" + +import os +import StringIO +from google3.pyglib import flags +from google3.testing.pybase import googletest +from google3.util.regexp.re2 import unicode + +_UNICODE_DIR = os.path.join(flags.FLAGS.test_srcdir, "google3", "third_party", + "unicode", "ucd-5.1.0") + + +class ConvertTest(googletest.TestCase): + """Test the conversion functions.""" + + def testUInt(self): + self.assertEquals(0x0000, unicode._UInt("0000")) + self.assertEquals(0x263A, unicode._UInt("263A")) + self.assertEquals(0x10FFFF, unicode._UInt("10FFFF")) + self.assertRaises(unicode.InputError, unicode._UInt, "263") + self.assertRaises(unicode.InputError, unicode._UInt, "263AAAA") + self.assertRaises(unicode.InputError, unicode._UInt, "110000") + + def testURange(self): + self.assertEquals([1, 2, 3], unicode._URange("0001..0003")) + self.assertEquals([1], unicode._URange("0001")) + self.assertRaises(unicode.InputError, unicode._URange, "0001..0003..0005") + self.assertRaises(unicode.InputError, unicode._URange, "0003..0001") + self.assertRaises(unicode.InputError, unicode._URange, "0001..0001") + + def testUStr(self): + self.assertEquals("0x263A", unicode._UStr(0x263a)) + self.assertEquals("0x10FFFF", unicode._UStr(0x10FFFF)) + self.assertRaises(unicode.InputError, unicode._UStr, 0x110000) + self.assertRaises(unicode.InputError, unicode._UStr, -1) + + +_UNICODE_TABLE = """# Commented line, should be ignored. +# The next line is blank and should be ignored. 
+ +0041;Capital A;Line 1 +0061..007A;Lowercase;Line 2 +1F00;<Greek, First>;Ignored +1FFE;<Greek, Last>;Line 3 +10FFFF;Runemax;Line 4 +0000;Zero;Line 5 +""" + +_BAD_TABLE1 = """ +111111;Not a code point; +""" + +_BAD_TABLE2 = """ +0000;;Missing +""" + +_BAD_TABLE3 = """ +0010..0001;Bad range; +""" + + +class AbortError(Exception): + """Function should not have been called.""" + + +def Abort(): + raise AbortError("Abort") + + +def StringTable(s, n, f): + unicode.ReadUnicodeTable(StringIO.StringIO(s), n, f) + + +class ReadUnicodeTableTest(googletest.TestCase): + """Test the ReadUnicodeTable function.""" + + def testSimpleTable(self): + + ncall = [0] # can't assign to ordinary int in DoLine + + def DoLine(codes, fields): + self.assertEquals(3, len(fields)) + ncall[0] += 1 + self.assertEquals("Line %d" % (ncall[0],), fields[2]) + if ncall[0] == 1: + self.assertEquals([0x0041], codes) + self.assertEquals("0041", fields[0]) + self.assertEquals("Capital A", fields[1]) + elif ncall[0] == 2: + self.assertEquals(range(0x0061, 0x007A + 1), codes) + self.assertEquals("0061..007A", fields[0]) + self.assertEquals("Lowercase", fields[1]) + elif ncall[0] == 3: + self.assertEquals(range(0x1F00, 0x1FFE + 1), codes) + self.assertEquals("1F00..1FFE", fields[0]) + self.assertEquals("Greek", fields[1]) + elif ncall[0] == 4: + self.assertEquals([0x10FFFF], codes) + self.assertEquals("10FFFF", fields[0]) + self.assertEquals("Runemax", fields[1]) + elif ncall[0] == 5: + self.assertEquals([0x0000], codes) + self.assertEquals("0000", fields[0]) + self.assertEquals("Zero", fields[1]) + + StringTable(_UNICODE_TABLE, 3, DoLine) + self.assertEquals(5, ncall[0]) + + def testErrorTables(self): + self.assertRaises(unicode.InputError, StringTable, _UNICODE_TABLE, 4, Abort) + self.assertRaises(unicode.InputError, StringTable, _UNICODE_TABLE, 2, Abort) + self.assertRaises(unicode.InputError, StringTable, _BAD_TABLE1, 3, Abort) + self.assertRaises(unicode.InputError, StringTable, _BAD_TABLE2, 3, Abort) + 
self.assertRaises(unicode.InputError, StringTable, _BAD_TABLE3, 3, Abort) + + +class ParseContinueTest(googletest.TestCase): + """Test the ParseContinue function.""" + + def testParseContinue(self): + self.assertEquals(("Private Use", "First"), + unicode._ParseContinue("<Private Use, First>")) + self.assertEquals(("Private Use", "Last"), + unicode._ParseContinue("<Private Use, Last>")) + self.assertEquals(("", None), + unicode._ParseContinue("")) + + +class CaseGroupsTest(googletest.TestCase): + """Test the CaseGroups function (and the CaseFoldingReader).""" + + def FindGroup(self, c): + if type(c) == str: + c = ord(c) + for g in self.groups: + if c in g: + return g + return None + + def testCaseGroups(self): + self.groups = unicode.CaseGroups(unicode_dir=_UNICODE_DIR) + self.assertEquals([ord("A"), ord("a")], self.FindGroup("a")) + self.assertEquals(None, self.FindGroup("0")) + + +class ScriptsTest(googletest.TestCase): + """Test the Scripts function (and the ScriptsReader).""" + + def FindScript(self, c): + if type(c) == str: + c = ord(c) + for script, codes in self.scripts.items(): + for code in codes: + if c == code: + return script + return None + + def testScripts(self): + self.scripts = unicode.Scripts(unicode_dir=_UNICODE_DIR) + self.assertEquals("Latin", self.FindScript("a")) + self.assertEquals("Common", self.FindScript("0")) + self.assertEquals(None, self.FindScript(0xFFFE)) + + +class CategoriesTest(googletest.TestCase): + """Test the Categories function (and the UnicodeDataReader).""" + + def FindCategory(self, c): + if type(c) == str: + c = ord(c) + short = None + for category, codes in self.categories.items(): + for code in codes: + if code == c: + # prefer category Nd over N + if len(category) > 1: + return category + if short == None: + short = category + return short + + def testCategories(self): + self.categories = unicode.Categories(unicode_dir=_UNICODE_DIR) + self.assertEquals("Ll", self.FindCategory("a")) + self.assertEquals("Nd", self.FindCategory("0")) + self.assertEquals("Lo", 
self.FindCategory(0xAD00)) # in First, Last range + self.assertEquals(None, self.FindCategory(0xFFFE)) + self.assertEquals("Lo", self.FindCategory(0x8B5A)) + self.assertEquals("Lo", self.FindCategory(0x6C38)) + self.assertEquals("Lo", self.FindCategory(0x92D2)) + self.assertTrue(ord("a") in self.categories["L"]) + self.assertTrue(ord("0") in self.categories["N"]) + self.assertTrue(0x8B5A in self.categories["L"]) + self.assertTrue(0x6C38 in self.categories["L"]) + self.assertTrue(0x92D2 in self.categories["L"]) + +def main(): + googletest.main() + +if __name__ == "__main__": + main() diff --git a/re2/unicode.py b/re2/unicode.py old mode 100755 new mode 100644 diff --git a/testinstall.cc b/testinstall.cc index 40b7a8a..17edfb4 100644 --- a/testinstall.cc +++ b/testinstall.cc @@ -1,3 +1,7 @@ +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + #include #include #include diff --git a/util/logging.h b/util/logging.h index c8f6604..4443f7c 100644 --- a/util/logging.h +++ b/util/logging.h @@ -48,17 +48,25 @@ class LogMessage { public: - LogMessage(const char* file, int line) { + LogMessage(const char* file, int line) : flushed_(false) { stream() << file << ":" << line << ": "; } - ~LogMessage() { + void Flush() { stream() << "\n"; string s = str_.str(); - if(write(2, s.data(), s.size()) < 0) {} // shut up gcc + int n = (int)s.size(); // shut up msvc + if(write(2, s.data(), n) < 0) {} // shut up gcc + flushed_ = true; + } + ~LogMessage() { + if (!flushed_) { + Flush(); + } } ostream& stream() { return str_; } private: + bool flushed_; std::ostringstream str_; DISALLOW_EVIL_CONSTRUCTORS(LogMessage); }; @@ -68,7 +76,7 @@ class LogMessageFatal : public LogMessage { LogMessageFatal(const char* file, int line) : LogMessage(file, line) { } ~LogMessageFatal() { - std::cerr << "\n"; + Flush(); abort(); } private: diff --git a/util/mutex.h b/util/mutex.h index 
d2f69e7..9787bfb 100644 --- a/util/mutex.h +++ b/util/mutex.h @@ -72,7 +72,7 @@ class Mutex { MutexType mutex_; // Catch the error of writing Mutex when intending MutexLock. - Mutex(Mutex *ignored) {} + Mutex(Mutex *ignored); // Disallow "evil" constructors Mutex(const Mutex&); void operator=(const Mutex&); @@ -185,6 +185,27 @@ class WriterMutexLock { #define ReaderMutexLock(x) COMPILE_ASSERT(0, rmutex_lock_decl_missing_var_name) #define WriterMutexLock(x) COMPILE_ASSERT(0, wmutex_lock_decl_missing_var_name) +// Provide safe way to declare and use global, linker-initialized mutex. Sigh. +#ifdef HAVE_PTHREAD + +#define GLOBAL_MUTEX(name) \ + static pthread_mutex_t (name) = PTHREAD_MUTEX_INITIALIZER +#define GLOBAL_MUTEX_LOCK(name) \ + pthread_mutex_lock(&(name)) +#define GLOBAL_MUTEX_UNLOCK(name) \ + pthread_mutex_unlock(&(name)) + +#else + +#define GLOBAL_MUTEX(name) \ + static Mutex name +#define GLOBAL_MUTEX_LOCK(name) \ + name.Lock() +#define GLOBAL_MUTEX_UNLOCK(name) \ + name.Unlock() + +#endif + } // namespace re2 #endif /* #define RE2_UTIL_MUTEX_H_ */ diff --git a/util/sparse_array.h b/util/sparse_array.h index c024bed..3e33f89 100644 --- a/util/sparse_array.h +++ b/util/sparse_array.h @@ -224,13 +224,14 @@ class SparseArray { int max_size_; int* sparse_to_dense_; vector dense_; + bool valgrind_; DISALLOW_EVIL_CONSTRUCTORS(SparseArray); }; template SparseArray::SparseArray() - : size_(0), max_size_(0), sparse_to_dense_(NULL), dense_() {} + : size_(0), max_size_(0), sparse_to_dense_(NULL), dense_(), valgrind_(RunningOnValgrind()) {} // IndexValue pairs: exposed in SparseArray::iterator. template @@ -272,7 +273,7 @@ void SparseArray::resize(int new_max_size) { if (sparse_to_dense_) { memmove(a, sparse_to_dense_, max_size_*sizeof a[0]); // Don't need to zero the memory but appease Valgrind. 
- if (RunningOnValgrind()) { + if (valgrind_) { for (int i = max_size_; i < new_max_size; i++) a[i] = 0xababababU; } @@ -417,9 +418,10 @@ void SparseArray::create_index(int i) { template SparseArray::SparseArray(int max_size) { max_size_ = max_size; sparse_to_dense_ = new int[max_size]; + valgrind_ = RunningOnValgrind(); dense_.resize(max_size); // Don't need to zero the new memory, but appease Valgrind. - if (RunningOnValgrind()) { + if (valgrind_) { for (int i = 0; i < max_size; i++) { sparse_to_dense_[i] = 0xababababU; dense_[i].index_ = 0xababababU; diff --git a/util/sparse_set.h b/util/sparse_set.h index 9cb5753..165dd09 100644 --- a/util/sparse_set.h +++ b/util/sparse_set.h @@ -54,15 +54,16 @@ namespace re2 { class SparseSet { public: SparseSet() - : size_(0), max_size_(0), sparse_to_dense_(NULL), dense_(NULL) {} + : size_(0), max_size_(0), sparse_to_dense_(NULL), dense_(NULL), valgrind_(RunningOnValgrind()) {} SparseSet(int max_size) { max_size_ = max_size; sparse_to_dense_ = new int[max_size]; dense_ = new int[max_size]; + valgrind_ = RunningOnValgrind(); // Don't need to zero the memory, but do so anyway // to appease Valgrind. 
- if (RunningOnValgrind()) { + if (valgrind_) { for (int i = 0; i < max_size; i++) { dense_[i] = 0xababababU; sparse_to_dense_[i] = 0xababababU; @@ -94,7 +95,7 @@ class SparseSet { int* a = new int[new_max_size]; if (sparse_to_dense_) { memmove(a, sparse_to_dense_, max_size_*sizeof a[0]); - if (RunningOnValgrind()) { + if (valgrind_) { for (int i = max_size_; i < new_max_size; i++) a[i] = 0xababababU; } @@ -105,7 +106,7 @@ class SparseSet { a = new int[new_max_size]; if (dense_) { memmove(a, dense_, size_*sizeof a[0]); - if (RunningOnValgrind()) { + if (valgrind_) { for (int i = size_; i < new_max_size; i++) a[i] = 0xababababU; } @@ -168,6 +169,7 @@ class SparseSet { int max_size_; int* sparse_to_dense_; int* dense_; + bool valgrind_; DISALLOW_EVIL_CONSTRUCTORS(SparseSet); }; diff --git a/util/util.h b/util/util.h index bf897ae..463cbfb 100644 --- a/util/util.h +++ b/util/util.h @@ -14,6 +14,7 @@ #include #include #include +#include // For isdigit, isalpha. // C++ #include @@ -22,7 +23,7 @@ #include #include #include -#include +#include #include #include @@ -47,9 +48,8 @@ using std::make_pair; #else #include // using gnustl #endif - using std::tr1::unordered_set; - + #elif defined(__GNUC__) && !defined(USE_CXX0X) #include @@ -92,6 +92,7 @@ template struct CompileAssert {}; // Fake lock annotations. 
For real ones, see // http://code.google.com/p/data-race-test/ +#ifndef ANNOTATE_PUBLISH_MEMORY_RANGE #define ANNOTATE_PUBLISH_MEMORY_RANGE(a, b) #define ANNOTATE_IGNORE_WRITES_BEGIN() #define ANNOTATE_IGNORE_WRITES_END() @@ -99,6 +100,8 @@ template struct CompileAssert {}; #define NO_THREAD_SAFETY_ANALYSIS #define ANNOTATE_HAPPENS_BEFORE(x) #define ANNOTATE_HAPPENS_AFTER(x) +#define ANNOTATE_UNPROTECTED_READ(x) (x) +#endif class StringPiece; diff --git a/util/valgrind.cc b/util/valgrind.cc index 749bb59..46f804b 100644 --- a/util/valgrind.cc +++ b/util/valgrind.cc @@ -7,18 +7,12 @@ namespace re2 { -static bool checkValgrind() { +int RunningOnValgrind() { #ifdef RUNNING_ON_VALGRIND return RUNNING_ON_VALGRIND; #else - return false; + return 0; #endif } -static const int valgrind = checkValgrind(); - -int RunningOnValgrind() { - return valgrind; -} - } // namespace re2 -- cgit v1.2.3