diff options
Diffstat (limited to 're2/make_unicode_casefold.py')
-rwxr-xr-x | re2/make_unicode_casefold.py | 146 |
1 files changed, 146 insertions, 0 deletions
diff --git a/re2/make_unicode_casefold.py b/re2/make_unicode_casefold.py new file mode 100755 index 0000000..3375d2e --- /dev/null +++ b/re2/make_unicode_casefold.py @@ -0,0 +1,146 @@ +#!/usr/bin/python +# coding=utf-8 +# +# Copyright 2008 The RE2 Authors. All Rights Reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +# See unicode_casefold.h for description of case folding tables. + +"""Generate C++ table for Unicode case folding.""" + +import unicode, sys + +_header = """ +// GENERATED BY make_unicode_casefold.py; DO NOT EDIT. +// make_unicode_casefold.py >unicode_casefold.cc + +#include "re2/unicode_casefold.h" + +namespace re2 { + +""" + +_trailer = """ + +} // namespace re2 + +""" + +def _Delta(a, b): + """Compute the delta for b - a. Even/odd and odd/even + are handled specially, as described above.""" + if a+1 == b: + if a%2 == 0: + return 'EvenOdd' + else: + return 'OddEven' + if a == b+1: + if a%2 == 0: + return 'OddEven' + else: + return 'EvenOdd' + return b - a + +def _AddDelta(a, delta): + """Return a + delta, handling EvenOdd and OddEven specially.""" + if type(delta) == int: + return a+delta + if delta == 'EvenOdd': + if a%2 == 0: + return a+1 + else: + return a-1 + if delta == 'OddEven': + if a%2 == 1: + return a+1 + else: + return a-1 + print >>sys.stderr, "Bad Delta: ", delta + raise "Bad Delta" + +def _MakeRanges(pairs): + """Turn a list like [(65,97), (66, 98), ..., (90,122)] + into [(65, 90, +32)].""" + ranges = [] + last = -100 + + def evenodd(last, a, b, r): + if a != last+1 or b != _AddDelta(a, r[2]): + return False + r[1] = a + return True + + def evenoddpair(last, a, b, r): + if a != last+2: + return False + delta = r[2] + d = delta + if type(delta) is not str: + return False + if delta.endswith('Skip'): + d = delta[:-4] + else: + delta = d + 'Skip' + if b != _AddDelta(a, d): + return False + r[1] = a + r[2] = delta + return True + + for a, b in pairs: + if ranges and evenodd(last, a, b, ranges[-1]): + pass + elif ranges and evenoddpair(last, a, b, ranges[-1]): + pass + else: + ranges.append([a, a, _Delta(a, b)]) + last = a + return ranges + +# The maximum size of a case-folding group. +# Case folding is implemented in parse.cc by a recursive process +# with a recursion depth equal to the size of the largest +# case-folding group, so it is important that this bound be small. +# The current tables have no group bigger than 4. +# If there are ever groups bigger than 10 or so, it will be +# time to rework the code in parse.cc. +MaxCasefoldGroup = 4 + +def main(): + lowergroups, casegroups = unicode.CaseGroups() + foldpairs = [] + seen = {} + for c in casegroups: + if len(c) > MaxCasefoldGroup: + raise unicode.Error("casefold group too long: %s" % (c,)) + for i in range(len(c)): + if c[i-1] in seen: + raise unicode.Error("bad casegroups %d -> %d" % (c[i-1], c[i])) + seen[c[i-1]] = True + foldpairs.append([c[i-1], c[i]]) + + lowerpairs = [] + for lower, group in lowergroups.iteritems(): + for g in group: + if g != lower: + lowerpairs.append([g, lower]) + + def printpairs(name, foldpairs): + foldpairs.sort() + foldranges = _MakeRanges(foldpairs) + print "// %d groups, %d pairs, %d ranges" % (len(casegroups), len(foldpairs), len(foldranges)) + print "CaseFold unicode_%s[] = {" % (name,) + for lo, hi, delta in foldranges: + print "\t{ %d, %d, %s }," % (lo, hi, delta) + print "};" + print "int num_unicode_%s = %d;" % (name, len(foldranges),) + print "" + + print _header + printpairs("casefold", foldpairs) + printpairs("tolower", lowerpairs) + print _trailer + +if __name__ == '__main__': + main() |