diff options
Diffstat (limited to 'maint/GenerateCommon.py')
-rw-r--r-- | maint/GenerateCommon.py | 355 |
1 files changed, 355 insertions, 0 deletions
#! /usr/bin/python

# PCRE2 UNICODE PROPERTY SUPPORT
# ------------------------------

# This file is a Python module containing common lists and functions for the
# GenerateXXX scripts that create various .c and .h files from Unicode data
# files. It was created as part of a reorganization of these scripts in
# December 2021.


import re


# ---------------------------------------------------------------------------
#                             DATA LISTS
# ---------------------------------------------------------------------------

# BIDI classes in the DerivedBidiClass.txt file, with comments. The resulting
# list is flat: every class name is immediately followed by its description,
# so names live at the even indexes and descriptions at the odd ones. The
# pairs are written out explicitly below and flattened to that layout.

bidi_classes = [
    item
    for pair in (
        ('AL', 'Arabic letter'),
        ('AN', 'Arabic number'),
        ('B', 'Paragraph separator'),
        ('BN', 'Boundary neutral'),
        ('CS', 'Common separator'),
        ('EN', 'European number'),
        ('ES', 'European separator'),
        ('ET', 'European terminator'),
        ('FSI', 'First strong isolate'),
        ('L', 'Left to right'),
        ('LRE', 'Left to right embedding'),
        ('LRI', 'Left to right isolate'),
        ('LRO', 'Left to right override'),
        ('NSM', 'Non-spacing mark'),
        ('ON', 'Other neutral'),
        ('PDF', 'Pop directional format'),
        ('PDI', 'Pop directional isolate'),
        ('R', 'Right to left'),
        ('RLE', 'Right to left embedding'),
        ('RLI', 'Right to left isolate'),
        ('RLO', 'Right to left override'),
        ('S', 'Segment separator'),
        ('WS', 'White space'),
    )
    for item in pair
]

# Particular category property names, with comments. NOTE: If ever this list
# is changed, the table called "catposstab" in the pcre2_auto_possess.c file
# must be edited to keep in step.

# Flat list: each two-letter category code is immediately followed by its
# description (codes at even indexes, descriptions at odd indexes).

category_names = [
    item
    for pair in (
        ('Cc', 'Control'),
        ('Cf', 'Format'),
        ('Cn', 'Unassigned'),
        ('Co', 'Private use'),
        ('Cs', 'Surrogate'),
        ('Ll', 'Lower case letter'),
        ('Lm', 'Modifier letter'),
        ('Lo', 'Other letter'),
        ('Lt', 'Title case letter'),
        ('Lu', 'Upper case letter'),
        ('Mc', 'Spacing mark'),
        ('Me', 'Enclosing mark'),
        ('Mn', 'Non-spacing mark'),
        ('Nd', 'Decimal number'),
        ('Nl', 'Letter number'),
        ('No', 'Other number'),
        ('Pc', 'Connector punctuation'),
        ('Pd', 'Dash punctuation'),
        ('Pe', 'Close punctuation'),
        ('Pf', 'Final punctuation'),
        ('Pi', 'Initial punctuation'),
        ('Po', 'Other punctuation'),
        ('Ps', 'Open punctuation'),
        ('Sc', 'Currency symbol'),
        ('Sk', 'Modifier symbol'),
        ('Sm', 'Mathematical symbol'),
        ('So', 'Other symbol'),
        ('Zl', 'Line separator'),
        ('Zp', 'Paragraph separator'),
        ('Zs', 'Space separator'),
    )
    for item in pair
]

# The Extended_Pictographic property is not found in the file where all the
# others are (GraphemeBreakProperty.txt). It comes from the emoji-data.txt
# file, but we list it here so that the name has the correct index value.
# As with the lists above, names and their comments alternate in one flat
# list; each comment starts with the property's index value.

break_properties = [
    item
    for pair in (
        ('CR', ' 0'),
        ('LF', ' 1'),
        ('Control', ' 2'),
        ('Extend', ' 3'),
        ('Prepend', ' 4'),
        ('SpacingMark', ' 5'),
        ('L', ' 6 Hangul syllable type L'),
        ('V', ' 7 Hangul syllable type V'),
        ('T', ' 8 Hangul syllable type T'),
        ('LV', ' 9 Hangul syllable type LV'),
        ('LVT', '10 Hangul syllable type LVT'),
        ('Regional_Indicator', '11'),
        ('Other', '12'),
        ('ZWJ', '13'),
        ('Extended_Pictographic', '14'),
    )
    for item in pair
]

# List of files from which the names of Boolean properties are obtained, along
# with a list of regex patterns for properties to be ignored, and a list of
# extra pattern names to add.

# getbpropslist() is called while this module is being imported and uses
# sys.exit() on failure, so "sys" has to be imported before that call rather
# than further down the file.
import sys

bool_propsfiles = ['PropList.txt', 'DerivedCoreProperties.txt', 'emoji-data.txt']
bool_propsignore = [r'^Other_', r'^Hyphen$']
bool_propsextras = ['ASCII', 'Bidi_Mirrored']


# ---------------------------------------------------------------------------
#                   GET BOOLEAN PROPERTY NAMES
# ---------------------------------------------------------------------------

# Get a list of Boolean property names from a number of files.

def getbpropslist():
    """Return the sorted list of Boolean property names.

    Each file named in bool_propsfiles is read from the Unicode.tables
    directory. After '#' comments are removed, the second ';'-separated field
    of every line is taken as a property name. A name equal to the one seen
    immediately before it is skipped, as is any name matching one of the
    bool_propsignore patterns. The bool_propsextras names are then added and
    the whole list is sorted.

    If a file cannot be opened, a message is printed and the script exits
    with status 1.
    """
    bplist = []
    bplast = ""        # Most recent name seen; deliberately kept across files

    for filename in bool_propsfiles:
        path = 'Unicode.tables/' + filename
        try:
            # Read explicitly as UTF-8 so that parsing does not depend on the
            # locale's default encoding.
            # NOTE(review): assumes the Unicode.tables files are UTF-8 - confirm.
            file = open(path, 'r', encoding='utf-8')
        except IOError:
            print(f"** Couldn't open {path}\n")
            sys.exit(1)

        with file:
            for line in file:
                line = re.sub(r'#.*', '', line)
                data = [field.strip() for field in line.split(';')]
                if len(data) <= 1 or data[1] == bplast:
                    continue
                # Remember the name even when it is ignored below, so that the
                # following lines for the same property are skipped cheaply.
                bplast = data[1]
                if not any(re.match(pat, bplast) is not None
                           for pat in bool_propsignore):
                    bplist.append(bplast)

    bplist.extend(bool_propsextras)
    bplist.sort()
    return bplist

bool_properties = getbpropslist()
# Number of 32-bit words needed to hold one bit per Boolean property.
bool_props_list_item_size = (len(bool_properties) + 31) // 32



# ---------------------------------------------------------------------------
#              COLLECTING PROPERTY NAMES AND ALIASES
# ---------------------------------------------------------------------------

script_names = ['Unknown']
abbreviations = {}

def collect_property_names():
    """Fill in script_names and abbreviations from the Unicode data files.

    Script names are appended to script_names in the order in which they
    first appear in Scripts.txt (consecutive repeats are skipped). The
    abbreviations dictionary maps a script's long name to a tuple of its
    other names taken from the "sc" lines of PropertyValueAliases.txt, and a
    Boolean property's name to a tuple of its other names taken from
    PropertyAliases.txt.
    """
    names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_]+) #')

    last_script_name = ""
    with open("Unicode.tables/Scripts.txt", encoding='utf-8') as f:
        for line in f:
            match_obj = names_re.match(line)

            if match_obj is None or match_obj.group(1) == last_script_name:
                continue

            last_script_name = match_obj.group(1)
            script_names.append(last_script_name)

    # Sometimes there is comment in the line
    # so splitting around semicolon is not enough
    value_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_ ]+))?')

    with open("Unicode.tables/PropertyValueAliases.txt", encoding='utf-8') as f:
        for line in f:
            match_obj = value_alias_re.match(line)

            if match_obj is None or match_obj.group(1) != "sc":
                continue

            short_name, long_name, extra_name = match_obj.group(2, 3, 4)
            if short_name == long_name:
                # The script has no separate abbreviation
                abbreviations[long_name] = ()
            elif extra_name is None:
                abbreviations[long_name] = (short_name,)
            else:
                abbreviations[long_name] = (short_name, extra_name)

    # We can also collect Boolean property abbreviations into the same dictionary

    bool_names = set(bool_properties)
    bin_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_]+))?')
    with open("Unicode.tables/PropertyAliases.txt", encoding='utf-8') as f:
        for line in f:
            match_obj = bin_alias_re.match(line)
            if match_obj is None:
                continue

            short_name, long_name, extra_name = match_obj.group(1, 2, 3)
            if long_name in bool_names:
                if extra_name is None:
                    abbreviations[long_name] = (short_name,)
                else:
                    abbreviations[long_name] = (short_name, extra_name)

collect_property_names()



# ---------------------------------------------------------------------------
#                      REORDERING SCRIPT NAMES
# ---------------------------------------------------------------------------

script_abbrevs = []

def reorder_scripts():
    """Reorder script_names and script_abbrevs in step with each other.

    An abbreviation is first chosen for every script: the first entry of its
    abbreviations tuple or, when that tuple is empty, the script name itself.
    Both lists are then rebuilt so that scripts whose abbreviation appears in
    ScriptExtensions.txt come first, followed by all the others; the relative
    order inside each group is unchanged.

    Raises KeyError if a script name has no entry in abbreviations.
    """
    global script_names
    global script_abbrevs

    for name in script_names:
        abbrevs = abbreviations[name]
        script_abbrevs.append(abbrevs[0] if abbrevs else name)

    extended_script_abbrevs = set()
    names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_ ]+) #')
    with open("Unicode.tables/ScriptExtensions.txt", encoding='utf-8') as f:
        for line in f:
            match_obj = names_re.match(line)
            if match_obj is not None:
                # The captured field is a space-separated list of abbreviations
                extended_script_abbrevs.update(match_obj.group(1).split(" "))

    pairs = list(zip(script_names, script_abbrevs))
    ordered = (
        [pair for pair in pairs if pair[1] in extended_script_abbrevs] +
        [pair for pair in pairs if pair[1] not in extended_script_abbrevs]
    )

    script_names = [name for name, _ in ordered]
    script_abbrevs = [abbrev for _, abbrev in ordered]

reorder_scripts()
# Number of 32-bit words needed for one bit per script that precedes
# 'Unknown' in the reordered list.
script_list_item_size = (script_names.index('Unknown') + 31) // 32


# ---------------------------------------------------------------------------
#                         DERIVED LISTS
# ---------------------------------------------------------------------------

# Create general character property names from the first letters of the
# particular categories.

gcn_set = set(category_names[i][0] for i in range(0, len(category_names), 2))
general_category_names = sorted(gcn_set)


# ---------------------------------------------------------------------------
#                           FUNCTIONS
# ---------------------------------------------------------------------------

# Open an output file, using the command's argument or a default. Write common
# preliminary header information.

def open_output(default):
    """Open the output file, write the standard banner, and return the file.

    The file name is the script's single command-line argument when one is
    given, otherwise "default". The banner names the running script (the part
    of sys.argv[0] after the last '/') as the thing to edit instead of the
    generated file. The file is returned still open for writing; closing it
    is left to the caller.

    A message is printed and the script exits with status 1 if more than one
    argument is given or if the file cannot be opened.
    """
    argc = len(sys.argv)
    if argc > 2:
        print('** Too many arguments: just give a file name')
        sys.exit(1)
    output_name = sys.argv[1] if argc == 2 else default

    try:
        out = open(output_name, "w")
    except IOError:
        print ("** Couldn't open %s" % output_name)
        sys.exit(1)

    # Keep only what follows the final '/', or everything if there is none
    script_name = sys.argv[0].rpartition('/')[2]

    # NOTE(review): the banner text below is reproduced exactly as it appears
    # in the reviewed copy of this file, in which runs of spaces look to have
    # been collapsed. Confirm the column alignment against the repository
    # version before relying on it.
    banner_chunks = (
        """\
/*************************************************
* Perl-Compatible Regular Expressions *
*************************************************/

/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.

 Written by Philip Hazel
 Original API code Copyright (c) 1997-2012 University of Cambridge
 New API code Copyright (c) 2016-2022 University of Cambridge

This module is auto-generated from Unicode data files. DO NOT EDIT MANUALLY!
""",
        "Instead, modify the maint/%s script and run it to generate\n"
        "a new version of this code.\n\n" % script_name,
        """\
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 * Redistributions of source code must retain the above copyright notice,
 this list of conditions and the following disclaimer.

 * Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in the
 documentation and/or other materials provided with the distribution.

 * Neither the name of the University of Cambridge nor the names of its
 contributors may be used to endorse or promote products derived from
 this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
\n""",
    )
    for chunk in banner_chunks:
        out.write(chunk)

    return out

# End of GenerateCommon.py