aboutsummaryrefslogtreecommitdiff
path: root/maint/GenerateCommon.py
diff options
context:
space:
mode:
Diffstat (limited to 'maint/GenerateCommon.py')
-rw-r--r--maint/GenerateCommon.py355
1 files changed, 355 insertions, 0 deletions
diff --git a/maint/GenerateCommon.py b/maint/GenerateCommon.py
new file mode 100644
index 00000000..03f9ac55
--- /dev/null
+++ b/maint/GenerateCommon.py
@@ -0,0 +1,355 @@
+#! /usr/bin/python
+
+# PCRE2 UNICODE PROPERTY SUPPORT
+# ------------------------------
+
+# This file is a Python module containing common lists and functions for the
+# GenerateXXX scripts that create various.c and .h files from Unicode data
+# files. It was created as part of a re-organizaton of these scripts in
+# December 2021.
+
+
+import re
+
+
+# ---------------------------------------------------------------------------
+# DATA LISTS
+# ---------------------------------------------------------------------------
+
+# BIDI classes in the DerivedBidiClass.txt file, with comments.
+
+bidi_classes = [
+ 'AL', 'Arabic letter',
+ 'AN', 'Arabic number',
+ 'B', 'Paragraph separator',
+ 'BN', 'Boundary neutral',
+ 'CS', 'Common separator',
+ 'EN', 'European number',
+ 'ES', 'European separator',
+ 'ET', 'European terminator',
+ 'FSI', 'First strong isolate',
+ 'L', 'Left to right',
+ 'LRE', 'Left to right embedding',
+ 'LRI', 'Left to right isolate',
+ 'LRO', 'Left to right override',
+ 'NSM', 'Non-spacing mark',
+ 'ON', 'Other neutral',
+ 'PDF', 'Pop directional format',
+ 'PDI', 'Pop directional isolate',
+ 'R', 'Right to left',
+ 'RLE', 'Right to left embedding',
+ 'RLI', 'Right to left isolate',
+ 'RLO', 'Right to left override',
+ 'S', 'Segment separator',
+ 'WS', 'White space'
+ ]
+
+# Particular category property names, with comments. NOTE: If ever this list
+# is changed, the table called "catposstab" in the pcre2_auto_possess.c file
+# must be edited to keep in step.
+
+category_names = [
+ 'Cc', 'Control',
+ 'Cf', 'Format',
+ 'Cn', 'Unassigned',
+ 'Co', 'Private use',
+ 'Cs', 'Surrogate',
+ 'Ll', 'Lower case letter',
+ 'Lm', 'Modifier letter',
+ 'Lo', 'Other letter',
+ 'Lt', 'Title case letter',
+ 'Lu', 'Upper case letter',
+ 'Mc', 'Spacing mark',
+ 'Me', 'Enclosing mark',
+ 'Mn', 'Non-spacing mark',
+ 'Nd', 'Decimal number',
+ 'Nl', 'Letter number',
+ 'No', 'Other number',
+ 'Pc', 'Connector punctuation',
+ 'Pd', 'Dash punctuation',
+ 'Pe', 'Close punctuation',
+ 'Pf', 'Final punctuation',
+ 'Pi', 'Initial punctuation',
+ 'Po', 'Other punctuation',
+ 'Ps', 'Open punctuation',
+ 'Sc', 'Currency symbol',
+ 'Sk', 'Modifier symbol',
+ 'Sm', 'Mathematical symbol',
+ 'So', 'Other symbol',
+ 'Zl', 'Line separator',
+ 'Zp', 'Paragraph separator',
+ 'Zs', 'Space separator'
+ ]
+
+# The Extended_Pictographic property is not found in the file where all the
+# others are (GraphemeBreakProperty.txt). It comes from the emoji-data.txt
+# file, but we list it here so that the name has the correct index value.
+
+break_properties = [
+ 'CR', ' 0',
+ 'LF', ' 1',
+ 'Control', ' 2',
+ 'Extend', ' 3',
+ 'Prepend', ' 4',
+ 'SpacingMark', ' 5',
+ 'L', ' 6 Hangul syllable type L',
+ 'V', ' 7 Hangul syllable type V',
+ 'T', ' 8 Hangul syllable type T',
+ 'LV', ' 9 Hangul syllable type LV',
+ 'LVT', '10 Hangul syllable type LVT',
+ 'Regional_Indicator', '11',
+ 'Other', '12',
+ 'ZWJ', '13',
+ 'Extended_Pictographic', '14'
+ ]
+
+# List of files from which the names of Boolean properties are obtained, along
+# with a list of regex patterns for properties to be ignored, and a list of
+# extra pattern names to add.
+
+bool_propsfiles = ['PropList.txt', 'DerivedCoreProperties.txt', 'emoji-data.txt']
+bool_propsignore = [r'^Other_', r'^Hyphen$']
+bool_propsextras = ['ASCII', 'Bidi_Mirrored']
+
+
+# ---------------------------------------------------------------------------
+# GET BOOLEAN PROPERTY NAMES
+# ---------------------------------------------------------------------------
+
+# Get a list of Boolean property names from a number of files.
+
+def getbpropslist():
+ bplist = []
+ bplast = ""
+
+ for filename in bool_propsfiles:
+ try:
+ file = open('Unicode.tables/' + filename, 'r')
+ except IOError:
+ print(f"** Couldn't open {'Unicode.tables/' + filename}\n")
+ sys.exit(1)
+
+ for line in file:
+ line = re.sub(r'#.*', '', line)
+ data = list(map(str.strip, line.split(';')))
+ if len(data) <= 1 or data[1] == bplast:
+ continue
+ bplast = data[1]
+ for pat in bool_propsignore:
+ if re.match(pat, bplast) != None:
+ break
+ else:
+ bplist.append(bplast)
+
+ file.close()
+
+ bplist.extend(bool_propsextras)
+ bplist.sort()
+ return bplist
+
+bool_properties = getbpropslist()
+bool_props_list_item_size = (len(bool_properties) + 31) // 32
+
+
+
+# ---------------------------------------------------------------------------
+# COLLECTING PROPERTY NAMES AND ALIASES
+# ---------------------------------------------------------------------------
+
+script_names = ['Unknown']
+abbreviations = {}
+
+def collect_property_names():
+ global script_names
+ global abbreviations
+
+ names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_]+) #')
+
+ last_script_name = ""
+ with open("Unicode.tables/Scripts.txt") as f:
+ for line in f:
+ match_obj = names_re.match(line)
+
+ if match_obj == None or match_obj.group(1) == last_script_name:
+ continue
+
+ last_script_name = match_obj.group(1)
+ script_names.append(last_script_name)
+
+ # Sometimes there is comment in the line
+ # so splitting around semicolon is not enough
+ value_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_ ]+))?')
+
+ with open("Unicode.tables/PropertyValueAliases.txt") as f:
+ for line in f:
+ match_obj = value_alias_re.match(line)
+
+ if match_obj == None:
+ continue
+
+ if match_obj.group(1) == "sc":
+ if match_obj.group(2) == match_obj.group(3):
+ abbreviations[match_obj.group(3)] = ()
+ elif match_obj.group(4) == None:
+ abbreviations[match_obj.group(3)] = (match_obj.group(2),)
+ else:
+ abbreviations[match_obj.group(3)] = (match_obj.group(2), match_obj.group(4))
+
+ # We can also collect Boolean property abbreviations into the same dictionary
+
+ bin_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_]+))?')
+ with open("Unicode.tables/PropertyAliases.txt") as f:
+ for line in f:
+ match_obj = bin_alias_re.match(line)
+ if match_obj == None:
+ continue
+
+ if match_obj.group(2) in bool_properties:
+ if match_obj.group(3) == None:
+ abbreviations[match_obj.group(2)] = (match_obj.group(1),)
+ else:
+ abbreviations[match_obj.group(2)] = (match_obj.group(1), match_obj.group(3))
+
+collect_property_names()
+
+
+
+# ---------------------------------------------------------------------------
+# REORDERING SCRIPT NAMES
+# ---------------------------------------------------------------------------
+
+script_abbrevs = []
+
+def reorder_scripts():
+ global script_names
+ global script_abbrevs
+ global abbreviations
+
+ for name in script_names:
+ abbrevs = abbreviations[name]
+ script_abbrevs.append(name if len(abbrevs) == 0 else abbrevs[0])
+
+ extended_script_abbrevs = set()
+ with open("Unicode.tables/ScriptExtensions.txt") as f:
+ names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_ ]+) #')
+
+ for line in f:
+ match_obj = names_re.match(line)
+
+ if match_obj == None:
+ continue
+
+ for name in match_obj.group(1).split(" "):
+ extended_script_abbrevs.add(name)
+
+ new_script_names = []
+ new_script_abbrevs = []
+
+ for idx, abbrev in enumerate(script_abbrevs):
+ if abbrev in extended_script_abbrevs:
+ new_script_names.append(script_names[idx])
+ new_script_abbrevs.append(abbrev)
+
+ for idx, abbrev in enumerate(script_abbrevs):
+ if abbrev not in extended_script_abbrevs:
+ new_script_names.append(script_names[idx])
+ new_script_abbrevs.append(abbrev)
+
+ script_names = new_script_names
+ script_abbrevs = new_script_abbrevs
+
+reorder_scripts()
+script_list_item_size = (script_names.index('Unknown') + 31) // 32
+
+
+# ---------------------------------------------------------------------------
+# DERIVED LISTS
+# ---------------------------------------------------------------------------
+
+# Create general character property names from the first letters of the
+# particular categories.
+
+gcn_set = set(category_names[i][0] for i in range(0, len(category_names), 2))
+general_category_names = list(gcn_set)
+general_category_names.sort()
+
+
+# ---------------------------------------------------------------------------
+# FUNCTIONS
+# ---------------------------------------------------------------------------
+
+import sys
+
+# Open an output file, using the command's argument or a default. Write common
+# preliminary header information.
+
+def open_output(default):
+ if len(sys.argv) > 2:
+ print('** Too many arguments: just give a file name')
+ sys.exit(1)
+ if len(sys.argv) == 2:
+ output_name = sys.argv[1]
+ else:
+ output_name = default
+ try:
+ file = open(output_name, "w")
+ except IOError:
+ print ("** Couldn't open %s" % output_name)
+ sys.exit(1)
+
+ script_name = sys.argv[0]
+ i = script_name.rfind('/')
+ if i >= 0:
+ script_name = script_name[i+1:]
+
+ file.write("""\
+/*************************************************
+* Perl-Compatible Regular Expressions *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+ Written by Philip Hazel
+ Original API code Copyright (c) 1997-2012 University of Cambridge
+ New API code Copyright (c) 2016-2022 University of Cambridge
+
+This module is auto-generated from Unicode data files. DO NOT EDIT MANUALLY!
+""")
+
+ file.write("Instead, modify the maint/%s script and run it to generate\n"
+ "a new version of this code.\n\n" % script_name)
+
+ file.write("""\
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+\n""")
+ return file
+
+# End of UcpCommon.py