1 files changed, 355 insertions, 0 deletions
diff --git a/maint/GenerateCommon.py b/maint/GenerateCommon.py
new file mode 100644
index 00000000..03f9ac55
--- /dev/null
+++ b/maint/GenerateCommon.py
@@ -0,0 +1,355 @@
+#! /usr/bin/python
+
+#                   PCRE2 UNICODE PROPERTY SUPPORT
+#                   ------------------------------
+
+# This file is a Python module containing common lists and functions for the
+# GenerateXXX scripts that create various.c and .h files from Unicode data
+# files. It was created as part of a re-organizaton of these scripts in
+# December 2021.
+
+
+import re
+
+
+# ---------------------------------------------------------------------------
+#                             DATA LISTS
+# ---------------------------------------------------------------------------
+
+# BIDI classes in the DerivedBidiClass.txt file, with comments.
+
+bidi_classes = [
+  'AL',  'Arabic letter',
+  'AN',  'Arabic number',
+  'B',   'Paragraph separator',
+  'BN',  'Boundary neutral',
+  'CS',  'Common separator',
+  'EN',  'European number',
+  'ES',  'European separator',
+  'ET',  'European terminator',
+  'FSI', 'First strong isolate',
+  'L',   'Left to right',
+  'LRE', 'Left to right embedding',
+  'LRI', 'Left to right isolate',
+  'LRO', 'Left to right override',
+  'NSM', 'Non-spacing mark',
+  'ON',  'Other neutral',
+  'PDF', 'Pop directional format',
+  'PDI', 'Pop directional isolate',
+  'R',   'Right to left',
+  'RLE', 'Right to left embedding',
+  'RLI', 'Right to left isolate',
+  'RLO', 'Right to left override',
+  'S',   'Segment separator',
+  'WS',  'White space'
+  ]
+
+# Particular category property names, with comments. NOTE: If ever this list
+# is changed, the table called "catposstab" in the pcre2_auto_possess.c file
+# must be edited to keep in step.
+
+category_names = [
+  'Cc', 'Control',
+  'Cf', 'Format',
+  'Cn', 'Unassigned',
+  'Co', 'Private use',
+  'Cs', 'Surrogate',
+  'Ll', 'Lower case letter',
+  'Lm', 'Modifier letter',
+  'Lo', 'Other letter',
+  'Lt', 'Title case letter',
+  'Lu', 'Upper case letter',
+  'Mc', 'Spacing mark',
+  'Me', 'Enclosing mark',
+  'Mn', 'Non-spacing mark',
+  'Nd', 'Decimal number',
+  'Nl', 'Letter number',
+  'No', 'Other number',
+  'Pc', 'Connector punctuation',
+  'Pd', 'Dash punctuation',
+  'Pe', 'Close punctuation',
+  'Pf', 'Final punctuation',
+  'Pi', 'Initial punctuation',
+  'Po', 'Other punctuation',
+  'Ps', 'Open punctuation',
+  'Sc', 'Currency symbol',
+  'Sk', 'Modifier symbol',
+  'Sm', 'Mathematical symbol',
+  'So', 'Other symbol',
+  'Zl', 'Line separator',
+  'Zp', 'Paragraph separator',
+  'Zs', 'Space separator'
+  ]
+
+# The Extended_Pictographic property is not found in the file where all the
+# others are (GraphemeBreakProperty.txt). It comes from the emoji-data.txt
+# file, but we list it here so that the name has the correct index value.
+
+break_properties = [
+  'CR',                    ' 0',
+  'LF',                    ' 1',
+  'Control',               ' 2',
+  'Extend',                ' 3',
+  'Prepend',               ' 4',
+  'SpacingMark',           ' 5',
+  'L',                     ' 6 Hangul syllable type L',
+  'V',                     ' 7 Hangul syllable type V',
+  'T',                     ' 8 Hangul syllable type T',
+  'LV',                    ' 9 Hangul syllable type LV',
+  'LVT',                   '10 Hangul syllable type LVT',
+  'Regional_Indicator',    '11',
+  'Other',                 '12',
+  'ZWJ',                   '13',
+  'Extended_Pictographic', '14'
+  ]
+
+# List of files from which the names of Boolean properties are obtained, along
+# with a list of regex patterns for properties to be ignored, and a list of
+# extra pattern names to add.
+
+bool_propsfiles = ['PropList.txt', 'DerivedCoreProperties.txt', 'emoji-data.txt']
+bool_propsignore = [r'^Other_', r'^Hyphen$']
+bool_propsextras = ['ASCII', 'Bidi_Mirrored']
+
+
+# ---------------------------------------------------------------------------
+#                   GET BOOLEAN PROPERTY NAMES
+# ---------------------------------------------------------------------------
+
+# Get a list of Boolean property names from a number of files.
+
+def getbpropslist():
+  bplist = []
+  bplast = ""
+
+  for filename in bool_propsfiles:
+    try:
+      file = open('Unicode.tables/' + filename, 'r')
+    except IOError:
+      print(f"** Couldn't open {'Unicode.tables/' + filename}\n")
+      sys.exit(1)
+
+    for line in file:
+      line = re.sub(r'#.*', '', line)
+      data = list(map(str.strip, line.split(';')))
+      if len(data) <= 1 or data[1] == bplast:
+        continue
+      bplast = data[1]
+      for pat in bool_propsignore:
+        if re.match(pat, bplast) != None:
+          break
+      else:
+        bplist.append(bplast)
+
+    file.close()
+
+  bplist.extend(bool_propsextras)
+  bplist.sort()
+  return bplist
+
+bool_properties = getbpropslist()
+bool_props_list_item_size = (len(bool_properties) + 31) // 32
+
+
+
+# ---------------------------------------------------------------------------
+#                  COLLECTING PROPERTY NAMES AND ALIASES
+# ---------------------------------------------------------------------------
+
+script_names = ['Unknown']
+abbreviations = {}
+
+def collect_property_names():
+  global script_names
+  global abbreviations
+
+  names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_]+) #')
+
+  last_script_name = ""
+  with open("Unicode.tables/Scripts.txt") as f:
+    for line in f:
+      match_obj = names_re.match(line)
+
+      if match_obj == None or match_obj.group(1) == last_script_name:
+        continue
+
+      last_script_name = match_obj.group(1)
+      script_names.append(last_script_name)
+
+  # Sometimes there is comment in the line
+  # so splitting around semicolon is not enough
+  value_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_ ]+))?')
+
+  with open("Unicode.tables/PropertyValueAliases.txt") as f:
+    for line in f:
+      match_obj = value_alias_re.match(line)
+
+      if match_obj == None:
+        continue
+
+      if match_obj.group(1) == "sc":
+        if match_obj.group(2) == match_obj.group(3):
+          abbreviations[match_obj.group(3)] = ()
+        elif match_obj.group(4) == None:
+          abbreviations[match_obj.group(3)] = (match_obj.group(2),)
+        else:
+          abbreviations[match_obj.group(3)] = (match_obj.group(2), match_obj.group(4))
+
+  # We can also collect Boolean property abbreviations into the same dictionary
+
+  bin_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_]+))?')
+  with open("Unicode.tables/PropertyAliases.txt") as f:
+    for line in f:
+      match_obj = bin_alias_re.match(line)
+      if match_obj == None:
+        continue
+
+      if match_obj.group(2) in bool_properties:
+        if match_obj.group(3) == None:
+          abbreviations[match_obj.group(2)] = (match_obj.group(1),)
+        else:
+          abbreviations[match_obj.group(2)] = (match_obj.group(1), match_obj.group(3))
+
+collect_property_names()
+
+
+
+# ---------------------------------------------------------------------------
+#                      REORDERING SCRIPT NAMES
+# ---------------------------------------------------------------------------
+
+script_abbrevs = []
+
+def reorder_scripts():
+  global script_names
+  global script_abbrevs
+  global abbreviations
+
+  for name in script_names:
+    abbrevs = abbreviations[name]
+    script_abbrevs.append(name if len(abbrevs) == 0 else abbrevs[0])
+
+  extended_script_abbrevs = set()
+  with open("Unicode.tables/ScriptExtensions.txt") as f:
+    names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_ ]+) #')
+
+    for line in f:
+      match_obj = names_re.match(line)
+
+      if match_obj == None:
+        continue
+
+      for name in match_obj.group(1).split(" "):
+        extended_script_abbrevs.add(name)
+
+  new_script_names = []
+  new_script_abbrevs = []
+
+  for idx, abbrev in enumerate(script_abbrevs):
+    if abbrev in extended_script_abbrevs:
+      new_script_names.append(script_names[idx])
+      new_script_abbrevs.append(abbrev)
+
+  for idx, abbrev in enumerate(script_abbrevs):
+    if abbrev not in extended_script_abbrevs:
+      new_script_names.append(script_names[idx])
+      new_script_abbrevs.append(abbrev)
+
+  script_names = new_script_names
+  script_abbrevs = new_script_abbrevs
+
+reorder_scripts()
+script_list_item_size = (script_names.index('Unknown') + 31) // 32
+
+
+# ---------------------------------------------------------------------------
+#                         DERIVED LISTS
+# ---------------------------------------------------------------------------
+
+# Create general character property names from the first letters of the
+# particular categories.
+
+gcn_set = set(category_names[i][0] for i in range(0, len(category_names), 2))
+general_category_names = list(gcn_set)
+general_category_names.sort()
+
+
+# ---------------------------------------------------------------------------
+#                           FUNCTIONS
+# ---------------------------------------------------------------------------
+
+import sys
+
+# Open an output file, using the command's argument or a default. Write common
+# preliminary header information.
+
+def open_output(default):
+  if len(sys.argv) > 2:
+    print('** Too many arguments: just give a file name')
+    sys.exit(1)
+  if len(sys.argv) == 2:
+    output_name = sys.argv[1]
+  else:
+    output_name = default
+  try:
+    file = open(output_name, "w")
+  except IOError:
+    print ("** Couldn't open %s" % output_name)
+    sys.exit(1)
+
+  script_name = sys.argv[0]
+  i = script_name.rfind('/')
+  if i >= 0:
+    script_name = script_name[i+1:]
+
+  file.write("""\
+/*************************************************
+*      Perl-Compatible Regular Expressions       *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+                       Written by Philip Hazel
+     Original API code Copyright (c) 1997-2012 University of Cambridge
+          New API code Copyright (c) 2016-2022 University of Cambridge
+
+This module is auto-generated from Unicode data files. DO NOT EDIT MANUALLY!
+""")
+
+  file.write("Instead, modify the maint/%s script and run it to generate\n"
+  "a new version of this code.\n\n" % script_name)
+
+  file.write("""\
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+\n""")
+  return file
+
+# End of UcpCommon.py