aboutsummaryrefslogtreecommitdiff
path: root/scripts/generate_identifier_pattern.py
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/generate_identifier_pattern.py')
-rwxr-xr-xscripts/generate_identifier_pattern.py74
1 files changed, 74 insertions, 0 deletions
diff --git a/scripts/generate_identifier_pattern.py b/scripts/generate_identifier_pattern.py
new file mode 100755
index 00000000..6b479535
--- /dev/null
+++ b/scripts/generate_identifier_pattern.py
@@ -0,0 +1,74 @@
+import itertools
+import os
+import re
+import sys
+
+
def get_characters():
    """Yield every Unicode character that is valid inside a Python
    `identifier`_ but is not matched by the regex ``\\w`` class.

    ``\\w`` also matches some characters that are not valid in identifiers;
    that is expected to be caught later with :meth:`str.isidentifier`.

    Every identifier start character is also a valid continue character, so
    only the continue position is tested, by appending the candidate to
    ``"a"``.

    :return: a generator of single-character strings, in ascending code
        point order.

    .. _identifier: https://docs.python.org/3/reference/lexical_analysis.html#identifiers
    """
    # Compile once and bind the matcher: the loop below runs for every code
    # point up to sys.maxunicode.
    match_word = re.compile(r"\w").match

    for cp in range(sys.maxunicode + 1):
        s = chr(cp)

        if ("a" + s).isidentifier() and not match_word(s):
            yield s
+
+
def collapse_ranges(data):
    """Collapse a sorted iterable of unique characters into runs of
    consecutive code points.

    Each run is yielded as a ``(first, last)`` tuple of characters; a
    character with no neighbours is yielded as ``(ch, ch)``.

    :param data: characters in ascending code point order.
    :return: a generator of ``(first, last)`` character pairs.
    """
    chars = iter(data)

    try:
        run_start = run_end = next(chars)
    except StopIteration:
        # Nothing to collapse.
        return

    for ch in chars:
        if ord(ch) - ord(run_end) == 1:
            # Still inside the current run of sequential code points.
            run_end = ch
            continue

        yield run_start, run_end
        run_start = run_end = ch

    yield run_start, run_end
+
+
def build_pattern(ranges):
    """Output the body of a regex character class for ranges of characters.

    One and two character ranges output the individual characters; longer
    ranges are written as ``first-last``. Characters that have a special
    meaning inside a character class are backslash-escaped so the result
    is always safe to place between ``[`` and ``]``.

    :param ranges: iterable of ``(first, last)`` character pairs, where
        ``first`` and ``last`` are single characters and ``first <= last``.
    :return: the concatenated pattern text, without surrounding brackets.
    """
    # Characters that would close, negate, escape, or form a range (or a
    # nested set) if emitted literally inside ``[...]``.
    special = frozenset("\\]^-[")

    def escape(ch):
        return "\\" + ch if ch in special else ch

    out = []

    for first, last in ranges:
        if first == last:  # single char
            out.append(escape(first))
        elif ord(last) - ord(first) == 1:  # two chars, range is redundant
            out.append(escape(first))
            out.append(escape(last))
        else:
            out.append(f"{escape(first)}-{escape(last)}")

    return "".join(out)
+
+
def main():
    """Generate the identifier regex and save it as a Python module.

    The pattern is computed from every code point, then written to
    ``src/jinja2/_identifier.py`` relative to the parent of this script's
    directory, replacing any existing file.
    """
    pattern = build_pattern(collapse_ranges(get_characters()))

    script_dir = os.path.dirname(__file__)
    target = os.path.abspath(
        os.path.join(script_dir, "..", "src", "jinja2", "_identifier.py")
    )

    # The generated module, one entry per emitted chunk of text.
    module_text = (
        "import re\n\n",
        "# generated by scripts/generate_identifier_pattern.py\n",
        "pattern = re.compile(\n",
        f' r"[\\w{pattern}]+" # noqa: B950\n',
        ")\n",
    )

    with open(target, "w", encoding="utf8") as out_file:
        out_file.writelines(module_text)
+
+
# Regenerate the pattern file only when run as a script, not when imported.
if __name__ == "__main__":
    main()