diff options
author | IRIS YANG <irisykyang@google.com> | 2020-08-18 13:17:02 +0000 |
---|---|---|
committer | IRIS YANG <irisykyang@google.com> | 2020-08-18 13:31:16 +0000 |
commit | 3121357a0d62a6fe8c9fdcbfe5fd91f12b8f380d (patch) | |
tree | 2046b95d53a74b793dd54b7ea6e1b86724b93435 /scripts | |
parent | 81aec74062b5c629b3408f7f3d18343ec0bbcab8 (diff) | |
download | jinja-3121357a0d62a6fe8c9fdcbfe5fd91f12b8f380d.tar.gz |
Revert "Revert "Import external/python/jinja into master""
This reverts commit 81aec74062b5c629b3408f7f3d18343ec0bbcab8.
Reason for revert: It seems the Jinja folder is empty. Revert the revert to add the files back.
Third-party review: ag/11821018
Change-Id: I4429a3b3448cdf2eb62ec388392a2a29fa3dbc21
Diffstat (limited to 'scripts')
-rwxr-xr-x | scripts/generate_identifier_pattern.py | 74 |
1 files changed, 74 insertions, 0 deletions
"""Generate ``src/jinja2/_identifier.py``.

Source file: ``scripts/generate_identifier_pattern.py``.

The generated module holds a compiled regex matching ``\\w`` plus every
character that is valid inside a Python identifier but that ``\\w`` alone
does not match.
"""
import itertools
import os
import re
import sys

# Compiled once at import time: get_characters() tests every Unicode code
# point (``sys.maxunicode + 1`` of them), so the pattern is hoisted out of
# the loop.
_WORD_RE = re.compile(r"\w")


def get_characters():
    """Find every Unicode character that is valid in a Python `identifier`_ but
    is not matched by the regex ``\\w`` group.

    ``\\w`` matches some characters that aren't valid in identifiers, but
    :meth:`str.isidentifier` will catch that later in lexing.

    All start characters are valid continue characters, so we only test for
    continue characters.

    Yields:
        One-character strings, in ascending code point order.

    .. _identifier: https://docs.python.org/3/reference/lexical_analysis.html#identifiers
    """
    is_word = _WORD_RE.match

    for cp in range(sys.maxunicode + 1):
        s = chr(cp)

        # Prefixing "a" tests ``s`` as a *continue* character: the candidate
        # only has to be valid after an identifier start character.
        if ("a" + s).isidentifier() and not is_word(s):
            yield s


def collapse_ranges(data):
    """Given a sorted iterable of unique characters, generate ranges
    representing sequential code points.

    Source: https://stackoverflow.com/a/4629241/400617

    Args:
        data: Characters in ascending code point order, without duplicates.

    Yields:
        ``(first, last)`` tuples of characters, one per run of consecutive
        code points. A run of length one yields the same character twice.
    """
    # Within a run of consecutive code points, ``ord(char) - index`` stays
    # constant, so grouping on that difference splits the input into runs.
    for _, run in itertools.groupby(
        enumerate(data), key=lambda pair: ord(pair[1]) - pair[0]
    ):
        members = list(run)
        yield members[0][1], members[-1][1]


def build_pattern(ranges):
    """Output the regex pattern for ranges of characters.

    One and two character ranges output the individual characters.

    Args:
        ranges: Iterable of ``(first, last)`` character pairs, as produced
            by :func:`collapse_ranges`.

    Returns:
        The body of a regex character class (without the surrounding
        brackets). Characters are emitted verbatim, with no regex escaping.
    """
    out = []

    for a, b in ranges:
        if a == b:  # single char
            out.append(a)
        elif ord(b) - ord(a) == 1:  # two chars, range is redundant
            out.append(a)
            out.append(b)
        else:
            out.append(f"{a}-{b}")

    return "".join(out)


def main():
    """Build the regex pattern and write it to
    ``src/jinja2/_identifier.py``.

    The target path is resolved relative to the parent of this script's
    directory. ``open`` raises ``OSError`` if that location cannot be
    written to.
    """
    pattern = build_pattern(collapse_ranges(get_characters()))
    filename = os.path.abspath(
        os.path.join(os.path.dirname(__file__), "..", "src", "jinja2", "_identifier.py")
    )

    with open(filename, "w", encoding="utf8") as f:
        f.write("import re\n\n")
        f.write("# generated by scripts/generate_identifier_pattern.py\n")
        f.write("pattern = re.compile(\n")
        f.write(f'    r"[\\w{pattern}]+"  # noqa: B950\n')
        f.write(")\n")


if __name__ == "__main__":
    main()