Diffstat (limited to 'lib/python2.7/lib2to3/pgen2')
-rw-r--r-- | lib/python2.7/lib2to3/pgen2/__init__.py | 4
-rw-r--r-- | lib/python2.7/lib2to3/pgen2/conv.py | 257
-rw-r--r-- | lib/python2.7/lib2to3/pgen2/driver.py | 157
-rw-r--r-- | lib/python2.7/lib2to3/pgen2/grammar.py | 184
-rw-r--r-- | lib/python2.7/lib2to3/pgen2/literals.py | 60
-rw-r--r-- | lib/python2.7/lib2to3/pgen2/parse.py | 201
-rw-r--r-- | lib/python2.7/lib2to3/pgen2/pgen.py | 386
-rwxr-xr-x | lib/python2.7/lib2to3/pgen2/token.py | 82
-rw-r--r-- | lib/python2.7/lib2to3/pgen2/tokenize.py | 500
9 files changed, 0 insertions, 1831 deletions
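The files removed below make up lib2to3's bundled parser generator (pgen2): grammar-table conversion and generation (conv.py, pgen.py, grammar.py), the parse driver and engine (driver.py, parse.py), and a pure-Python tokenizer (token.py, tokenize.py). For orientation, a minimal sketch of how the package is typically driven from lib2to3 follows; it assumes Python 2.7 with the stock lib2to3 package still installed, and uses pygram and pytree, which are lib2to3 companions outside this diff.

```python
# Minimal sketch (assumes Python 2.7 with lib2to3 on the path).
# pygram.python_grammar is the pre-built Grammar for the Python language;
# pytree.convert turns raw (type, value, context, children) tuples into
# lib2to3 Node/Leaf objects.
from lib2to3 import pygram, pytree
from lib2to3.pgen2 import driver

d = driver.Driver(pygram.python_grammar, convert=pytree.convert)
tree = d.parse_string("x = 1\n")  # parse_string expects a trailing newline
print repr(tree)                  # lib2to3 syntax tree for the assignment
```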
diff --git a/lib/python2.7/lib2to3/pgen2/__init__.py b/lib/python2.7/lib2to3/pgen2/__init__.py deleted file mode 100644 index af39048..0000000 --- a/lib/python2.7/lib2to3/pgen2/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. -# Licensed to PSF under a Contributor Agreement. - -"""The pgen2 package.""" diff --git a/lib/python2.7/lib2to3/pgen2/conv.py b/lib/python2.7/lib2to3/pgen2/conv.py deleted file mode 100644 index 28fbb0b..0000000 --- a/lib/python2.7/lib2to3/pgen2/conv.py +++ /dev/null @@ -1,257 +0,0 @@ -# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. -# Licensed to PSF under a Contributor Agreement. - -"""Convert graminit.[ch] spit out by pgen to Python code. - -Pgen is the Python parser generator. It is useful to quickly create a -parser from a grammar file in Python's grammar notation. But I don't -want my parsers to be written in C (yet), so I'm translating the -parsing tables to Python data structures and writing a Python parse -engine. - -Note that the token numbers are constants determined by the standard -Python tokenizer. The standard token module defines these numbers and -their names (the names are not used much). The token numbers are -hardcoded into the Python tokenizer and into pgen. A Python -implementation of the Python tokenizer is also available, in the -standard tokenize module. - -On the other hand, symbol numbers (representing the grammar's -non-terminals) are assigned by pgen based on the actual grammar -input. - -Note: this module is pretty much obsolete; the pgen module generates -equivalent grammar tables directly from the Grammar.txt input file -without having to invoke the Python pgen C program. - -""" - -# Python imports -import re - -# Local imports -from pgen2 import grammar, token - - -class Converter(grammar.Grammar): - """Grammar subclass that reads classic pgen output files. - - The run() method reads the tables as produced by the pgen parser - generator, typically contained in two C files, graminit.h and - graminit.c. The other methods are for internal use only. - - See the base class for more documentation. - - """ - - def run(self, graminit_h, graminit_c): - """Load the grammar tables from the text files written by pgen.""" - self.parse_graminit_h(graminit_h) - self.parse_graminit_c(graminit_c) - self.finish_off() - - def parse_graminit_h(self, filename): - """Parse the .h file written by pgen. (Internal) - - This file is a sequence of #define statements defining the - nonterminals of the grammar as numbers. We build two tables - mapping the numbers to names and back. - - """ - try: - f = open(filename) - except IOError, err: - print "Can't open %s: %s" % (filename, err) - return False - self.symbol2number = {} - self.number2symbol = {} - lineno = 0 - for line in f: - lineno += 1 - mo = re.match(r"^#define\s+(\w+)\s+(\d+)$", line) - if not mo and line.strip(): - print "%s(%s): can't parse %s" % (filename, lineno, - line.strip()) - else: - symbol, number = mo.groups() - number = int(number) - assert symbol not in self.symbol2number - assert number not in self.number2symbol - self.symbol2number[symbol] = number - self.number2symbol[number] = symbol - return True - - def parse_graminit_c(self, filename): - """Parse the .c file written by pgen. (Internal) - - The file looks as follows. 
The first two lines are always this: - - #include "pgenheaders.h" - #include "grammar.h" - - After that come four blocks: - - 1) one or more state definitions - 2) a table defining dfas - 3) a table defining labels - 4) a struct defining the grammar - - A state definition has the following form: - - one or more arc arrays, each of the form: - static arc arcs_<n>_<m>[<k>] = { - {<i>, <j>}, - ... - }; - - followed by a state array, of the form: - static state states_<s>[<t>] = { - {<k>, arcs_<n>_<m>}, - ... - }; - - """ - try: - f = open(filename) - except IOError, err: - print "Can't open %s: %s" % (filename, err) - return False - # The code below essentially uses f's iterator-ness! - lineno = 0 - - # Expect the two #include lines - lineno, line = lineno+1, f.next() - assert line == '#include "pgenheaders.h"\n', (lineno, line) - lineno, line = lineno+1, f.next() - assert line == '#include "grammar.h"\n', (lineno, line) - - # Parse the state definitions - lineno, line = lineno+1, f.next() - allarcs = {} - states = [] - while line.startswith("static arc "): - while line.startswith("static arc "): - mo = re.match(r"static arc arcs_(\d+)_(\d+)\[(\d+)\] = {$", - line) - assert mo, (lineno, line) - n, m, k = map(int, mo.groups()) - arcs = [] - for _ in range(k): - lineno, line = lineno+1, f.next() - mo = re.match(r"\s+{(\d+), (\d+)},$", line) - assert mo, (lineno, line) - i, j = map(int, mo.groups()) - arcs.append((i, j)) - lineno, line = lineno+1, f.next() - assert line == "};\n", (lineno, line) - allarcs[(n, m)] = arcs - lineno, line = lineno+1, f.next() - mo = re.match(r"static state states_(\d+)\[(\d+)\] = {$", line) - assert mo, (lineno, line) - s, t = map(int, mo.groups()) - assert s == len(states), (lineno, line) - state = [] - for _ in range(t): - lineno, line = lineno+1, f.next() - mo = re.match(r"\s+{(\d+), arcs_(\d+)_(\d+)},$", line) - assert mo, (lineno, line) - k, n, m = map(int, mo.groups()) - arcs = allarcs[n, m] - assert k == len(arcs), (lineno, line) - state.append(arcs) - states.append(state) - lineno, line = lineno+1, f.next() - assert line == "};\n", (lineno, line) - lineno, line = lineno+1, f.next() - self.states = states - - # Parse the dfas - dfas = {} - mo = re.match(r"static dfa dfas\[(\d+)\] = {$", line) - assert mo, (lineno, line) - ndfas = int(mo.group(1)) - for i in range(ndfas): - lineno, line = lineno+1, f.next() - mo = re.match(r'\s+{(\d+), "(\w+)", (\d+), (\d+), states_(\d+),$', - line) - assert mo, (lineno, line) - symbol = mo.group(2) - number, x, y, z = map(int, mo.group(1, 3, 4, 5)) - assert self.symbol2number[symbol] == number, (lineno, line) - assert self.number2symbol[number] == symbol, (lineno, line) - assert x == 0, (lineno, line) - state = states[z] - assert y == len(state), (lineno, line) - lineno, line = lineno+1, f.next() - mo = re.match(r'\s+("(?:\\\d\d\d)*")},$', line) - assert mo, (lineno, line) - first = {} - rawbitset = eval(mo.group(1)) - for i, c in enumerate(rawbitset): - byte = ord(c) - for j in range(8): - if byte & (1<<j): - first[i*8 + j] = 1 - dfas[number] = (state, first) - lineno, line = lineno+1, f.next() - assert line == "};\n", (lineno, line) - self.dfas = dfas - - # Parse the labels - labels = [] - lineno, line = lineno+1, f.next() - mo = re.match(r"static label labels\[(\d+)\] = {$", line) - assert mo, (lineno, line) - nlabels = int(mo.group(1)) - for i in range(nlabels): - lineno, line = lineno+1, f.next() - mo = re.match(r'\s+{(\d+), (0|"\w+")},$', line) - assert mo, (lineno, line) - x, y = mo.groups() - x = int(x) - if y == "0": 
- y = None - else: - y = eval(y) - labels.append((x, y)) - lineno, line = lineno+1, f.next() - assert line == "};\n", (lineno, line) - self.labels = labels - - # Parse the grammar struct - lineno, line = lineno+1, f.next() - assert line == "grammar _PyParser_Grammar = {\n", (lineno, line) - lineno, line = lineno+1, f.next() - mo = re.match(r"\s+(\d+),$", line) - assert mo, (lineno, line) - ndfas = int(mo.group(1)) - assert ndfas == len(self.dfas) - lineno, line = lineno+1, f.next() - assert line == "\tdfas,\n", (lineno, line) - lineno, line = lineno+1, f.next() - mo = re.match(r"\s+{(\d+), labels},$", line) - assert mo, (lineno, line) - nlabels = int(mo.group(1)) - assert nlabels == len(self.labels), (lineno, line) - lineno, line = lineno+1, f.next() - mo = re.match(r"\s+(\d+)$", line) - assert mo, (lineno, line) - start = int(mo.group(1)) - assert start in self.number2symbol, (lineno, line) - self.start = start - lineno, line = lineno+1, f.next() - assert line == "};\n", (lineno, line) - try: - lineno, line = lineno+1, f.next() - except StopIteration: - pass - else: - assert 0, (lineno, line) - - def finish_off(self): - """Create additional useful structures. (Internal).""" - self.keywords = {} # map from keyword strings to arc labels - self.tokens = {} # map from numeric token values to arc labels - for ilabel, (type, value) in enumerate(self.labels): - if type == token.NAME and value is not None: - self.keywords[value] = ilabel - elif value is None: - self.tokens[type] = ilabel diff --git a/lib/python2.7/lib2to3/pgen2/driver.py b/lib/python2.7/lib2to3/pgen2/driver.py deleted file mode 100644 index 39dafb9..0000000 --- a/lib/python2.7/lib2to3/pgen2/driver.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. -# Licensed to PSF under a Contributor Agreement. - -# Modifications: -# Copyright 2006 Google, Inc. All Rights Reserved. -# Licensed to PSF under a Contributor Agreement. - -"""Parser driver. - -This provides a high-level interface to parse a file into a syntax tree. - -""" - -__author__ = "Guido van Rossum <guido@python.org>" - -__all__ = ["Driver", "load_grammar"] - -# Python imports -import codecs -import os -import logging -import StringIO -import sys - -# Pgen imports -from . import grammar, parse, token, tokenize, pgen - - -class Driver(object): - - def __init__(self, grammar, convert=None, logger=None): - self.grammar = grammar - if logger is None: - logger = logging.getLogger() - self.logger = logger - self.convert = convert - - def parse_tokens(self, tokens, debug=False): - """Parse a series of tokens and return the syntax tree.""" - # XXX Move the prefix computation into a wrapper around tokenize. 
- p = parse.Parser(self.grammar, self.convert) - p.setup() - lineno = 1 - column = 0 - type = value = start = end = line_text = None - prefix = u"" - for quintuple in tokens: - type, value, start, end, line_text = quintuple - if start != (lineno, column): - assert (lineno, column) <= start, ((lineno, column), start) - s_lineno, s_column = start - if lineno < s_lineno: - prefix += "\n" * (s_lineno - lineno) - lineno = s_lineno - column = 0 - if column < s_column: - prefix += line_text[column:s_column] - column = s_column - if type in (tokenize.COMMENT, tokenize.NL): - prefix += value - lineno, column = end - if value.endswith("\n"): - lineno += 1 - column = 0 - continue - if type == token.OP: - type = grammar.opmap[value] - if debug: - self.logger.debug("%s %r (prefix=%r)", - token.tok_name[type], value, prefix) - if p.addtoken(type, value, (prefix, start)): - if debug: - self.logger.debug("Stop.") - break - prefix = "" - lineno, column = end - if value.endswith("\n"): - lineno += 1 - column = 0 - else: - # We never broke out -- EOF is too soon (how can this happen???) - raise parse.ParseError("incomplete input", - type, value, (prefix, start)) - return p.rootnode - - def parse_stream_raw(self, stream, debug=False): - """Parse a stream and return the syntax tree.""" - tokens = tokenize.generate_tokens(stream.readline) - return self.parse_tokens(tokens, debug) - - def parse_stream(self, stream, debug=False): - """Parse a stream and return the syntax tree.""" - return self.parse_stream_raw(stream, debug) - - def parse_file(self, filename, encoding=None, debug=False): - """Parse a file and return the syntax tree.""" - stream = codecs.open(filename, "r", encoding) - try: - return self.parse_stream(stream, debug) - finally: - stream.close() - - def parse_string(self, text, debug=False): - """Parse a string and return the syntax tree.""" - tokens = tokenize.generate_tokens(StringIO.StringIO(text).readline) - return self.parse_tokens(tokens, debug) - - -def load_grammar(gt="Grammar.txt", gp=None, - save=True, force=False, logger=None): - """Load the grammar (maybe from a pickle).""" - if logger is None: - logger = logging.getLogger() - if gp is None: - head, tail = os.path.splitext(gt) - if tail == ".txt": - tail = "" - gp = head + tail + ".".join(map(str, sys.version_info)) + ".pickle" - if force or not _newer(gp, gt): - logger.info("Generating grammar tables from %s", gt) - g = pgen.generate_grammar(gt) - if save: - logger.info("Writing grammar tables to %s", gp) - try: - g.dump(gp) - except IOError, e: - logger.info("Writing failed:"+str(e)) - else: - g = grammar.Grammar() - g.load(gp) - return g - - -def _newer(a, b): - """Inquire whether file a was written since file b.""" - if not os.path.exists(a): - return False - if not os.path.exists(b): - return True - return os.path.getmtime(a) >= os.path.getmtime(b) - - -def main(*args): - """Main program, when run as a script: produce grammar pickle files. - - Calls load_grammar for each argument, a path to a grammar text file. - """ - if not args: - args = sys.argv[1:] - logging.basicConfig(level=logging.INFO, stream=sys.stdout, - format='%(message)s') - for gt in args: - load_grammar(gt, save=True, force=True) - return True - -if __name__ == "__main__": - sys.exit(int(not main())) diff --git a/lib/python2.7/lib2to3/pgen2/grammar.py b/lib/python2.7/lib2to3/pgen2/grammar.py deleted file mode 100644 index 1aa5c43..0000000 --- a/lib/python2.7/lib2to3/pgen2/grammar.py +++ /dev/null @@ -1,184 +0,0 @@ -# Copyright 2004-2005 Elemental Security, Inc. 
All Rights Reserved. -# Licensed to PSF under a Contributor Agreement. - -"""This module defines the data structures used to represent a grammar. - -These are a bit arcane because they are derived from the data -structures used by Python's 'pgen' parser generator. - -There's also a table here mapping operators to their names in the -token module; the Python tokenize module reports all operators as the -fallback token code OP, but the parser needs the actual token code. - -""" - -# Python imports -import pickle - -# Local imports -from . import token, tokenize - - -class Grammar(object): - """Pgen parsing tables conversion class. - - Once initialized, this class supplies the grammar tables for the - parsing engine implemented by parse.py. The parsing engine - accesses the instance variables directly. The class here does not - provide initialization of the tables; several subclasses exist to - do this (see the conv and pgen modules). - - The load() method reads the tables from a pickle file, which is - much faster than the other ways offered by subclasses. The pickle - file is written by calling dump() (after loading the grammar - tables using a subclass). The report() method prints a readable - representation of the tables to stdout, for debugging. - - The instance variables are as follows: - - symbol2number -- a dict mapping symbol names to numbers. Symbol - numbers are always 256 or higher, to distinguish - them from token numbers, which are between 0 and - 255 (inclusive). - - number2symbol -- a dict mapping numbers to symbol names; - these two are each other's inverse. - - states -- a list of DFAs, where each DFA is a list of - states, each state is a list of arcs, and each - arc is a (i, j) pair where i is a label and j is - a state number. The DFA number is the index into - this list. (This name is slightly confusing.) - Final states are represented by a special arc of - the form (0, j) where j is its own state number. - - dfas -- a dict mapping symbol numbers to (DFA, first) - pairs, where DFA is an item from the states list - above, and first is a set of tokens that can - begin this grammar rule (represented by a dict - whose values are always 1). - - labels -- a list of (x, y) pairs where x is either a token - number or a symbol number, and y is either None - or a string; the strings are keywords. The label - number is the index in this list; label numbers - are used to mark state transitions (arcs) in the - DFAs. - - start -- the number of the grammar's start symbol. - - keywords -- a dict mapping keyword strings to arc labels. - - tokens -- a dict mapping token numbers to arc labels. - - """ - - def __init__(self): - self.symbol2number = {} - self.number2symbol = {} - self.states = [] - self.dfas = {} - self.labels = [(0, "EMPTY")] - self.keywords = {} - self.tokens = {} - self.symbol2label = {} - self.start = 256 - - def dump(self, filename): - """Dump the grammar tables to a pickle file.""" - f = open(filename, "wb") - pickle.dump(self.__dict__, f, 2) - f.close() - - def load(self, filename): - """Load the grammar tables from a pickle file.""" - f = open(filename, "rb") - d = pickle.load(f) - f.close() - self.__dict__.update(d) - - def copy(self): - """ - Copy the grammar. 
- """ - new = self.__class__() - for dict_attr in ("symbol2number", "number2symbol", "dfas", "keywords", - "tokens", "symbol2label"): - setattr(new, dict_attr, getattr(self, dict_attr).copy()) - new.labels = self.labels[:] - new.states = self.states[:] - new.start = self.start - return new - - def report(self): - """Dump the grammar tables to standard output, for debugging.""" - from pprint import pprint - print "s2n" - pprint(self.symbol2number) - print "n2s" - pprint(self.number2symbol) - print "states" - pprint(self.states) - print "dfas" - pprint(self.dfas) - print "labels" - pprint(self.labels) - print "start", self.start - - -# Map from operator to number (since tokenize doesn't do this) - -opmap_raw = """ -( LPAR -) RPAR -[ LSQB -] RSQB -: COLON -, COMMA -; SEMI -+ PLUS -- MINUS -* STAR -/ SLASH -| VBAR -& AMPER -< LESS -> GREATER -= EQUAL -. DOT -% PERCENT -` BACKQUOTE -{ LBRACE -} RBRACE -@ AT -== EQEQUAL -!= NOTEQUAL -<> NOTEQUAL -<= LESSEQUAL ->= GREATEREQUAL -~ TILDE -^ CIRCUMFLEX -<< LEFTSHIFT ->> RIGHTSHIFT -** DOUBLESTAR -+= PLUSEQUAL --= MINEQUAL -*= STAREQUAL -/= SLASHEQUAL -%= PERCENTEQUAL -&= AMPEREQUAL -|= VBAREQUAL -^= CIRCUMFLEXEQUAL -<<= LEFTSHIFTEQUAL ->>= RIGHTSHIFTEQUAL -**= DOUBLESTAREQUAL -// DOUBLESLASH -//= DOUBLESLASHEQUAL --> RARROW -""" - -opmap = {} -for line in opmap_raw.splitlines(): - if line: - op, name = line.split() - opmap[op] = getattr(token, name) diff --git a/lib/python2.7/lib2to3/pgen2/literals.py b/lib/python2.7/lib2to3/pgen2/literals.py deleted file mode 100644 index 0b3948a..0000000 --- a/lib/python2.7/lib2to3/pgen2/literals.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. -# Licensed to PSF under a Contributor Agreement. - -"""Safely evaluate Python string literals without using eval().""" - -import re - -simple_escapes = {"a": "\a", - "b": "\b", - "f": "\f", - "n": "\n", - "r": "\r", - "t": "\t", - "v": "\v", - "'": "'", - '"': '"', - "\\": "\\"} - -def escape(m): - all, tail = m.group(0, 1) - assert all.startswith("\\") - esc = simple_escapes.get(tail) - if esc is not None: - return esc - if tail.startswith("x"): - hexes = tail[1:] - if len(hexes) < 2: - raise ValueError("invalid hex string escape ('\\%s')" % tail) - try: - i = int(hexes, 16) - except ValueError: - raise ValueError("invalid hex string escape ('\\%s')" % tail) - else: - try: - i = int(tail, 8) - except ValueError: - raise ValueError("invalid octal string escape ('\\%s')" % tail) - return chr(i) - -def evalString(s): - assert s.startswith("'") or s.startswith('"'), repr(s[:1]) - q = s[0] - if s[:3] == q*3: - q = q*3 - assert s.endswith(q), repr(s[-len(q):]) - assert len(s) >= 2*len(q) - s = s[len(q):-len(q)] - return re.sub(r"\\(\'|\"|\\|[abfnrtv]|x.{0,2}|[0-7]{1,3})", escape, s) - -def test(): - for i in range(256): - c = chr(i) - s = repr(c) - e = evalString(s) - if e != c: - print i, c, s, e - - -if __name__ == "__main__": - test() diff --git a/lib/python2.7/lib2to3/pgen2/parse.py b/lib/python2.7/lib2to3/pgen2/parse.py deleted file mode 100644 index 6bebdbb..0000000 --- a/lib/python2.7/lib2to3/pgen2/parse.py +++ /dev/null @@ -1,201 +0,0 @@ -# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. -# Licensed to PSF under a Contributor Agreement. - -"""Parser engine for the grammar tables generated by pgen. - -The grammar table must be loaded first. - -See Parser/parser.c in the Python distribution for additional info on -how this parsing engine works. - -""" - -# Local imports -from . 
import token - -class ParseError(Exception): - """Exception to signal the parser is stuck.""" - - def __init__(self, msg, type, value, context): - Exception.__init__(self, "%s: type=%r, value=%r, context=%r" % - (msg, type, value, context)) - self.msg = msg - self.type = type - self.value = value - self.context = context - -class Parser(object): - """Parser engine. - - The proper usage sequence is: - - p = Parser(grammar, [converter]) # create instance - p.setup([start]) # prepare for parsing - <for each input token>: - if p.addtoken(...): # parse a token; may raise ParseError - break - root = p.rootnode # root of abstract syntax tree - - A Parser instance may be reused by calling setup() repeatedly. - - A Parser instance contains state pertaining to the current token - sequence, and should not be used concurrently by different threads - to parse separate token sequences. - - See driver.py for how to get input tokens by tokenizing a file or - string. - - Parsing is complete when addtoken() returns True; the root of the - abstract syntax tree can then be retrieved from the rootnode - instance variable. When a syntax error occurs, addtoken() raises - the ParseError exception. There is no error recovery; the parser - cannot be used after a syntax error was reported (but it can be - reinitialized by calling setup()). - - """ - - def __init__(self, grammar, convert=None): - """Constructor. - - The grammar argument is a grammar.Grammar instance; see the - grammar module for more information. - - The parser is not ready yet for parsing; you must call the - setup() method to get it started. - - The optional convert argument is a function mapping concrete - syntax tree nodes to abstract syntax tree nodes. If not - given, no conversion is done and the syntax tree produced is - the concrete syntax tree. If given, it must be a function of - two arguments, the first being the grammar (a grammar.Grammar - instance), and the second being the concrete syntax tree node - to be converted. The syntax tree is converted from the bottom - up. - - A concrete syntax tree node is a (type, value, context, nodes) - tuple, where type is the node type (a token or symbol number), - value is None for symbols and a string for tokens, context is - None or an opaque value used for error reporting (typically a - (lineno, offset) pair), and nodes is a list of children for - symbols, and None for tokens. - - An abstract syntax tree node may be anything; this is entirely - up to the converter function. - - """ - self.grammar = grammar - self.convert = convert or (lambda grammar, node: node) - - def setup(self, start=None): - """Prepare for parsing. - - This *must* be called before starting to parse. - - The optional argument is an alternative start symbol; it - defaults to the grammar's start symbol. - - You can use a Parser instance to parse any number of programs; - each time you call setup() the parser is reset to an initial - state determined by the (implicit or explicit) start symbol. - - """ - if start is None: - start = self.grammar.start - # Each stack entry is a tuple: (dfa, state, node). - # A node is a tuple: (type, value, context, children), - # where children is a list of nodes or None, and context may be None. 
- newnode = (start, None, None, []) - stackentry = (self.grammar.dfas[start], 0, newnode) - self.stack = [stackentry] - self.rootnode = None - self.used_names = set() # Aliased to self.rootnode.used_names in pop() - - def addtoken(self, type, value, context): - """Add a token; return True iff this is the end of the program.""" - # Map from token to label - ilabel = self.classify(type, value, context) - # Loop until the token is shifted; may raise exceptions - while True: - dfa, state, node = self.stack[-1] - states, first = dfa - arcs = states[state] - # Look for a state with this label - for i, newstate in arcs: - t, v = self.grammar.labels[i] - if ilabel == i: - # Look it up in the list of labels - assert t < 256 - # Shift a token; we're done with it - self.shift(type, value, newstate, context) - # Pop while we are in an accept-only state - state = newstate - while states[state] == [(0, state)]: - self.pop() - if not self.stack: - # Done parsing! - return True - dfa, state, node = self.stack[-1] - states, first = dfa - # Done with this token - return False - elif t >= 256: - # See if it's a symbol and if we're in its first set - itsdfa = self.grammar.dfas[t] - itsstates, itsfirst = itsdfa - if ilabel in itsfirst: - # Push a symbol - self.push(t, self.grammar.dfas[t], newstate, context) - break # To continue the outer while loop - else: - if (0, state) in arcs: - # An accepting state, pop it and try something else - self.pop() - if not self.stack: - # Done parsing, but another token is input - raise ParseError("too much input", - type, value, context) - else: - # No success finding a transition - raise ParseError("bad input", type, value, context) - - def classify(self, type, value, context): - """Turn a token into a label. (Internal)""" - if type == token.NAME: - # Keep a listing of all used names - self.used_names.add(value) - # Check for reserved words - ilabel = self.grammar.keywords.get(value) - if ilabel is not None: - return ilabel - ilabel = self.grammar.tokens.get(type) - if ilabel is None: - raise ParseError("bad token", type, value, context) - return ilabel - - def shift(self, type, value, newstate, context): - """Shift a token. (Internal)""" - dfa, state, node = self.stack[-1] - newnode = (type, value, context, None) - newnode = self.convert(self.grammar, newnode) - if newnode is not None: - node[-1].append(newnode) - self.stack[-1] = (dfa, newstate, node) - - def push(self, type, newdfa, newstate, context): - """Push a nonterminal. (Internal)""" - dfa, state, node = self.stack[-1] - newnode = (type, None, context, []) - self.stack[-1] = (dfa, newstate, node) - self.stack.append((newdfa, 0, newnode)) - - def pop(self): - """Pop a nonterminal. (Internal)""" - popdfa, popstate, popnode = self.stack.pop() - newnode = self.convert(self.grammar, popnode) - if newnode is not None: - if self.stack: - dfa, state, node = self.stack[-1] - node[-1].append(newnode) - else: - self.rootnode = newnode - self.rootnode.used_names = self.used_names diff --git a/lib/python2.7/lib2to3/pgen2/pgen.py b/lib/python2.7/lib2to3/pgen2/pgen.py deleted file mode 100644 index 63084a4..0000000 --- a/lib/python2.7/lib2to3/pgen2/pgen.py +++ /dev/null @@ -1,386 +0,0 @@ -# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. -# Licensed to PSF under a Contributor Agreement. - -# Pgen imports -from . 
import grammar, token, tokenize - -class PgenGrammar(grammar.Grammar): - pass - -class ParserGenerator(object): - - def __init__(self, filename, stream=None): - close_stream = None - if stream is None: - stream = open(filename) - close_stream = stream.close - self.filename = filename - self.stream = stream - self.generator = tokenize.generate_tokens(stream.readline) - self.gettoken() # Initialize lookahead - self.dfas, self.startsymbol = self.parse() - if close_stream is not None: - close_stream() - self.first = {} # map from symbol name to set of tokens - self.addfirstsets() - - def make_grammar(self): - c = PgenGrammar() - names = self.dfas.keys() - names.sort() - names.remove(self.startsymbol) - names.insert(0, self.startsymbol) - for name in names: - i = 256 + len(c.symbol2number) - c.symbol2number[name] = i - c.number2symbol[i] = name - for name in names: - dfa = self.dfas[name] - states = [] - for state in dfa: - arcs = [] - for label, next in state.arcs.iteritems(): - arcs.append((self.make_label(c, label), dfa.index(next))) - if state.isfinal: - arcs.append((0, dfa.index(state))) - states.append(arcs) - c.states.append(states) - c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name)) - c.start = c.symbol2number[self.startsymbol] - return c - - def make_first(self, c, name): - rawfirst = self.first[name] - first = {} - for label in rawfirst: - ilabel = self.make_label(c, label) - ##assert ilabel not in first # XXX failed on <> ... != - first[ilabel] = 1 - return first - - def make_label(self, c, label): - # XXX Maybe this should be a method on a subclass of converter? - ilabel = len(c.labels) - if label[0].isalpha(): - # Either a symbol name or a named token - if label in c.symbol2number: - # A symbol name (a non-terminal) - if label in c.symbol2label: - return c.symbol2label[label] - else: - c.labels.append((c.symbol2number[label], None)) - c.symbol2label[label] = ilabel - return ilabel - else: - # A named token (NAME, NUMBER, STRING) - itoken = getattr(token, label, None) - assert isinstance(itoken, int), label - assert itoken in token.tok_name, label - if itoken in c.tokens: - return c.tokens[itoken] - else: - c.labels.append((itoken, None)) - c.tokens[itoken] = ilabel - return ilabel - else: - # Either a keyword or an operator - assert label[0] in ('"', "'"), label - value = eval(label) - if value[0].isalpha(): - # A keyword - if value in c.keywords: - return c.keywords[value] - else: - c.labels.append((token.NAME, value)) - c.keywords[value] = ilabel - return ilabel - else: - # An operator (any non-numeric token) - itoken = grammar.opmap[value] # Fails if unknown token - if itoken in c.tokens: - return c.tokens[itoken] - else: - c.labels.append((itoken, None)) - c.tokens[itoken] = ilabel - return ilabel - - def addfirstsets(self): - names = self.dfas.keys() - names.sort() - for name in names: - if name not in self.first: - self.calcfirst(name) - #print name, self.first[name].keys() - - def calcfirst(self, name): - dfa = self.dfas[name] - self.first[name] = None # dummy to detect left recursion - state = dfa[0] - totalset = {} - overlapcheck = {} - for label, next in state.arcs.iteritems(): - if label in self.dfas: - if label in self.first: - fset = self.first[label] - if fset is None: - raise ValueError("recursion for rule %r" % name) - else: - self.calcfirst(label) - fset = self.first[label] - totalset.update(fset) - overlapcheck[label] = fset - else: - totalset[label] = 1 - overlapcheck[label] = {label: 1} - inverse = {} - for label, itsfirst in 
overlapcheck.iteritems(): - for symbol in itsfirst: - if symbol in inverse: - raise ValueError("rule %s is ambiguous; %s is in the" - " first sets of %s as well as %s" % - (name, symbol, label, inverse[symbol])) - inverse[symbol] = label - self.first[name] = totalset - - def parse(self): - dfas = {} - startsymbol = None - # MSTART: (NEWLINE | RULE)* ENDMARKER - while self.type != token.ENDMARKER: - while self.type == token.NEWLINE: - self.gettoken() - # RULE: NAME ':' RHS NEWLINE - name = self.expect(token.NAME) - self.expect(token.OP, ":") - a, z = self.parse_rhs() - self.expect(token.NEWLINE) - #self.dump_nfa(name, a, z) - dfa = self.make_dfa(a, z) - #self.dump_dfa(name, dfa) - oldlen = len(dfa) - self.simplify_dfa(dfa) - newlen = len(dfa) - dfas[name] = dfa - #print name, oldlen, newlen - if startsymbol is None: - startsymbol = name - return dfas, startsymbol - - def make_dfa(self, start, finish): - # To turn an NFA into a DFA, we define the states of the DFA - # to correspond to *sets* of states of the NFA. Then do some - # state reduction. Let's represent sets as dicts with 1 for - # values. - assert isinstance(start, NFAState) - assert isinstance(finish, NFAState) - def closure(state): - base = {} - addclosure(state, base) - return base - def addclosure(state, base): - assert isinstance(state, NFAState) - if state in base: - return - base[state] = 1 - for label, next in state.arcs: - if label is None: - addclosure(next, base) - states = [DFAState(closure(start), finish)] - for state in states: # NB states grows while we're iterating - arcs = {} - for nfastate in state.nfaset: - for label, next in nfastate.arcs: - if label is not None: - addclosure(next, arcs.setdefault(label, {})) - for label, nfaset in arcs.iteritems(): - for st in states: - if st.nfaset == nfaset: - break - else: - st = DFAState(nfaset, finish) - states.append(st) - state.addarc(st, label) - return states # List of DFAState instances; first one is start - - def dump_nfa(self, name, start, finish): - print "Dump of NFA for", name - todo = [start] - for i, state in enumerate(todo): - print " State", i, state is finish and "(final)" or "" - for label, next in state.arcs: - if next in todo: - j = todo.index(next) - else: - j = len(todo) - todo.append(next) - if label is None: - print " -> %d" % j - else: - print " %s -> %d" % (label, j) - - def dump_dfa(self, name, dfa): - print "Dump of DFA for", name - for i, state in enumerate(dfa): - print " State", i, state.isfinal and "(final)" or "" - for label, next in state.arcs.iteritems(): - print " %s -> %d" % (label, dfa.index(next)) - - def simplify_dfa(self, dfa): - # This is not theoretically optimal, but works well enough. - # Algorithm: repeatedly look for two states that have the same - # set of arcs (same labels pointing to the same nodes) and - # unify them, until things stop changing. 
- - # dfa is a list of DFAState instances - changes = True - while changes: - changes = False - for i, state_i in enumerate(dfa): - for j in range(i+1, len(dfa)): - state_j = dfa[j] - if state_i == state_j: - #print " unify", i, j - del dfa[j] - for state in dfa: - state.unifystate(state_j, state_i) - changes = True - break - - def parse_rhs(self): - # RHS: ALT ('|' ALT)* - a, z = self.parse_alt() - if self.value != "|": - return a, z - else: - aa = NFAState() - zz = NFAState() - aa.addarc(a) - z.addarc(zz) - while self.value == "|": - self.gettoken() - a, z = self.parse_alt() - aa.addarc(a) - z.addarc(zz) - return aa, zz - - def parse_alt(self): - # ALT: ITEM+ - a, b = self.parse_item() - while (self.value in ("(", "[") or - self.type in (token.NAME, token.STRING)): - c, d = self.parse_item() - b.addarc(c) - b = d - return a, b - - def parse_item(self): - # ITEM: '[' RHS ']' | ATOM ['+' | '*'] - if self.value == "[": - self.gettoken() - a, z = self.parse_rhs() - self.expect(token.OP, "]") - a.addarc(z) - return a, z - else: - a, z = self.parse_atom() - value = self.value - if value not in ("+", "*"): - return a, z - self.gettoken() - z.addarc(a) - if value == "+": - return a, z - else: - return a, a - - def parse_atom(self): - # ATOM: '(' RHS ')' | NAME | STRING - if self.value == "(": - self.gettoken() - a, z = self.parse_rhs() - self.expect(token.OP, ")") - return a, z - elif self.type in (token.NAME, token.STRING): - a = NFAState() - z = NFAState() - a.addarc(z, self.value) - self.gettoken() - return a, z - else: - self.raise_error("expected (...) or NAME or STRING, got %s/%s", - self.type, self.value) - - def expect(self, type, value=None): - if self.type != type or (value is not None and self.value != value): - self.raise_error("expected %s/%s, got %s/%s", - type, value, self.type, self.value) - value = self.value - self.gettoken() - return value - - def gettoken(self): - tup = self.generator.next() - while tup[0] in (tokenize.COMMENT, tokenize.NL): - tup = self.generator.next() - self.type, self.value, self.begin, self.end, self.line = tup - #print token.tok_name[self.type], repr(self.value) - - def raise_error(self, msg, *args): - if args: - try: - msg = msg % args - except: - msg = " ".join([msg] + map(str, args)) - raise SyntaxError(msg, (self.filename, self.end[0], - self.end[1], self.line)) - -class NFAState(object): - - def __init__(self): - self.arcs = [] # list of (label, NFAState) pairs - - def addarc(self, next, label=None): - assert label is None or isinstance(label, str) - assert isinstance(next, NFAState) - self.arcs.append((label, next)) - -class DFAState(object): - - def __init__(self, nfaset, final): - assert isinstance(nfaset, dict) - assert isinstance(iter(nfaset).next(), NFAState) - assert isinstance(final, NFAState) - self.nfaset = nfaset - self.isfinal = final in nfaset - self.arcs = {} # map from label to DFAState - - def addarc(self, next, label): - assert isinstance(label, str) - assert label not in self.arcs - assert isinstance(next, DFAState) - self.arcs[label] = next - - def unifystate(self, old, new): - for label, next in self.arcs.iteritems(): - if next is old: - self.arcs[label] = new - - def __eq__(self, other): - # Equality test -- ignore the nfaset instance variable - assert isinstance(other, DFAState) - if self.isfinal != other.isfinal: - return False - # Can't just return self.arcs == other.arcs, because that - # would invoke this method recursively, with cycles... 
- if len(self.arcs) != len(other.arcs): - return False - for label, next in self.arcs.iteritems(): - if next is not other.arcs.get(label): - return False - return True - - __hash__ = None # For Py3 compatibility. - -def generate_grammar(filename="Grammar.txt"): - p = ParserGenerator(filename) - return p.make_grammar() diff --git a/lib/python2.7/lib2to3/pgen2/token.py b/lib/python2.7/lib2to3/pgen2/token.py deleted file mode 100755 index 61468b3..0000000 --- a/lib/python2.7/lib2to3/pgen2/token.py +++ /dev/null @@ -1,82 +0,0 @@ -#! /usr/bin/env python - -"""Token constants (from "token.h").""" - -# Taken from Python (r53757) and modified to include some tokens -# originally monkeypatched in by pgen2.tokenize - -#--start constants-- -ENDMARKER = 0 -NAME = 1 -NUMBER = 2 -STRING = 3 -NEWLINE = 4 -INDENT = 5 -DEDENT = 6 -LPAR = 7 -RPAR = 8 -LSQB = 9 -RSQB = 10 -COLON = 11 -COMMA = 12 -SEMI = 13 -PLUS = 14 -MINUS = 15 -STAR = 16 -SLASH = 17 -VBAR = 18 -AMPER = 19 -LESS = 20 -GREATER = 21 -EQUAL = 22 -DOT = 23 -PERCENT = 24 -BACKQUOTE = 25 -LBRACE = 26 -RBRACE = 27 -EQEQUAL = 28 -NOTEQUAL = 29 -LESSEQUAL = 30 -GREATEREQUAL = 31 -TILDE = 32 -CIRCUMFLEX = 33 -LEFTSHIFT = 34 -RIGHTSHIFT = 35 -DOUBLESTAR = 36 -PLUSEQUAL = 37 -MINEQUAL = 38 -STAREQUAL = 39 -SLASHEQUAL = 40 -PERCENTEQUAL = 41 -AMPEREQUAL = 42 -VBAREQUAL = 43 -CIRCUMFLEXEQUAL = 44 -LEFTSHIFTEQUAL = 45 -RIGHTSHIFTEQUAL = 46 -DOUBLESTAREQUAL = 47 -DOUBLESLASH = 48 -DOUBLESLASHEQUAL = 49 -AT = 50 -OP = 51 -COMMENT = 52 -NL = 53 -RARROW = 54 -ERRORTOKEN = 55 -N_TOKENS = 56 -NT_OFFSET = 256 -#--end constants-- - -tok_name = {} -for _name, _value in globals().items(): - if type(_value) is type(0): - tok_name[_value] = _name - - -def ISTERMINAL(x): - return x < NT_OFFSET - -def ISNONTERMINAL(x): - return x >= NT_OFFSET - -def ISEOF(x): - return x == ENDMARKER diff --git a/lib/python2.7/lib2to3/pgen2/tokenize.py b/lib/python2.7/lib2to3/pgen2/tokenize.py deleted file mode 100644 index e090aa9..0000000 --- a/lib/python2.7/lib2to3/pgen2/tokenize.py +++ /dev/null @@ -1,500 +0,0 @@ -# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation. -# All rights reserved. - -"""Tokenization help for Python programs. - -generate_tokens(readline) is a generator that breaks a stream of -text into Python tokens. It accepts a readline-like method which is called -repeatedly to get the next line of input (or "" for EOF). It generates -5-tuples with these members: - - the token type (see token.py) - the token (a string) - the starting (row, column) indices of the token (a 2-tuple of ints) - the ending (row, column) indices of the token (a 2-tuple of ints) - the original line (string) - -It is designed to match the working of the Python tokenizer exactly, except -that it produces COMMENT tokens for comments and gives type OP for all -operators - -Older entry points - tokenize_loop(readline, tokeneater) - tokenize(readline, tokeneater=printtoken) -are the same, except instead of generating tokens, tokeneater is a callback -function to which the 5 fields described above are passed as 5 arguments, -each time a new token is found.""" - -__author__ = 'Ka-Ping Yee <ping@lfw.org>' -__credits__ = \ - 'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro' - -import string, re -from codecs import BOM_UTF8, lookup -from lib2to3.pgen2.token import * - -from . 
import token -__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize", - "generate_tokens", "untokenize"] -del token - -try: - bytes -except NameError: - # Support bytes type in Python <= 2.5, so 2to3 turns itself into - # valid Python 3 code. - bytes = str - -def group(*choices): return '(' + '|'.join(choices) + ')' -def any(*choices): return group(*choices) + '*' -def maybe(*choices): return group(*choices) + '?' - -Whitespace = r'[ \f\t]*' -Comment = r'#[^\r\n]*' -Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment) -Name = r'[a-zA-Z_]\w*' - -Binnumber = r'0[bB][01]*' -Hexnumber = r'0[xX][\da-fA-F]*[lL]?' -Octnumber = r'0[oO]?[0-7]*[lL]?' -Decnumber = r'[1-9]\d*[lL]?' -Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber) -Exponent = r'[eE][-+]?\d+' -Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent) -Expfloat = r'\d+' + Exponent -Floatnumber = group(Pointfloat, Expfloat) -Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]') -Number = group(Imagnumber, Floatnumber, Intnumber) - -# Tail end of ' string. -Single = r"[^'\\]*(?:\\.[^'\\]*)*'" -# Tail end of " string. -Double = r'[^"\\]*(?:\\.[^"\\]*)*"' -# Tail end of ''' string. -Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''" -# Tail end of """ string. -Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""' -Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""') -# Single-line ' or " string. -String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'", - r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"') - -# Because of leftmost-then-longest match semantics, be sure to put the -# longest operators first (e.g., if = came before ==, == would get -# recognized as two instances of =). -Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=", - r"//=?", r"->", - r"[+\-*/%&|^=<>]=?", - r"~") - -Bracket = '[][(){}]' -Special = group(r'\r?\n', r'[:;.,`@]') -Funny = group(Operator, Bracket, Special) - -PlainToken = group(Number, Funny, String, Name) -Token = Ignore + PlainToken - -# First (or only) line of ' or " string. 
-ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" + - group("'", r'\\\r?\n'), - r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' + - group('"', r'\\\r?\n')) -PseudoExtras = group(r'\\\r?\n', Comment, Triple) -PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name) - -tokenprog, pseudoprog, single3prog, double3prog = map( - re.compile, (Token, PseudoToken, Single3, Double3)) -endprogs = {"'": re.compile(Single), '"': re.compile(Double), - "'''": single3prog, '"""': double3prog, - "r'''": single3prog, 'r"""': double3prog, - "u'''": single3prog, 'u"""': double3prog, - "b'''": single3prog, 'b"""': double3prog, - "ur'''": single3prog, 'ur"""': double3prog, - "br'''": single3prog, 'br"""': double3prog, - "R'''": single3prog, 'R"""': double3prog, - "U'''": single3prog, 'U"""': double3prog, - "B'''": single3prog, 'B"""': double3prog, - "uR'''": single3prog, 'uR"""': double3prog, - "Ur'''": single3prog, 'Ur"""': double3prog, - "UR'''": single3prog, 'UR"""': double3prog, - "bR'''": single3prog, 'bR"""': double3prog, - "Br'''": single3prog, 'Br"""': double3prog, - "BR'''": single3prog, 'BR"""': double3prog, - 'r': None, 'R': None, - 'u': None, 'U': None, - 'b': None, 'B': None} - -triple_quoted = {} -for t in ("'''", '"""', - "r'''", 'r"""', "R'''", 'R"""', - "u'''", 'u"""', "U'''", 'U"""', - "b'''", 'b"""', "B'''", 'B"""', - "ur'''", 'ur"""', "Ur'''", 'Ur"""', - "uR'''", 'uR"""', "UR'''", 'UR"""', - "br'''", 'br"""', "Br'''", 'Br"""', - "bR'''", 'bR"""', "BR'''", 'BR"""',): - triple_quoted[t] = t -single_quoted = {} -for t in ("'", '"', - "r'", 'r"', "R'", 'R"', - "u'", 'u"', "U'", 'U"', - "b'", 'b"', "B'", 'B"', - "ur'", 'ur"', "Ur'", 'Ur"', - "uR'", 'uR"', "UR'", 'UR"', - "br'", 'br"', "Br'", 'Br"', - "bR'", 'bR"', "BR'", 'BR"', ): - single_quoted[t] = t - -tabsize = 8 - -class TokenError(Exception): pass - -class StopTokenizing(Exception): pass - -def printtoken(type, token, start, end, line): # for testing - (srow, scol) = start - (erow, ecol) = end - print "%d,%d-%d,%d:\t%s\t%s" % \ - (srow, scol, erow, ecol, tok_name[type], repr(token)) - -def tokenize(readline, tokeneater=printtoken): - """ - The tokenize() function accepts two parameters: one representing the - input stream, and one providing an output mechanism for tokenize(). - - The first parameter, readline, must be a callable object which provides - the same interface as the readline() method of built-in file objects. - Each call to the function should return one line of input as a string. - - The second parameter, tokeneater, must also be a callable object. It is - called once for each token, with five arguments, corresponding to the - tuples generated by generate_tokens(). 
- """ - try: - tokenize_loop(readline, tokeneater) - except StopTokenizing: - pass - -# backwards compatible interface -def tokenize_loop(readline, tokeneater): - for token_info in generate_tokens(readline): - tokeneater(*token_info) - -class Untokenizer: - - def __init__(self): - self.tokens = [] - self.prev_row = 1 - self.prev_col = 0 - - def add_whitespace(self, start): - row, col = start - assert row <= self.prev_row - col_offset = col - self.prev_col - if col_offset: - self.tokens.append(" " * col_offset) - - def untokenize(self, iterable): - for t in iterable: - if len(t) == 2: - self.compat(t, iterable) - break - tok_type, token, start, end, line = t - self.add_whitespace(start) - self.tokens.append(token) - self.prev_row, self.prev_col = end - if tok_type in (NEWLINE, NL): - self.prev_row += 1 - self.prev_col = 0 - return "".join(self.tokens) - - def compat(self, token, iterable): - startline = False - indents = [] - toks_append = self.tokens.append - toknum, tokval = token - if toknum in (NAME, NUMBER): - tokval += ' ' - if toknum in (NEWLINE, NL): - startline = True - for tok in iterable: - toknum, tokval = tok[:2] - - if toknum in (NAME, NUMBER): - tokval += ' ' - - if toknum == INDENT: - indents.append(tokval) - continue - elif toknum == DEDENT: - indents.pop() - continue - elif toknum in (NEWLINE, NL): - startline = True - elif startline and indents: - toks_append(indents[-1]) - startline = False - toks_append(tokval) - -cookie_re = re.compile("coding[:=]\s*([-\w.]+)") - -def _get_normal_name(orig_enc): - """Imitates get_normal_name in tokenizer.c.""" - # Only care about the first 12 characters. - enc = orig_enc[:12].lower().replace("_", "-") - if enc == "utf-8" or enc.startswith("utf-8-"): - return "utf-8" - if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \ - enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")): - return "iso-8859-1" - return orig_enc - -def detect_encoding(readline): - """ - The detect_encoding() function is used to detect the encoding that should - be used to decode a Python source file. It requires one argment, readline, - in the same way as the tokenize() generator. - - It will call readline a maximum of twice, and return the encoding used - (as a string) and a list of any lines (left as bytes) it has read - in. - - It detects the encoding from the presence of a utf-8 bom or an encoding - cookie as specified in pep-0263. If both a bom and a cookie are present, but - disagree, a SyntaxError will be raised. If the encoding cookie is an invalid - charset, raise a SyntaxError. Note that if a utf-8 bom is found, - 'utf-8-sig' is returned. - - If no encoding is specified, then the default of 'utf-8' will be returned. 
- """ - bom_found = False - encoding = None - default = 'utf-8' - def read_or_stop(): - try: - return readline() - except StopIteration: - return bytes() - - def find_cookie(line): - try: - line_string = line.decode('ascii') - except UnicodeDecodeError: - return None - - matches = cookie_re.findall(line_string) - if not matches: - return None - encoding = _get_normal_name(matches[0]) - try: - codec = lookup(encoding) - except LookupError: - # This behaviour mimics the Python interpreter - raise SyntaxError("unknown encoding: " + encoding) - - if bom_found: - if codec.name != 'utf-8': - # This behaviour mimics the Python interpreter - raise SyntaxError('encoding problem: utf-8') - encoding += '-sig' - return encoding - - first = read_or_stop() - if first.startswith(BOM_UTF8): - bom_found = True - first = first[3:] - default = 'utf-8-sig' - if not first: - return default, [] - - encoding = find_cookie(first) - if encoding: - return encoding, [first] - - second = read_or_stop() - if not second: - return default, [first] - - encoding = find_cookie(second) - if encoding: - return encoding, [first, second] - - return default, [first, second] - -def untokenize(iterable): - """Transform tokens back into Python source code. - - Each element returned by the iterable must be a token sequence - with at least two elements, a token number and token value. If - only two tokens are passed, the resulting output is poor. - - Round-trip invariant for full input: - Untokenized source will match input source exactly - - Round-trip invariant for limited intput: - # Output text will tokenize the back to the input - t1 = [tok[:2] for tok in generate_tokens(f.readline)] - newcode = untokenize(t1) - readline = iter(newcode.splitlines(1)).next - t2 = [tok[:2] for tokin generate_tokens(readline)] - assert t1 == t2 - """ - ut = Untokenizer() - return ut.untokenize(iterable) - -def generate_tokens(readline): - """ - The generate_tokens() generator requires one argment, readline, which - must be a callable object which provides the same interface as the - readline() method of built-in file objects. Each call to the function - should return one line of input as a string. Alternately, readline - can be a callable function terminating with StopIteration: - readline = open(myfile).next # Example of alternate readline - - The generator produces 5-tuples with these members: the token type; the - token string; a 2-tuple (srow, scol) of ints specifying the row and - column where the token begins in the source; a 2-tuple (erow, ecol) of - ints specifying the row and column where the token ends in the source; - and the line on which the token was found. The line passed is the - logical line; continuation lines are included. 
- """ - lnum = parenlev = continued = 0 - namechars, numchars = string.ascii_letters + '_', '0123456789' - contstr, needcont = '', 0 - contline = None - indents = [0] - - while 1: # loop over lines in stream - try: - line = readline() - except StopIteration: - line = '' - lnum = lnum + 1 - pos, max = 0, len(line) - - if contstr: # continued string - if not line: - raise TokenError, ("EOF in multi-line string", strstart) - endmatch = endprog.match(line) - if endmatch: - pos = end = endmatch.end(0) - yield (STRING, contstr + line[:end], - strstart, (lnum, end), contline + line) - contstr, needcont = '', 0 - contline = None - elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n': - yield (ERRORTOKEN, contstr + line, - strstart, (lnum, len(line)), contline) - contstr = '' - contline = None - continue - else: - contstr = contstr + line - contline = contline + line - continue - - elif parenlev == 0 and not continued: # new statement - if not line: break - column = 0 - while pos < max: # measure leading whitespace - if line[pos] == ' ': column = column + 1 - elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize - elif line[pos] == '\f': column = 0 - else: break - pos = pos + 1 - if pos == max: break - - if line[pos] in '#\r\n': # skip comments or blank lines - if line[pos] == '#': - comment_token = line[pos:].rstrip('\r\n') - nl_pos = pos + len(comment_token) - yield (COMMENT, comment_token, - (lnum, pos), (lnum, pos + len(comment_token)), line) - yield (NL, line[nl_pos:], - (lnum, nl_pos), (lnum, len(line)), line) - else: - yield ((NL, COMMENT)[line[pos] == '#'], line[pos:], - (lnum, pos), (lnum, len(line)), line) - continue - - if column > indents[-1]: # count indents or dedents - indents.append(column) - yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line) - while column < indents[-1]: - if column not in indents: - raise IndentationError( - "unindent does not match any outer indentation level", - ("<tokenize>", lnum, pos, line)) - indents = indents[:-1] - yield (DEDENT, '', (lnum, pos), (lnum, pos), line) - - else: # continued statement - if not line: - raise TokenError, ("EOF in multi-line statement", (lnum, 0)) - continued = 0 - - while pos < max: - pseudomatch = pseudoprog.match(line, pos) - if pseudomatch: # scan for tokens - start, end = pseudomatch.span(1) - spos, epos, pos = (lnum, start), (lnum, end), end - token, initial = line[start:end], line[start] - - if initial in numchars or \ - (initial == '.' 
and token != '.'): # ordinary number - yield (NUMBER, token, spos, epos, line) - elif initial in '\r\n': - newline = NEWLINE - if parenlev > 0: - newline = NL - yield (newline, token, spos, epos, line) - elif initial == '#': - assert not token.endswith("\n") - yield (COMMENT, token, spos, epos, line) - elif token in triple_quoted: - endprog = endprogs[token] - endmatch = endprog.match(line, pos) - if endmatch: # all on one line - pos = endmatch.end(0) - token = line[start:pos] - yield (STRING, token, spos, (lnum, pos), line) - else: - strstart = (lnum, start) # multiple lines - contstr = line[start:] - contline = line - break - elif initial in single_quoted or \ - token[:2] in single_quoted or \ - token[:3] in single_quoted: - if token[-1] == '\n': # continued string - strstart = (lnum, start) - endprog = (endprogs[initial] or endprogs[token[1]] or - endprogs[token[2]]) - contstr, needcont = line[start:], 1 - contline = line - break - else: # ordinary string - yield (STRING, token, spos, epos, line) - elif initial in namechars: # ordinary name - yield (NAME, token, spos, epos, line) - elif initial == '\\': # continued stmt - # This yield is new; needed for better idempotency: - yield (NL, token, spos, (lnum, pos), line) - continued = 1 - else: - if initial in '([{': parenlev = parenlev + 1 - elif initial in ')]}': parenlev = parenlev - 1 - yield (OP, token, spos, epos, line) - else: - yield (ERRORTOKEN, line[pos], - (lnum, pos), (lnum, pos+1), line) - pos = pos + 1 - - for indent in indents[1:]: # pop remaining indent levels - yield (DEDENT, '', (lnum, 0), (lnum, 0), '') - yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '') - -if __name__ == '__main__': # testing - import sys - if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline) - else: tokenize(sys.stdin.readline) |
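tokenize.py above is the pure-Python tokenizer that feeds the driver. A short sketch of its generator interface and of the limited round-trip described in the untokenize() docstring, again assuming Python 2.7 (the StringIO module supplies the readline callable):

```python
# Minimal sketch (assumes Python 2.7 with lib2to3 available).
import StringIO
from lib2to3.pgen2 import tokenize
from lib2to3.pgen2.token import tok_name

source = "def f(x):\n    return x + 1  # comment\n"

# generate_tokens() yields 5-tuples:
# (type, string, (srow, scol), (erow, ecol), logical line)
for typ, value, start, end, line in tokenize.generate_tokens(
        StringIO.StringIO(source).readline):
    print "%-10s %r" % (tok_name.get(typ, typ), value)

# Limited round-trip from the untokenize() docstring: feeding only
# (type, string) pairs back produces source that tokenizes the same way,
# though whitespace is not reproduced exactly.
pairs = [tok[:2] for tok in tokenize.generate_tokens(
    StringIO.StringIO(source).readline)]
print tokenize.untokenize(pairs)
```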