diff options
Diffstat (limited to 'python/helpers/coverage/parser.py')
-rw-r--r-- | python/helpers/coverage/parser.py | 442 |
1 files changed, 171 insertions, 271 deletions
diff --git a/python/helpers/coverage/parser.py b/python/helpers/coverage/parser.py index cbbb5a6a2ec1..7a145a2a5346 100644 --- a/python/helpers/coverage/parser.py +++ b/python/helpers/coverage/parser.py @@ -1,9 +1,11 @@ """Code parsing for Coverage.""" -import glob, opcode, os, re, sys, token, tokenize +import dis, re, sys, token, tokenize from coverage.backward import set, sorted, StringIO # pylint: disable=W0622 -from coverage.backward import open_source +from coverage.backward import open_source, range # pylint: disable=W0622 +from coverage.backward import reversed # pylint: disable=W0622 +from coverage.backward import bytes_to_ints from coverage.bytecode import ByteCodes, CodeObjects from coverage.misc import nice_pair, expensive, join_regex from coverage.misc import CoverageException, NoSource, NotPython @@ -32,9 +34,13 @@ class CodeParser(object): except IOError: _, err, _ = sys.exc_info() raise NoSource( - "No source for code: %r: %s" % (self.filename, err) + "No source for code: '%s': %s" % (self.filename, err) ) + # Scrap the BOM if it exists. + if self.text and ord(self.text[0]) == 0xfeff: + self.text = self.text[1:] + self.exclude = exclude self.show_tokens = False @@ -102,9 +108,9 @@ class CodeParser(object): first_line = None empty = True - tokgen = tokenize.generate_tokens(StringIO(self.text).readline) + tokgen = generate_tokens(self.text) for toktype, ttext, (slineno, _), (elineno, _), ltext in tokgen: - if self.show_tokens: # pragma: no cover + if self.show_tokens: # pragma: not covered print("%10s %5s %-20r %r" % ( tokenize.tok_name.get(toktype, toktype), nice_pair((slineno, elineno)), ttext, ltext @@ -130,8 +136,7 @@ class CodeParser(object): # (a trick from trace.py in the stdlib.) This works for # 99.9999% of cases. For the rest (!) see: # http://stackoverflow.com/questions/1769332/x/1769794#1769794 - for i in range(slineno, elineno+1): - self.docstrings.add(i) + self.docstrings.update(range(slineno, elineno+1)) elif toktype == token.NEWLINE: if first_line is not None and elineno != first_line: # We're at the end of a line, and we've ended on a @@ -170,16 +175,18 @@ class CodeParser(object): first_line = line return first_line - def first_lines(self, lines, ignore=None): + def first_lines(self, lines, *ignores): """Map the line numbers in `lines` to the correct first line of the statement. - Skip any line mentioned in `ignore`. + Skip any line mentioned in any of the sequences in `ignores`. - Returns a sorted list of the first lines. + Returns a set of the first lines. """ - ignore = ignore or [] + ignore = set() + for ign in ignores: + ignore.update(ign) lset = set() for l in lines: if l in ignore: @@ -187,23 +194,34 @@ class CodeParser(object): new_l = self.first_line(l) if new_l not in ignore: lset.add(new_l) - return sorted(lset) + return lset def parse_source(self): """Parse source text to find executable lines, excluded lines, etc. - Return values are 1) a sorted list of executable line numbers, and - 2) a sorted list of excluded line numbers. + Return values are 1) a set of executable line numbers, and 2) a set of + excluded line numbers. Reported line numbers are normalized to the first line of multi-line statements. """ - self._raw_parse() + try: + self._raw_parse() + except (tokenize.TokenError, IndentationError): + _, tokerr, _ = sys.exc_info() + msg, lineno = tokerr.args + raise NotPython( + "Couldn't parse '%s' as Python source: '%s' at %s" % + (self.filename, msg, lineno) + ) excluded_lines = self.first_lines(self.excluded) - ignore = excluded_lines + list(self.docstrings) - lines = self.first_lines(self.statement_starts, ignore) + lines = self.first_lines( + self.statement_starts, + excluded_lines, + self.docstrings + ) return lines, excluded_lines @@ -258,8 +276,8 @@ class CodeParser(object): ## Opcodes that guide the ByteParser. def _opcode(name): - """Return the opcode by name from the opcode module.""" - return opcode.opmap[name] + """Return the opcode by name from the dis module.""" + return dis.opmap[name] def _opcode_set(*names): """Return a set of opcodes by the names in `names`.""" @@ -297,7 +315,7 @@ OPS_EXCEPT_BLOCKS = _opcode_set('SETUP_EXCEPT', 'SETUP_FINALLY') OPS_POP_BLOCK = _opcode_set('POP_BLOCK') # Opcodes that have a jump destination, but aren't really a jump. -OPS_NO_JUMP = _opcode_set('SETUP_EXCEPT', 'SETUP_FINALLY') +OPS_NO_JUMP = OPS_PUSH_BLOCK # Individual opcodes we need below. OP_BREAK_LOOP = _opcode('BREAK_LOOP') @@ -314,6 +332,7 @@ class ByteParser(object): def __init__(self, code=None, text=None, filename=None): if code: self.code = code + self.text = text else: if not text: assert filename, "If no code or text, need a filename" @@ -322,6 +341,7 @@ class ByteParser(object): text = sourcef.read() finally: sourcef.close() + self.text = text try: # Python 2.3 and 2.4 don't like partial last lines, so be sure @@ -350,69 +370,54 @@ class ByteParser(object): The iteration includes `self` as its first value. """ - return map(lambda c: ByteParser(code=c), CodeObjects(self.code)) - - # Getting numbers from the lnotab value changed in Py3.0. - if sys.version_info >= (3, 0): - def _lnotab_increments(self, lnotab): - """Return a list of ints from the lnotab bytes in 3.x""" - return list(lnotab) - else: - def _lnotab_increments(self, lnotab): - """Return a list of ints from the lnotab string in 2.x""" - return [ord(c) for c in lnotab] + children = CodeObjects(self.code) + return [ByteParser(code=c, text=self.text) for c in children] def _bytes_lines(self): """Map byte offsets to line numbers in `code`. Uses co_lnotab described in Python/compile.c to map byte offsets to - line numbers. Returns a list: [(b0, l0), (b1, l1), ...] + line numbers. Produces a sequence: (b0, l0), (b1, l1), ... + + Only byte offsets that correspond to line numbers are included in the + results. """ # Adapted from dis.py in the standard library. - byte_increments = self._lnotab_increments(self.code.co_lnotab[0::2]) - line_increments = self._lnotab_increments(self.code.co_lnotab[1::2]) + byte_increments = bytes_to_ints(self.code.co_lnotab[0::2]) + line_increments = bytes_to_ints(self.code.co_lnotab[1::2]) - bytes_lines = [] last_line_num = None line_num = self.code.co_firstlineno byte_num = 0 for byte_incr, line_incr in zip(byte_increments, line_increments): if byte_incr: if line_num != last_line_num: - bytes_lines.append((byte_num, line_num)) + yield (byte_num, line_num) last_line_num = line_num byte_num += byte_incr line_num += line_incr if line_num != last_line_num: - bytes_lines.append((byte_num, line_num)) - return bytes_lines + yield (byte_num, line_num) def _find_statements(self): """Find the statements in `self.code`. - Return a set of line numbers that start statements. Recurses into all - code objects reachable from `self.code`. + Produce a sequence of line numbers that start statements. Recurses + into all code objects reachable from `self.code`. """ - stmts = set() for bp in self.child_parsers(): # Get all of the lineno information from this code. for _, l in bp._bytes_lines(): - stmts.add(l) - return stmts - - def _disassemble(self): # pragma: no cover - """Disassemble code, for ad-hoc experimenting.""" - - import dis - - for bp in self.child_parsers(): - print("\n%s: " % bp.code) - dis.dis(bp.code) - print("Bytes lines: %r" % bp._bytes_lines()) + yield l - print("") + def _block_stack_repr(self, block_stack): + """Get a string version of `block_stack`, for debugging.""" + blocks = ", ".join( + ["(%s, %r)" % (dis.opname[b[0]], b[1]) for b in block_stack] + ) + return "[" + blocks + "]" def _split_into_chunks(self): """Split the code object into a list of `Chunk` objects. @@ -423,10 +428,11 @@ class ByteParser(object): Returns a list of `Chunk` objects. """ - # The list of chunks so far, and the one we're working on. chunks = [] chunk = None + + # A dict mapping byte offsets of line starts to the line numbers. bytes_lines_map = dict(self._bytes_lines()) # The block stack: loops and try blocks get pushed here for the @@ -441,24 +447,38 @@ class ByteParser(object): # We have to handle the last two bytecodes specially. ult = penult = None - for bc in ByteCodes(self.code.co_code): + # Get a set of all of the jump-to points. + jump_to = set() + bytecodes = list(ByteCodes(self.code.co_code)) + for bc in bytecodes: + if bc.jump_to >= 0: + jump_to.add(bc.jump_to) + + chunk_lineno = 0 + + # Walk the byte codes building chunks. + for bc in bytecodes: # Maybe have to start a new chunk + start_new_chunk = False + first_chunk = False if bc.offset in bytes_lines_map: # Start a new chunk for each source line number. - if chunk: - chunk.exits.add(bc.offset) - chunk = Chunk(bc.offset, bytes_lines_map[bc.offset]) - chunks.append(chunk) + start_new_chunk = True + chunk_lineno = bytes_lines_map[bc.offset] + first_chunk = True + elif bc.offset in jump_to: + # To make chunks have a single entrance, we have to make a new + # chunk when we get to a place some bytecode jumps to. + start_new_chunk = True elif bc.op in OPS_CHUNK_BEGIN: # Jumps deserve their own unnumbered chunk. This fixes # problems with jumps to jumps getting confused. + start_new_chunk = True + + if not chunk or start_new_chunk: if chunk: chunk.exits.add(bc.offset) - chunk = Chunk(bc.offset) - chunks.append(chunk) - - if not chunk: - chunk = Chunk(bc.offset) + chunk = Chunk(bc.offset, chunk_lineno, first_chunk) chunks.append(chunk) # Look at the opcode @@ -487,15 +507,11 @@ class ByteParser(object): chunk.exits.add(block_stack[-1][1]) chunk = None if bc.op == OP_END_FINALLY: - if block_stack: - # A break that goes through a finally will jump to whatever - # block is on top of the stack. - chunk.exits.add(block_stack[-1][1]) # For the finally clause we need to find the closest exception # block, and use its jump target as an exit. - for iblock in range(len(block_stack)-1, -1, -1): - if block_stack[iblock][0] in OPS_EXCEPT_BLOCKS: - chunk.exits.add(block_stack[iblock][1]) + for block in reversed(block_stack): + if block[0] in OPS_EXCEPT_BLOCKS: + chunk.exits.add(block[1]) break if bc.op == OP_COMPARE_OP and bc.arg == COMPARE_EXCEPTION: # This is an except clause. We want to overlook the next @@ -521,23 +537,33 @@ class ByteParser(object): last_chunk = chunks[-1] last_chunk.exits.remove(ex) last_chunk.exits.add(penult.offset) - chunk = Chunk(penult.offset) + chunk = Chunk( + penult.offset, last_chunk.line, False + ) chunk.exits.add(ex) chunks.append(chunk) # Give all the chunks a length. - chunks[-1].length = bc.next_offset - chunks[-1].byte + chunks[-1].length = bc.next_offset - chunks[-1].byte # pylint: disable=W0631,C0301 for i in range(len(chunks)-1): chunks[i].length = chunks[i+1].byte - chunks[i].byte + #self.validate_chunks(chunks) return chunks + def validate_chunks(self, chunks): + """Validate the rule that chunks have a single entrance.""" + # starts is the entrances to the chunks + starts = set([ch.byte for ch in chunks]) + for ch in chunks: + assert all([(ex in starts or ex < 0) for ex in ch.exits]) + def _arcs(self): """Find the executable arcs in the code. - Returns a set of pairs, (from,to). From and to are integer line - numbers. If from is < 0, then the arc is an entrance into the code - object. If to is < 0, the arc is an exit from the code object. + Yields pairs: (from,to). From and to are integer line numbers. If + from is < 0, then the arc is an entrance into the code object. If to + is < 0, the arc is an exit from the code object. """ chunks = self._split_into_chunks() @@ -545,65 +571,43 @@ class ByteParser(object): # A map from byte offsets to chunks jumped into. byte_chunks = dict([(c.byte, c) for c in chunks]) - # Build a map from byte offsets to actual lines reached. - byte_lines = {} - bytes_to_add = set([c.byte for c in chunks]) + # There's always an entrance at the first chunk. + yield (-1, byte_chunks[0].line) - while bytes_to_add: - byte_to_add = bytes_to_add.pop() - if byte_to_add in byte_lines or byte_to_add < 0: + # Traverse from the first chunk in each line, and yield arcs where + # the trace function will be invoked. + for chunk in chunks: + if not chunk.first: continue - # Which lines does this chunk lead to? - bytes_considered = set() - bytes_to_consider = [byte_to_add] - lines = set() - - while bytes_to_consider: - byte = bytes_to_consider.pop() - bytes_considered.add(byte) - - # Find chunk for byte - try: - ch = byte_chunks[byte] - except KeyError: - for ch in chunks: - if ch.byte <= byte < ch.byte+ch.length: - break - else: - # No chunk for this byte! - raise Exception("Couldn't find chunk @ %d" % byte) - byte_chunks[byte] = ch - - if ch.line: - lines.add(ch.line) - else: - for ex in ch.exits: - if ex < 0: - lines.add(ex) - elif ex not in bytes_considered: - bytes_to_consider.append(ex) - - bytes_to_add.update(ch.exits) - - byte_lines[byte_to_add] = lines - - # Figure out for each chunk where the exits go. - arcs = set() - for chunk in chunks: - if chunk.line: - for ex in chunk.exits: + chunks_considered = set() + chunks_to_consider = [chunk] + while chunks_to_consider: + # Get the chunk we're considering, and make sure we don't + # consider it again + this_chunk = chunks_to_consider.pop() + chunks_considered.add(this_chunk) + + # For each exit, add the line number if the trace function + # would be triggered, or add the chunk to those being + # considered if not. + for ex in this_chunk.exits: if ex < 0: - exit_lines = [ex] + yield (chunk.line, ex) else: - exit_lines = byte_lines[ex] - for exit_line in exit_lines: - if chunk.line != exit_line: - arcs.add((chunk.line, exit_line)) - for line in byte_lines[0]: - arcs.add((-1, line)) - - return arcs + next_chunk = byte_chunks[ex] + if next_chunk in chunks_considered: + continue + + # The trace function is invoked if visiting the first + # bytecode in a line, or if the transition is a + # backward jump. + backward_jump = next_chunk.byte < this_chunk.byte + if next_chunk.first or backward_jump: + if next_chunk.line != chunk.line: + yield (chunk.line, next_chunk.line) + else: + chunks_to_consider.append(next_chunk) def _all_chunks(self): """Returns a list of `Chunk` objects for this code and its children. @@ -631,11 +635,11 @@ class ByteParser(object): class Chunk(object): - """A sequence of bytecodes with a single entrance. + """A sequence of byte codes with a single entrance. To analyze byte code, we have to divide it into chunks, sequences of byte - codes such that each basic block has only one entrance, the first - instruction in the block. + codes such that each chunk has only one entrance, the first instruction in + the block. This is almost the CS concept of `basic block`_, except that we're willing to have many exits from a chunk, and "basic block" is a more cumbersome @@ -643,158 +647,54 @@ class Chunk(object): .. _basic block: http://en.wikipedia.org/wiki/Basic_block + `line` is the source line number containing this chunk. + + `first` is true if this is the first chunk in the source line. + An exit < 0 means the chunk can leave the code (return). The exit is the negative of the starting line number of the code block. """ - def __init__(self, byte, line=0): + def __init__(self, byte, line, first): self.byte = byte self.line = line + self.first = first self.length = 0 self.exits = set() def __repr__(self): - return "<%d+%d @%d %r>" % ( - self.byte, self.length, self.line, list(self.exits) + if self.first: + bang = "!" + else: + bang = "" + return "<%d+%d @%d%s %r>" % ( + self.byte, self.length, self.line, bang, list(self.exits) ) -class AdHocMain(object): # pragma: no cover - """An ad-hoc main for code parsing experiments.""" +class CachedTokenizer(object): + """A one-element cache around tokenize.generate_tokens. - def main(self, args): - """A main function for trying the code from the command line.""" + When reporting, coverage.py tokenizes files twice, once to find the + structure of the file, and once to syntax-color it. Tokenizing is + expensive, and easily cached. - from optparse import OptionParser + This is a one-element cache so that our twice-in-a-row tokenizing doesn't + actually tokenize twice. - parser = OptionParser() - parser.add_option( - "-c", action="store_true", dest="chunks", - help="Show basic block chunks" - ) - parser.add_option( - "-d", action="store_true", dest="dis", - help="Disassemble" - ) - parser.add_option( - "-R", action="store_true", dest="recursive", - help="Recurse to find source files" - ) - parser.add_option( - "-s", action="store_true", dest="source", - help="Show analyzed source" - ) - parser.add_option( - "-t", action="store_true", dest="tokens", - help="Show tokens" + """ + def __init__(self): + self.last_text = None + self.last_tokens = None + + def generate_tokens(self, text): + """A stand-in for `tokenize.generate_tokens`.""" + if text != self.last_text: + self.last_text = text + self.last_tokens = list( + tokenize.generate_tokens(StringIO(text).readline) ) + return self.last_tokens - options, args = parser.parse_args() - if options.recursive: - if args: - root = args[0] - else: - root = "." - for root, _, _ in os.walk(root): - for f in glob.glob(root + "/*.py"): - self.adhoc_one_file(options, f) - else: - self.adhoc_one_file(options, args[0]) - - def adhoc_one_file(self, options, filename): - """Process just one file.""" - - if options.dis or options.chunks: - try: - bp = ByteParser(filename=filename) - except CoverageException: - _, err, _ = sys.exc_info() - print("%s" % (err,)) - return - - if options.dis: - print("Main code:") - bp._disassemble() - - if options.chunks: - chunks = bp._all_chunks() - if options.recursive: - print("%6d: %s" % (len(chunks), filename)) - else: - print("Chunks: %r" % chunks) - arcs = bp._all_arcs() - print("Arcs: %r" % sorted(arcs)) - - if options.source or options.tokens: - cp = CodeParser(filename=filename, exclude=r"no\s*cover") - cp.show_tokens = options.tokens - cp._raw_parse() - - if options.source: - if options.chunks: - arc_width, arc_chars = self.arc_ascii_art(arcs) - else: - arc_width, arc_chars = 0, {} - - exit_counts = cp.exit_counts() - - for i, ltext in enumerate(cp.lines): - lineno = i+1 - m0 = m1 = m2 = m3 = a = ' ' - if lineno in cp.statement_starts: - m0 = '-' - exits = exit_counts.get(lineno, 0) - if exits > 1: - m1 = str(exits) - if lineno in cp.docstrings: - m2 = '"' - if lineno in cp.classdefs: - m2 = 'C' - if lineno in cp.excluded: - m3 = 'x' - a = arc_chars.get(lineno, '').ljust(arc_width) - print("%4d %s%s%s%s%s %s" % - (lineno, m0, m1, m2, m3, a, ltext) - ) - - def arc_ascii_art(self, arcs): - """Draw arcs as ascii art. - - Returns a width of characters needed to draw all the arcs, and a - dictionary mapping line numbers to ascii strings to draw for that line. - - """ - arc_chars = {} - for lfrom, lto in sorted(arcs): - if lfrom < 0: - arc_chars[lto] = arc_chars.get(lto, '') + 'v' - elif lto < 0: - arc_chars[lfrom] = arc_chars.get(lfrom, '') + '^' - else: - if lfrom == lto - 1: - # Don't show obvious arcs. - continue - if lfrom < lto: - l1, l2 = lfrom, lto - else: - l1, l2 = lto, lfrom - w = max([len(arc_chars.get(l, '')) for l in range(l1, l2+1)]) - for l in range(l1, l2+1): - if l == lfrom: - ch = '<' - elif l == lto: - ch = '>' - else: - ch = '|' - arc_chars[l] = arc_chars.get(l, '').ljust(w) + ch - arc_width = 0 - - if arc_chars: - arc_width = max([len(a) for a in arc_chars.values()]) - else: - arc_width = 0 - - return arc_width, arc_chars - -if __name__ == '__main__': - AdHocMain().main(sys.argv[1:]) +# Create our generate_tokens cache as a callable replacement function. +generate_tokens = CachedTokenizer().generate_tokens |