Diffstat (limited to 'antlr-3.4/runtime/Python/antlr3/recognizers.py')
-rw-r--r-- | antlr-3.4/runtime/Python/antlr3/recognizers.py | 1485
1 files changed, 1485 insertions, 0 deletions
diff --git a/antlr-3.4/runtime/Python/antlr3/recognizers.py b/antlr-3.4/runtime/Python/antlr3/recognizers.py
new file mode 100644
index 0000000..d48280a
--- /dev/null
+++ b/antlr-3.4/runtime/Python/antlr3/recognizers.py
@@ -0,0 +1,1485 @@
+"""ANTLR3 runtime package"""
+
+# begin[licence]
+#
+# [The "BSD licence"]
+# Copyright (c) 2005-2008 Terence Parr
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+# 3. The name of the author may not be used to endorse or promote products
+#    derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# end[licence]
+
+import sys
+import inspect
+
+from antlr3 import compatible_api_versions
+from antlr3.constants import DEFAULT_CHANNEL, HIDDEN_CHANNEL, EOF, \
+    EOR_TOKEN_TYPE, INVALID_TOKEN_TYPE
+from antlr3.exceptions import RecognitionException, MismatchedTokenException, \
+    MismatchedRangeException, MismatchedTreeNodeException, \
+    NoViableAltException, EarlyExitException, MismatchedSetException, \
+    MismatchedNotSetException, FailedPredicateException, \
+    BacktrackingFailed, UnwantedTokenException, MissingTokenException
+from antlr3.tokens import CommonToken, SKIP_TOKEN
+from antlr3.compat import set, frozenset, reversed
+
+
+class RecognizerSharedState(object):
+    """
+    The set of fields needed by an abstract recognizer to recognize input
+    and recover from errors etc... As a separate state object, it can be
+    shared among multiple grammars; e.g., when one grammar imports another.
+
+    These fields are publicly visible but the actual state pointer per
+    parser is protected.
+    """
+
+    def __init__(self):
+        # Track the set of token types that can follow any rule invocation.
+        # Stack grows upwards.
+        self.following = []
+
+        # This is true when we see an error and before having successfully
+        # matched a token. Prevents generation of more than one error message
+        # per error.
+        self.errorRecovery = False
+
+        # The index into the input stream where the last error occurred.
+        # This is used to prevent infinite loops where an error is found
+        # but no token is consumed during recovery...another error is found,
+        # ad nauseam. This is a failsafe mechanism to guarantee that at least
+        # one token/tree node is consumed for two errors.
+        self.lastErrorIndex = -1
+
+        # If 0, no backtracking is going on. Safe to exec actions etc...
+        # If >0 then it's the level of backtracking.
+        self.backtracking = 0
+
+        # An array[size num rules] of Map<Integer,Integer> that tracks
+        # the stop token index for each rule. ruleMemo[ruleIndex] is
+        # the memoization table for ruleIndex. For key ruleStartIndex, you
+        # get back the stop token for associated rule or MEMO_RULE_FAILED.
+        #
+        # This is only used if rule memoization is on (which it is by default).
+        self.ruleMemo = None
+
+        ## Did the recognizer encounter a syntax error? Track how many.
+        self.syntaxErrors = 0
+
+
+        # LEXER FIELDS (must be in same state object to avoid casting
+        # constantly in generated code and Lexer object) :(
+
+
+        ## The goal of all lexer rules/methods is to create a token object.
+        # This is an instance variable as multiple rules may collaborate to
+        # create a single token. nextToken will return this object after
+        # matching lexer rule(s). If you subclass to allow multiple token
+        # emissions, then set this to the last token to be matched or
+        # something nonnull so that the auto token emit mechanism will not
+        # emit another token.
+        self.token = None
+
+        ## What character index in the stream did the current token start at?
+        # Needed, for example, to get the text for current token. Set at
+        # the start of nextToken.
+        self.tokenStartCharIndex = -1
+
+        ## The line on which the first character of the token resides
+        self.tokenStartLine = None
+
+        ## The character position of first character within the line
+        self.tokenStartCharPositionInLine = None
+
+        ## The channel number for the current token
+        self.channel = None
+
+        ## The token type for the current token
+        self.type = None
+
+        ## You can set the text for the current token to override what is in
+        # the input char buffer. Use setText() or you can set this instance
+        # variable directly.
+        self.text = None
+
+
+class BaseRecognizer(object):
+    """
+    @brief Common recognizer functionality.
+
+    A generic recognizer that can handle recognizers generated from
+    lexer, parser, and tree grammars. This is all the parsing
+    support code essentially; most of it is error recovery stuff and
+    backtracking.
+    """
+
+    MEMO_RULE_FAILED = -2
+    MEMO_RULE_UNKNOWN = -1
+
+    # copies from Token object for convenience in actions
+    DEFAULT_TOKEN_CHANNEL = DEFAULT_CHANNEL
+
+    # for convenience in actions
+    HIDDEN = HIDDEN_CHANNEL
+
+    # overridden by generated subclasses
+    tokenNames = None
+
+    # The api_version attribute was introduced in 3.3. If it is not
+    # overwritten in the generated recognizer, we assume a default of v0.
+    api_version = 0
+
+    def __init__(self, state=None):
+        # Input stream of the recognizer. Must be initialized by a subclass.
+        self.input = None
+
+        ## The state of a lexer, parser, or tree parser is collected into a
+        # state object so the state can be shared. This sharing is needed to
+        # have one grammar import others and share the same error variables
+        # and other state variables. It's a kind of explicit multiple
+        # inheritance via delegation of methods and shared state.
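+        # Illustrative only (not part of this module): a composite grammar
+        # and its imported delegates can share a single state object, e.g.
+        # with hypothetical generated classes OuterParser/OuterParser_Inner:
+        #
+        #   state = RecognizerSharedState()
+        #   outer = OuterParser(token_stream, state=state)
+        #   inner = OuterParser_Inner(token_stream, state)
+        #
+        # so that error flags and the backtracking level stay consistent
+        # across all delegates.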
+        if state is None:
+            state = RecognizerSharedState()
+        self._state = state
+
+        if self.api_version not in compatible_api_versions:
+            raise RuntimeError(
+                ("ANTLR version mismatch: "
+                 "The recognizer has been generated with API V%s, "
+                 "but this runtime does not support this.")
+                % self.api_version)
+
+    # this one only exists to shut up pylint :(
+    def setInput(self, input):
+        self.input = input
+
+
+    def reset(self):
+        """
+        Reset the parser's state; subclasses must rewind the input stream.
+        """
+
+        # wack everything related to error recovery
+        if self._state is None:
+            # no shared state work to do
+            return
+
+        self._state.following = []
+        self._state.errorRecovery = False
+        self._state.lastErrorIndex = -1
+        self._state.syntaxErrors = 0
+        # wack everything related to backtracking and memoization
+        self._state.backtracking = 0
+        if self._state.ruleMemo is not None:
+            self._state.ruleMemo = {}
+
+
+    def match(self, input, ttype, follow):
+        """
+        Match current input symbol against ttype. Attempt
+        single token insertion or deletion error recovery. If
+        that fails, throw MismatchedTokenException.
+
+        To turn off single token insertion or deletion error
+        recovery, override recoverFromMismatchedToken() and have it
+        throw an exception. See TreeParser.recoverFromMismatchedToken().
+        This way any error in a rule will cause an exception and
+        immediate exit from rule. The rule would recover by resynchronizing
+        to the set of symbols that can follow the rule ref.
+        """
+
+        matchedSymbol = self.getCurrentInputSymbol(input)
+        if self.input.LA(1) == ttype:
+            self.input.consume()
+            self._state.errorRecovery = False
+            return matchedSymbol
+
+        if self._state.backtracking > 0:
+            # FIXME: need to return matchedSymbol here as well. damn!!
+            raise BacktrackingFailed
+
+        matchedSymbol = self.recoverFromMismatchedToken(input, ttype, follow)
+        return matchedSymbol
+
+
+    def matchAny(self, input):
+        """Match the wildcard: in a symbol"""
+
+        self._state.errorRecovery = False
+        self.input.consume()
+
+
+    def mismatchIsUnwantedToken(self, input, ttype):
+        return input.LA(2) == ttype
+
+
+    def mismatchIsMissingToken(self, input, follow):
+        if follow is None:
+            # we have no information about the follow; we can only consume
+            # a single token and hope for the best
+            return False
+
+        # compute what can follow this grammar element reference
+        if EOR_TOKEN_TYPE in follow:
+            viableTokensFollowingThisRule = self.computeContextSensitiveRuleFOLLOW()
+            follow = follow | viableTokensFollowingThisRule
+
+            if len(self._state.following) > 0:
+                # remove EOR if we're not the start symbol
+                follow = follow - set([EOR_TOKEN_TYPE])
+
+        # if current token is consistent with what could come after the set,
+        # then we know we're missing a token; error recovery is free to
+        # "insert" the missing token
+        if input.LA(1) in follow or EOR_TOKEN_TYPE in follow:
+            return True
+
+        return False
+
+
+    def reportError(self, e):
+        """Report a recognition problem.
+
+        This method sets errorRecovery to indicate the parser is recovering,
+        not parsing. Once in recovery mode, no errors are generated.
+        To get out of recovery mode, the parser must successfully match
+        a token (after a resync). So it will go:
+
+        1. error occurs
+        2. enter recovery mode, report error
+        3. consume until token found in resynch set
+        4. try to resume parsing
+        5. next match() will reset errorRecovery mode
+
+        If you override, make sure to update syntaxErrors if you care about
+        that.
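+
+        As an illustration (not part of the original documentation), an
+        application that wants errors on a logger instead of stderr would
+        typically leave this method alone and override emitErrorMessage();
+        'log' below is a hypothetical logger object:
+
+            def emitErrorMessage(self, msg):
+                log.warning(msg)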
+ + """ + + # if we've already reported an error and have not matched a token + # yet successfully, don't report any errors. + if self._state.errorRecovery: + return + + self._state.syntaxErrors += 1 # don't count spurious + self._state.errorRecovery = True + + self.displayRecognitionError(self.tokenNames, e) + + + def displayRecognitionError(self, tokenNames, e): + hdr = self.getErrorHeader(e) + msg = self.getErrorMessage(e, tokenNames) + self.emitErrorMessage(hdr+" "+msg) + + + def getErrorMessage(self, e, tokenNames): + """ + What error message should be generated for the various + exception types? + + Not very object-oriented code, but I like having all error message + generation within one method rather than spread among all of the + exception classes. This also makes it much easier for the exception + handling because the exception classes do not have to have pointers back + to this object to access utility routines and so on. Also, changing + the message for an exception type would be difficult because you + would have to subclassing exception, but then somehow get ANTLR + to make those kinds of exception objects instead of the default. + This looks weird, but trust me--it makes the most sense in terms + of flexibility. + + For grammar debugging, you will want to override this to add + more information such as the stack frame with + getRuleInvocationStack(e, this.getClass().getName()) and, + for no viable alts, the decision description and state etc... + + Override this to change the message generated for one or more + exception types. + """ + + if isinstance(e, UnwantedTokenException): + tokenName = "<unknown>" + if e.expecting == EOF: + tokenName = "EOF" + + else: + tokenName = self.tokenNames[e.expecting] + + msg = "extraneous input %s expecting %s" % ( + self.getTokenErrorDisplay(e.getUnexpectedToken()), + tokenName + ) + + elif isinstance(e, MissingTokenException): + tokenName = "<unknown>" + if e.expecting == EOF: + tokenName = "EOF" + + else: + tokenName = self.tokenNames[e.expecting] + + msg = "missing %s at %s" % ( + tokenName, self.getTokenErrorDisplay(e.token) + ) + + elif isinstance(e, MismatchedTokenException): + tokenName = "<unknown>" + if e.expecting == EOF: + tokenName = "EOF" + else: + tokenName = self.tokenNames[e.expecting] + + msg = "mismatched input " \ + + self.getTokenErrorDisplay(e.token) \ + + " expecting " \ + + tokenName + + elif isinstance(e, MismatchedTreeNodeException): + tokenName = "<unknown>" + if e.expecting == EOF: + tokenName = "EOF" + else: + tokenName = self.tokenNames[e.expecting] + + msg = "mismatched tree node: %s expecting %s" \ + % (e.node, tokenName) + + elif isinstance(e, NoViableAltException): + msg = "no viable alternative at input " \ + + self.getTokenErrorDisplay(e.token) + + elif isinstance(e, EarlyExitException): + msg = "required (...)+ loop did not match anything at input " \ + + self.getTokenErrorDisplay(e.token) + + elif isinstance(e, MismatchedSetException): + msg = "mismatched input " \ + + self.getTokenErrorDisplay(e.token) \ + + " expecting set " \ + + repr(e.expecting) + + elif isinstance(e, MismatchedNotSetException): + msg = "mismatched input " \ + + self.getTokenErrorDisplay(e.token) \ + + " expecting set " \ + + repr(e.expecting) + + elif isinstance(e, FailedPredicateException): + msg = "rule " \ + + e.ruleName \ + + " failed predicate: {" \ + + e.predicateText \ + + "}?" 
+
+        else:
+            msg = str(e)
+
+        return msg
+
+
+    def getNumberOfSyntaxErrors(self):
+        """
+        Get number of recognition errors (lexer, parser, tree parser). Each
+        recognizer tracks its own number, so parser and lexer each have
+        a separate count. Does not count the spurious errors found between
+        an error and the next valid token match.
+
+        See also reportError()
+        """
+        return self._state.syntaxErrors
+
+
+    def getErrorHeader(self, e):
+        """
+        What is the error header, normally line/character position information?
+        """
+
+        source_name = self.getSourceName()
+        if source_name is not None:
+            return "%s line %d:%d" % (source_name, e.line, e.charPositionInLine)
+        return "line %d:%d" % (e.line, e.charPositionInLine)
+
+
+    def getTokenErrorDisplay(self, t):
+        """
+        How should a token be displayed in an error message? The default
+        is to display just the text, but during development you might
+        want to have a lot of information spit out. Override in that case
+        to use t.toString() (which, for CommonToken, dumps everything about
+        the token). This is better than forcing you to override a method in
+        your token objects because you don't have to go modify your lexer
+        so that it creates a new token type.
+        """
+
+        s = t.text
+        if s is None:
+            if t.type == EOF:
+                s = "<EOF>"
+            else:
+                s = "<"+t.type+">"
+
+        return repr(s)
+
+
+    def emitErrorMessage(self, msg):
+        """Override this method to change where error messages go"""
+        sys.stderr.write(msg + '\n')
+
+
+    def recover(self, input, re):
+        """
+        Recover from an error found on the input stream. This is
+        for NoViableAlt and mismatched symbol exceptions. If you enable
+        single token insertion and deletion, this will usually not
+        handle mismatched symbol exceptions but there could be a mismatched
+        token that the match() routine could not recover from.
+        """
+
+        # PROBLEM? what if input stream is not the same as last time
+        # perhaps make lastErrorIndex a member of input
+        if self._state.lastErrorIndex == input.index():
+            # uh oh, another error at same token index; must be a case
+            # where LT(1) is in the recovery token set so nothing is
+            # consumed; consume a single token, at least, to prevent
+            # an infinite loop; this is a failsafe.
+            input.consume()
+
+        self._state.lastErrorIndex = input.index()
+        followSet = self.computeErrorRecoverySet()
+
+        self.beginResync()
+        self.consumeUntil(input, followSet)
+        self.endResync()
+
+
+    def beginResync(self):
+        """
+        A hook to listen in on the token consumption during error recovery.
+        The DebugParser subclasses this to fire events to the listener.
+        """
+
+        pass
+
+
+    def endResync(self):
+        """
+        A hook to listen in on the token consumption during error recovery.
+        The DebugParser subclasses this to fire events to the listener.
+        """
+
+        pass
+
+
+    def computeErrorRecoverySet(self):
+        """
+        Compute the error recovery set for the current rule. During
+        rule invocation, the parser pushes the set of tokens that can
+        follow that rule reference on the stack; this amounts to
+        computing FIRST of what follows the rule reference in the
+        enclosing rule. This local follow set only includes tokens
+        from within the rule; i.e., the FIRST computation done by
+        ANTLR stops at the end of a rule.
+
+        EXAMPLE
+
+        When you find a "no viable alt exception", the input is not
+        consistent with any of the alternatives for rule r. The best
+        thing to do is to consume tokens until you see something that
+        can legally follow a call to r *or* any rule that called r.
+        You don't want the exact set of viable next tokens because the
+        input might just be missing a token--you might consume the
+        rest of the input looking for one of the missing tokens.
+
+        Consider grammar:
+
+          a : '[' b ']'
+            | '(' b ')'
+            ;
+          b : c '^' INT ;
+          c : ID
+            | INT
+            ;
+
+        At each rule invocation, the set of tokens that could follow
+        that rule is pushed on a stack. Here are the various "local"
+        follow sets:
+
+          FOLLOW(b1_in_a) = FIRST(']') = ']'
+          FOLLOW(b2_in_a) = FIRST(')') = ')'
+          FOLLOW(c_in_b)  = FIRST('^') = '^'
+
+        Upon erroneous input "[]", the call chain is
+
+          a -> b -> c
+
+        and, hence, the follow context stack is:
+
+          depth  local follow set  after call to rule
+            0    \<EOF>            a (from main())
+            1    ']'               b
+            2    '^'               c
+
+        Notice that ')' is not included, because b would have to have
+        been called from a different context in rule a for ')' to be
+        included.
+
+        For error recovery, we cannot consider FOLLOW(c)
+        (context-sensitive or otherwise). We need the combined set of
+        all context-sensitive FOLLOW sets--the set of all tokens that
+        could follow any reference in the call chain. We need to
+        resync to one of those tokens. Note that FOLLOW(c)='^' and if
+        we resync'd to that token, we'd consume until EOF. We need to
+        sync to context-sensitive FOLLOWs for a, b, and c: {']','^'}.
+        In this case, for input "[]", LA(1) is in this set so we would
+        not consume anything, and after printing an error, rule c would
+        return normally. It would not find the required '^' though.
+        At this point, it gets a mismatched token error and throws an
+        exception (since LA(1) is not in the viable following token
+        set). The rule exception handler tries to recover, but finds
+        the same recovery set and doesn't consume anything. Rule b
+        exits normally returning to rule a. Now it finds the ']' (and
+        with the successful match exits errorRecovery mode).
+
+        So, you can see that the parser walks up the call chain looking
+        for the token that was a member of the recovery set.
+
+        Errors are not generated in errorRecovery mode.
+
+        ANTLR's error recovery mechanism is based upon original ideas:
+
+        "Algorithms + Data Structures = Programs" by Niklaus Wirth
+
+        and
+
+        "A note on error recovery in recursive descent parsers":
+        http://portal.acm.org/citation.cfm?id=947902.947905
+
+        Later, Josef Grosch had some good ideas:
+
+        "Efficient and Comfortable Error Recovery in Recursive Descent
+        Parsers":
+        ftp://www.cocolab.com/products/cocktail/doca4.ps/ell.ps.zip
+
+        Like Grosch I implemented local FOLLOW sets that are combined
+        at run-time upon error to avoid overhead during parsing.
+        """
+
+        return self.combineFollows(False)
+
+
+    def computeContextSensitiveRuleFOLLOW(self):
+        """
+        Compute the context-sensitive FOLLOW set for current rule.
+        This is the set of token types that can follow a specific rule
+        reference given a specific call chain. You get the set of
+        viable tokens that can possibly come next (lookahead depth 1)
+        given the current call chain. Contrast this with the
+        definition of plain FOLLOW for rule r:
+
+         FOLLOW(r)={x | S=>*alpha r beta in G and x in FIRST(beta)}
+
+        where x in T* and alpha, beta in V*; T is the set of terminals and
+        V is the set of terminals and nonterminals. In other words,
+        FOLLOW(r) is the set of all tokens that can possibly follow
+        references to r in *any* sentential form (context). At
+        runtime, however, we know precisely which context applies as
+        we have the call chain. We may compute the exact (rather
+        than covering superset) set of following tokens.
+
+        For example, consider grammar:
+
+          stat : ID '=' expr ';'      // FOLLOW(stat)=={EOF}
+               | "return" expr '.'
+               ;
+          expr : atom ('+' atom)* ;   // FOLLOW(expr)=={';','.',')'}
+          atom : INT                  // FOLLOW(atom)=={'+',')',';','.'}
+               | '(' expr ')'
+               ;
+
+        The FOLLOW sets are all inclusive whereas context-sensitive
+        FOLLOW sets are precisely what could follow a rule reference.
+        For input "i=(3);", here is the derivation:
+
+          stat => ID '=' expr ';'
+               => ID '=' atom ('+' atom)* ';'
+               => ID '=' '(' expr ')' ('+' atom)* ';'
+               => ID '=' '(' atom ')' ('+' atom)* ';'
+               => ID '=' '(' INT ')' ('+' atom)* ';'
+               => ID '=' '(' INT ')' ';'
+
+        At the "3" token, you'd have a call chain of
+
+          stat -> expr -> atom -> expr -> atom
+
+        What can follow that specific nested ref to atom? Exactly ')'
+        as you can see by looking at the derivation of this specific
+        input. Contrast this with the FOLLOW(atom)={'+',')',';','.'}.
+
+        You want the exact viable token set when recovering from a
+        token mismatch. Upon token mismatch, if LA(1) is a member of
+        the viable next token set, then you know there is most likely
+        a missing token in the input stream. "Insert" one by just not
+        throwing an exception.
+        """
+
+        return self.combineFollows(True)
+
+
+    def combineFollows(self, exact):
+        followSet = set()
+        for idx, localFollowSet in reversed(list(enumerate(self._state.following))):
+            followSet |= localFollowSet
+            if exact:
+                # can we see end of rule?
+                if EOR_TOKEN_TYPE in localFollowSet:
+                    # Only leave EOR in set if at top (start rule); this lets
+                    # us know if we have to include follow(start rule);
+                    # i.e., EOF
+                    if idx > 0:
+                        followSet.remove(EOR_TOKEN_TYPE)
+
+                else:
+                    # can't see end of rule, quit
+                    break
+
+        return followSet
+
+
+    def recoverFromMismatchedToken(self, input, ttype, follow):
+        """Attempt to recover from a single missing or extra token.
+
+        EXTRA TOKEN
+
+        LA(1) is not what we are looking for. If LA(2) has the right token,
+        however, then assume LA(1) is some extra spurious token. Delete it
+        and use LA(2) as if we were doing a normal match(), which advances
+        the input.
+
+        MISSING TOKEN
+
+        If the current token is consistent with what could come after
+        ttype then it is ok to 'insert' the missing token, else throw
+        an exception. For example, input 'i=(3;' is clearly missing the
+        ')'. When the parser returns from the nested call to expr, it
+        will have the call chain:
+
+          stat -> expr -> atom
+
+        and it will be trying to match the ')' at this point in the
+        derivation:
+
+          => ID '=' '(' INT ')' ('+' atom)* ';'
+                             ^
+        match() will see that ';' doesn't match ')' and report a
+        mismatched token error. To recover, it sees that LA(1)==';'
+        is in the set of tokens that can follow the ')' token
+        reference in rule atom. It can assume that you forgot the ')'.
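+
+        To turn this recovery off entirely (see match()), a subclass can
+        override this method to raise immediately, for example:
+
+            def recoverFromMismatchedToken(self, input, ttype, follow):
+                raise MismatchedTokenException(ttype, input)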
+ """ + + e = None + + # if next token is what we are looking for then "delete" this token + if self.mismatchIsUnwantedToken(input, ttype): + e = UnwantedTokenException(ttype, input) + + self.beginResync() + input.consume() # simply delete extra token + self.endResync() + + # report after consuming so AW sees the token in the exception + self.reportError(e) + + # we want to return the token we're actually matching + matchedSymbol = self.getCurrentInputSymbol(input) + + # move past ttype token as if all were ok + input.consume() + return matchedSymbol + + # can't recover with single token deletion, try insertion + if self.mismatchIsMissingToken(input, follow): + inserted = self.getMissingSymbol(input, e, ttype, follow) + e = MissingTokenException(ttype, input, inserted) + + # report after inserting so AW sees the token in the exception + self.reportError(e) + return inserted + + # even that didn't work; must throw the exception + e = MismatchedTokenException(ttype, input) + raise e + + + def recoverFromMismatchedSet(self, input, e, follow): + """Not currently used""" + + if self.mismatchIsMissingToken(input, follow): + self.reportError(e) + # we don't know how to conjure up a token for sets yet + return self.getMissingSymbol(input, e, INVALID_TOKEN_TYPE, follow) + + # TODO do single token deletion like above for Token mismatch + raise e + + + def getCurrentInputSymbol(self, input): + """ + Match needs to return the current input symbol, which gets put + into the label for the associated token ref; e.g., x=ID. Token + and tree parsers need to return different objects. Rather than test + for input stream type or change the IntStream interface, I use + a simple method to ask the recognizer to tell me what the current + input symbol is. + + This is ignored for lexers. + """ + + return None + + + def getMissingSymbol(self, input, e, expectedTokenType, follow): + """Conjure up a missing token during error recovery. + + The recognizer attempts to recover from single missing + symbols. But, actions might refer to that missing symbol. + For example, x=ID {f($x);}. The action clearly assumes + that there has been an identifier matched previously and that + $x points at that token. If that token is missing, but + the next token in the stream is what we want we assume that + this token is missing and we keep going. Because we + have to return some token to replace the missing token, + we have to conjure one up. This method gives the user control + over the tokens returned for missing tokens. Mostly, + you will want to create something special for identifier + tokens. For literals such as '{' and ',', the default + action in the parser or tree parser works. It simply creates + a CommonToken of the appropriate type. The text will be the token. + If you change what tokens must be created by the lexer, + override this method to create the appropriate tokens. + """ + + return None + + +## def recoverFromMissingElement(self, input, e, follow): +## """ +## This code is factored out from mismatched token and mismatched set +## recovery. It handles "single token insertion" error recovery for +## both. No tokens are consumed to recover from insertions. Return +## true if recovery was possible else return false. 
+## """ + +## if self.mismatchIsMissingToken(input, follow): +## self.reportError(e) +## return True + +## # nothing to do; throw exception +## return False + + + def consumeUntil(self, input, tokenTypes): + """ + Consume tokens until one matches the given token or token set + + tokenTypes can be a single token type or a set of token types + + """ + + if not isinstance(tokenTypes, (set, frozenset)): + tokenTypes = frozenset([tokenTypes]) + + ttype = input.LA(1) + while ttype != EOF and ttype not in tokenTypes: + input.consume() + ttype = input.LA(1) + + + def getRuleInvocationStack(self): + """ + Return List<String> of the rules in your parser instance + leading up to a call to this method. You could override if + you want more details such as the file/line info of where + in the parser java code a rule is invoked. + + This is very useful for error messages and for context-sensitive + error recovery. + + You must be careful, if you subclass a generated recognizers. + The default implementation will only search the module of self + for rules, but the subclass will not contain any rules. + You probably want to override this method to look like + + def getRuleInvocationStack(self): + return self._getRuleInvocationStack(<class>.__module__) + + where <class> is the class of the generated recognizer, e.g. + the superclass of self. + """ + + return self._getRuleInvocationStack(self.__module__) + + + def _getRuleInvocationStack(cls, module): + """ + A more general version of getRuleInvocationStack where you can + pass in, for example, a RecognitionException to get it's rule + stack trace. This routine is shared with all recognizers, hence, + static. + + TODO: move to a utility class or something; weird having lexer call + this + """ + + # mmmhhh,... perhaps look at the first argument + # (f_locals[co_varnames[0]]?) and test if it's a (sub)class of + # requested recognizer... + + rules = [] + for frame in reversed(inspect.stack()): + code = frame[0].f_code + codeMod = inspect.getmodule(code) + if codeMod is None: + continue + + # skip frames not in requested module + if codeMod.__name__ != module: + continue + + # skip some unwanted names + if code.co_name in ('nextToken', '<module>'): + continue + + rules.append(code.co_name) + + return rules + + _getRuleInvocationStack = classmethod(_getRuleInvocationStack) + + + def getBacktrackingLevel(self): + return self._state.backtracking + + def setBacktrackingLevel(self, n): + self._state.backtracking = n + + + def getGrammarFileName(self): + """For debugging and other purposes, might want the grammar name. + + Have ANTLR generate an implementation for this method. + """ + + return self.grammarFileName + + + def getSourceName(self): + raise NotImplementedError + + + def toStrings(self, tokens): + """A convenience method for use most often with template rewrites. + + Convert a List<Token> to List<String> + """ + + if tokens is None: + return None + + return [token.text for token in tokens] + + + def getRuleMemoization(self, ruleIndex, ruleStartIndex): + """ + Given a rule number and a start token index number, return + MEMO_RULE_UNKNOWN if the rule has not parsed input starting from + start index. If this rule has parsed input starting from the + start index before, then return where the rule stopped parsing. + It returns the index of the last token matched by the rule. 
+ """ + + if ruleIndex not in self._state.ruleMemo: + self._state.ruleMemo[ruleIndex] = {} + + return self._state.ruleMemo[ruleIndex].get( + ruleStartIndex, self.MEMO_RULE_UNKNOWN + ) + + + def alreadyParsedRule(self, input, ruleIndex): + """ + Has this rule already parsed input at the current index in the + input stream? Return the stop token index or MEMO_RULE_UNKNOWN. + If we attempted but failed to parse properly before, return + MEMO_RULE_FAILED. + + This method has a side-effect: if we have seen this input for + this rule and successfully parsed before, then seek ahead to + 1 past the stop token matched for this rule last time. + """ + + stopIndex = self.getRuleMemoization(ruleIndex, input.index()) + if stopIndex == self.MEMO_RULE_UNKNOWN: + return False + + if stopIndex == self.MEMO_RULE_FAILED: + raise BacktrackingFailed + + else: + input.seek(stopIndex + 1) + + return True + + + def memoize(self, input, ruleIndex, ruleStartIndex, success): + """ + Record whether or not this rule parsed the input at this position + successfully. + """ + + if success: + stopTokenIndex = input.index() - 1 + else: + stopTokenIndex = self.MEMO_RULE_FAILED + + if ruleIndex in self._state.ruleMemo: + self._state.ruleMemo[ruleIndex][ruleStartIndex] = stopTokenIndex + + + def traceIn(self, ruleName, ruleIndex, inputSymbol): + sys.stdout.write("enter %s %s" % (ruleName, inputSymbol)) + + if self._state.backtracking > 0: + sys.stdout.write(" backtracking=%s" % self._state.backtracking) + + sys.stdout.write('\n') + + + def traceOut(self, ruleName, ruleIndex, inputSymbol): + sys.stdout.write("exit %s %s" % (ruleName, inputSymbol)) + + if self._state.backtracking > 0: + sys.stdout.write(" backtracking=%s" % self._state.backtracking) + + # mmmm... we use BacktrackingFailed exceptions now. So how could we + # get that information here? + #if self._state.failed: + # sys.stdout.write(" failed") + #else: + # sys.stdout.write(" succeeded") + + sys.stdout.write('\n') + + +class TokenSource(object): + """ + @brief Abstract baseclass for token producers. + + A source of tokens must provide a sequence of tokens via nextToken() + and also must reveal it's source of characters; CommonToken's text is + computed from a CharStream; it only store indices into the char stream. + + Errors from the lexer are never passed to the parser. Either you want + to keep going or you do not upon token recognition error. If you do not + want to continue lexing then you do not want to continue parsing. Just + throw an exception not under RecognitionException and Java will naturally + toss you all the way out of the recognizers. If you want to continue + lexing then you should not throw an exception to the parser--it has already + requested a token. Keep lexing until you get a valid one. Just report + errors and keep going, looking for a valid token. + """ + + def nextToken(self): + """Return a Token object from your input stream (usually a CharStream). + + Do not fail/return upon lexing error; keep chewing on the characters + until you get a good one; errors are not passed through to the parser. + """ + + raise NotImplementedError + + + def __iter__(self): + """The TokenSource is an interator. + + The iteration will not include the final EOF token, see also the note + for the next() method. + + """ + + return self + + + def next(self): + """Return next token or raise StopIteration. + + Note that this will raise StopIteration when hitting the EOF token, + so EOF will not be part of the iteration. 
+ + """ + + token = self.nextToken() + if token is None or token.type == EOF: + raise StopIteration + return token + + +class Lexer(BaseRecognizer, TokenSource): + """ + @brief Baseclass for generated lexer classes. + + A lexer is recognizer that draws input symbols from a character stream. + lexer grammars result in a subclass of this object. A Lexer object + uses simplified match() and error recovery mechanisms in the interest + of speed. + """ + + def __init__(self, input, state=None): + BaseRecognizer.__init__(self, state) + TokenSource.__init__(self) + + # Where is the lexer drawing characters from? + self.input = input + + + def reset(self): + BaseRecognizer.reset(self) # reset all recognizer state variables + + if self.input is not None: + # rewind the input + self.input.seek(0) + + if self._state is None: + # no shared state work to do + return + + # wack Lexer state variables + self._state.token = None + self._state.type = INVALID_TOKEN_TYPE + self._state.channel = DEFAULT_CHANNEL + self._state.tokenStartCharIndex = -1 + self._state.tokenStartLine = -1 + self._state.tokenStartCharPositionInLine = -1 + self._state.text = None + + + def makeEOFToken(self): + eof = CommonToken( + type=EOF, channel=DEFAULT_CHANNEL, + input=self.input, + start=self.input.index(), stop=self.input.index()) + eof.line = self.input.line + eof.charPositionInLine = self.input.charPositionInLine + return eof + + def nextToken(self): + """ + Return a token from this source; i.e., match a token on the char + stream. + """ + + while 1: + self._state.token = None + self._state.channel = DEFAULT_CHANNEL + self._state.tokenStartCharIndex = self.input.index() + self._state.tokenStartCharPositionInLine = self.input.charPositionInLine + self._state.tokenStartLine = self.input.line + self._state.text = None + if self.input.LA(1) == EOF: + return self.makeEOFToken() + + try: + self.mTokens() + + if self._state.token is None: + self.emit() + + elif self._state.token == SKIP_TOKEN: + continue + + return self._state.token + + except NoViableAltException, re: + self.reportError(re) + self.recover(re) # throw out current char and try again + + except RecognitionException, re: + self.reportError(re) + # match() routine has already called recover() + + + def skip(self): + """ + Instruct the lexer to skip creating a token for current lexer rule + and look for another token. nextToken() knows to keep looking when + a lexer rule finishes with token set to SKIP_TOKEN. Recall that + if token==null at end of any token rule, it creates one for you + and emits it. + """ + + self._state.token = SKIP_TOKEN + + + def mTokens(self): + """This is the lexer entry point that sets instance var 'token'""" + + # abstract method + raise NotImplementedError + + + def setCharStream(self, input): + """Set the char stream and reset the lexer""" + self.input = None + self.reset() + self.input = input + + + def getSourceName(self): + return self.input.getSourceName() + + + def emit(self, token=None): + """ + The standard method called to automatically emit a token at the + outermost lexical rule. The token object should point into the + char buffer start..stop. If there is a text override in 'text', + use that to set the token's text. Override this method to emit + custom Token objects. + + If you are building trees, then you should also override + Parser or TreeParser.getMissingSymbol(). 
+ """ + + if token is None: + token = CommonToken( + input=self.input, + type=self._state.type, + channel=self._state.channel, + start=self._state.tokenStartCharIndex, + stop=self.getCharIndex()-1 + ) + token.line = self._state.tokenStartLine + token.text = self._state.text + token.charPositionInLine = self._state.tokenStartCharPositionInLine + + self._state.token = token + + return token + + + def match(self, s): + if isinstance(s, basestring): + for c in s: + if self.input.LA(1) != ord(c): + if self._state.backtracking > 0: + raise BacktrackingFailed + + mte = MismatchedTokenException(c, self.input) + self.recover(mte) + raise mte + + self.input.consume() + + else: + if self.input.LA(1) != s: + if self._state.backtracking > 0: + raise BacktrackingFailed + + mte = MismatchedTokenException(unichr(s), self.input) + self.recover(mte) # don't really recover; just consume in lexer + raise mte + + self.input.consume() + + + def matchAny(self): + self.input.consume() + + + def matchRange(self, a, b): + if self.input.LA(1) < a or self.input.LA(1) > b: + if self._state.backtracking > 0: + raise BacktrackingFailed + + mre = MismatchedRangeException(unichr(a), unichr(b), self.input) + self.recover(mre) + raise mre + + self.input.consume() + + + def getLine(self): + return self.input.line + + + def getCharPositionInLine(self): + return self.input.charPositionInLine + + + def getCharIndex(self): + """What is the index of the current character of lookahead?""" + + return self.input.index() + + + def getText(self): + """ + Return the text matched so far for the current token or any + text override. + """ + if self._state.text is not None: + return self._state.text + + return self.input.substring( + self._state.tokenStartCharIndex, + self.getCharIndex()-1 + ) + + + def setText(self, text): + """ + Set the complete text of this token; it wipes any previous + changes to the text. + """ + self._state.text = text + + + text = property(getText, setText) + + + def reportError(self, e): + ## TODO: not thought about recovery in lexer yet. + + ## # if we've already reported an error and have not matched a token + ## # yet successfully, don't report any errors. + ## if self.errorRecovery: + ## #System.err.print("[SPURIOUS] "); + ## return; + ## + ## self.errorRecovery = True + + self.displayRecognitionError(self.tokenNames, e) + + + def getErrorMessage(self, e, tokenNames): + msg = None + + if isinstance(e, MismatchedTokenException): + msg = "mismatched character " \ + + self.getCharErrorDisplay(e.c) \ + + " expecting " \ + + self.getCharErrorDisplay(e.expecting) + + elif isinstance(e, NoViableAltException): + msg = "no viable alternative at character " \ + + self.getCharErrorDisplay(e.c) + + elif isinstance(e, EarlyExitException): + msg = "required (...)+ loop did not match anything at character " \ + + self.getCharErrorDisplay(e.c) + + elif isinstance(e, MismatchedNotSetException): + msg = "mismatched character " \ + + self.getCharErrorDisplay(e.c) \ + + " expecting set " \ + + repr(e.expecting) + + elif isinstance(e, MismatchedSetException): + msg = "mismatched character " \ + + self.getCharErrorDisplay(e.c) \ + + " expecting set " \ + + repr(e.expecting) + + elif isinstance(e, MismatchedRangeException): + msg = "mismatched character " \ + + self.getCharErrorDisplay(e.c) \ + + " expecting set " \ + + self.getCharErrorDisplay(e.a) \ + + ".." 
+                  + self.getCharErrorDisplay(e.b)
+
+        else:
+            msg = BaseRecognizer.getErrorMessage(self, e, tokenNames)
+
+        return msg
+
+
+    def getCharErrorDisplay(self, c):
+        if c == EOF:
+            c = '<EOF>'
+        return repr(c)
+
+
+    def recover(self, re):
+        """
+        Lexers can normally match any char in their vocabulary after matching
+        a token, so do the easy thing and just kill a character and hope
+        it all works out. You can instead use the rule invocation stack
+        to do sophisticated error recovery if you are in a fragment rule.
+        """
+
+        self.input.consume()
+
+
+    def traceIn(self, ruleName, ruleIndex):
+        inputSymbol = "%s line=%d:%s" % (self.input.LT(1),
+                                         self.getLine(),
+                                         self.getCharPositionInLine()
+                                         )
+
+        BaseRecognizer.traceIn(self, ruleName, ruleIndex, inputSymbol)
+
+
+    def traceOut(self, ruleName, ruleIndex):
+        inputSymbol = "%s line=%d:%s" % (self.input.LT(1),
+                                         self.getLine(),
+                                         self.getCharPositionInLine()
+                                         )
+
+        BaseRecognizer.traceOut(self, ruleName, ruleIndex, inputSymbol)
+
+
+
+class Parser(BaseRecognizer):
+    """
+    @brief Baseclass for generated parser classes.
+    """
+
+    def __init__(self, lexer, state=None):
+        BaseRecognizer.__init__(self, state)
+
+        self.input = lexer
+
+
+    def reset(self):
+        BaseRecognizer.reset(self) # reset all recognizer state variables
+        if self.input is not None:
+            self.input.seek(0) # rewind the input
+
+
+    def getCurrentInputSymbol(self, input):
+        return input.LT(1)
+
+
+    def getMissingSymbol(self, input, e, expectedTokenType, follow):
+        if expectedTokenType == EOF:
+            tokenText = "<missing EOF>"
+        else:
+            tokenText = "<missing " + self.tokenNames[expectedTokenType] + ">"
+        t = CommonToken(type=expectedTokenType, text=tokenText)
+        current = input.LT(1)
+        if current.type == EOF:
+            current = input.LT(-1)
+
+        if current is not None:
+            t.line = current.line
+            t.charPositionInLine = current.charPositionInLine
+        t.channel = DEFAULT_CHANNEL
+        return t
+
+
+    def setTokenStream(self, input):
+        """Set the token stream and reset the parser"""
+
+        self.input = None
+        self.reset()
+        self.input = input
+
+
+    def getTokenStream(self):
+        return self.input
+
+
+    def getSourceName(self):
+        return self.input.getSourceName()
+
+
+    def traceIn(self, ruleName, ruleIndex):
+        BaseRecognizer.traceIn(self, ruleName, ruleIndex, self.input.LT(1))
+
+
+    def traceOut(self, ruleName, ruleIndex):
+        BaseRecognizer.traceOut(self, ruleName, ruleIndex, self.input.LT(1))
+
+
+class RuleReturnScope(object):
+    """
+    Rules can return start/stop info as well as possible trees and templates.
+    """
+
+    def getStart(self):
+        """Return the start token or tree."""
+        return None
+
+
+    def getStop(self):
+        """Return the stop token or tree."""
+        return None
+
+
+    def getTree(self):
+        """Has a value potentially if output=AST."""
+        return None
+
+
+    def getTemplate(self):
+        """Has a value potentially if output=template."""
+        return None
+
+
+class ParserRuleReturnScope(RuleReturnScope):
+    """
+    Rules that return more than a single value must return an object
+    containing all the values. Besides the properties defined in
+    RuleLabelScope.predefinedRulePropertiesScope there may be user-defined
+    return values. This class simply defines the minimum properties that
+    are always defined and methods to access the others that might be
+    available depending on output option such as template and tree.
+
+    Note text is not an actual property of the return value, it is computed
+    from start and stop using the input stream's toString() method.
+    I could add a ctor to this so that we can pass in and store the input
+    stream, but I'm not sure we want to do that. It would seem to be
+    undefined to get the .text property anyway if the rule matches tokens
+    from multiple input streams.
+
+    I do not use getters for fields of objects that are used simply to
+    group values such as this aggregate. The getters/setters are there to
+    satisfy the superclass interface.
+    """
+
+    def __init__(self):
+        self.start = None
+        self.stop = None
+        self.tree = None  # only used when output=AST
+
+
+    def getStart(self):
+        return self.start
+
+
+    def getStop(self):
+        return self.stop
+
+
+    def getTree(self):
+        return self.tree
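+
+
+# A minimal end-to-end sketch (illustrative; TLexer and TParser stand in
+# for classes ANTLR would generate from a grammar T with start rule
+# 'prog'):
+#
+#   from antlr3 import ANTLRStringStream, CommonTokenStream
+#
+#   char_stream = ANTLRStringStream('x = (3);')
+#   lexer = TLexer(char_stream)
+#   tokens = CommonTokenStream(lexer)
+#   parser = TParser(tokens)
+#   parser.prog()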