diff options
Diffstat (limited to 'pycparser/c_lexer.py')
-rw-r--r-- | pycparser/c_lexer.py | 46 |
1 files changed, 38 insertions, 8 deletions
diff --git a/pycparser/c_lexer.py b/pycparser/c_lexer.py index de8445e..045d24e 100644 --- a/pycparser/c_lexer.py +++ b/pycparser/c_lexer.py @@ -19,7 +19,7 @@ class CLexer(object): tokens. The public attribute filename can be set to an initial - filaneme, but the lexer will update it upon #line + filename, but the lexer will update it upon #line directives. """ def __init__(self, error_func, on_lbrace_func, on_rbrace_func, @@ -130,7 +130,7 @@ class CLexer(object): 'TYPEID', # constants - 'INT_CONST_DEC', 'INT_CONST_OCT', 'INT_CONST_HEX', 'INT_CONST_BIN', + 'INT_CONST_DEC', 'INT_CONST_OCT', 'INT_CONST_HEX', 'INT_CONST_BIN', 'INT_CONST_CHAR', 'FLOAT_CONST', 'HEX_FLOAT_CONST', 'CHAR_CONST', 'WCHAR_CONST', @@ -205,23 +205,49 @@ class CLexer(object): # parse all correct code, even if it means to sometimes parse incorrect # code. # - simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])""" - decimal_escape = r"""(\d+)""" - hex_escape = r"""(x[0-9a-fA-F]+)""" - bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])""" + # The original regexes were taken verbatim from the C syntax definition, + # and were later modified to avoid worst-case exponential running time. + # + # simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])""" + # decimal_escape = r"""(\d+)""" + # hex_escape = r"""(x[0-9a-fA-F]+)""" + # bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])""" + # + # The following modifications were made to avoid the ambiguity that allowed backtracking: + # (https://github.com/eliben/pycparser/issues/61) + # + # - \x was removed from simple_escape, unless it was not followed by a hex digit, to avoid ambiguity with hex_escape. + # - hex_escape allows one or more hex characters, but requires that the next character(if any) is not hex + # - decimal_escape allows one or more decimal characters, but requires that the next character(if any) is not a decimal + # - bad_escape does not allow any decimals (8-9), to avoid conflicting with the permissive decimal_escape. + # + # Without this change, python's `re` module would recursively try parsing each ambiguous escape sequence in multiple ways. + # e.g. `\123` could be parsed as `\1`+`23`, `\12`+`3`, and `\123`. + + simple_escape = r"""([a-wyzA-Z._~!=&\^\-\\?'"]|x(?![0-9a-fA-F]))""" + decimal_escape = r"""(\d+)(?!\d)""" + hex_escape = r"""(x[0-9a-fA-F]+)(?![0-9a-fA-F])""" + bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-9])""" escape_sequence = r"""(\\("""+simple_escape+'|'+decimal_escape+'|'+hex_escape+'))' + + # This complicated regex with lookahead might be slow for strings, so because all of the valid escapes (including \x) allowed + # 0 or more non-escaped characters after the first character, simple_escape+decimal_escape+hex_escape got simplified to + + escape_sequence_start_in_string = r"""(\\[0-9a-zA-Z._~!=&\^\-\\?'"])""" + cconst_char = r"""([^'\\\n]|"""+escape_sequence+')' char_const = "'"+cconst_char+"'" wchar_const = 'L'+char_const + multicharacter_constant = "'"+cconst_char+"{2,4}'" unmatched_quote = "('"+cconst_char+"*\\n)|('"+cconst_char+"*$)" bad_char_const = r"""('"""+cconst_char+"""[^'\n]+')|('')|('"""+bad_escape+r"""[^'\n]*')""" # string literals (K&R2: A.2.6) - string_char = r"""([^"\\\n]|"""+escape_sequence+')' + string_char = r"""([^"\\\n]|"""+escape_sequence_start_in_string+')' string_literal = '"'+string_char+'*"' wstring_literal = 'L'+string_literal - bad_string_literal = '"'+string_char+'*?'+bad_escape+string_char+'*"' + bad_string_literal = '"'+string_char+'*'+bad_escape+string_char+'*"' # floating constants (K&R2: A.2.5.3) exponent_part = r"""([eE][-+]?[0-9]+)""" @@ -443,6 +469,10 @@ class CLexer(object): # Must come before bad_char_const, to prevent it from # catching valid char constants as invalid # + @TOKEN(multicharacter_constant) + def t_INT_CONST_CHAR(self, t): + return t + @TOKEN(char_const) def t_CHAR_CONST(self, t): return t |