aboutsummaryrefslogtreecommitdiff
path: root/pycparser/c_lexer.py
diff options
context:
space:
mode:
Diffstat (limited to 'pycparser/c_lexer.py')
-rw-r--r--pycparser/c_lexer.py46
1 files changed, 38 insertions, 8 deletions
diff --git a/pycparser/c_lexer.py b/pycparser/c_lexer.py
index de8445e..045d24e 100644
--- a/pycparser/c_lexer.py
+++ b/pycparser/c_lexer.py
@@ -19,7 +19,7 @@ class CLexer(object):
tokens.
The public attribute filename can be set to an initial
- filaneme, but the lexer will update it upon #line
+ filename, but the lexer will update it upon #line
directives.
"""
def __init__(self, error_func, on_lbrace_func, on_rbrace_func,
@@ -130,7 +130,7 @@ class CLexer(object):
'TYPEID',
# constants
- 'INT_CONST_DEC', 'INT_CONST_OCT', 'INT_CONST_HEX', 'INT_CONST_BIN',
+ 'INT_CONST_DEC', 'INT_CONST_OCT', 'INT_CONST_HEX', 'INT_CONST_BIN', 'INT_CONST_CHAR',
'FLOAT_CONST', 'HEX_FLOAT_CONST',
'CHAR_CONST',
'WCHAR_CONST',
@@ -205,23 +205,49 @@ class CLexer(object):
# parse all correct code, even if it means to sometimes parse incorrect
# code.
#
- simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
- decimal_escape = r"""(\d+)"""
- hex_escape = r"""(x[0-9a-fA-F]+)"""
- bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""
+ # The original regexes were taken verbatim from the C syntax definition,
+ # and were later modified to avoid worst-case exponential running time.
+ #
+ # simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
+ # decimal_escape = r"""(\d+)"""
+ # hex_escape = r"""(x[0-9a-fA-F]+)"""
+ # bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""
+ #
+ # The following modifications were made to avoid the ambiguity that allowed backtracking:
+ # (https://github.com/eliben/pycparser/issues/61)
+ #
+ # - \x was removed from simple_escape, unless it was not followed by a hex digit, to avoid ambiguity with hex_escape.
+ # - hex_escape allows one or more hex characters, but requires that the next character(if any) is not hex
+ # - decimal_escape allows one or more decimal characters, but requires that the next character(if any) is not a decimal
+ # - bad_escape does not allow any decimals (8-9), to avoid conflicting with the permissive decimal_escape.
+ #
+ # Without this change, python's `re` module would recursively try parsing each ambiguous escape sequence in multiple ways.
+ # e.g. `\123` could be parsed as `\1`+`23`, `\12`+`3`, and `\123`.
+
+ simple_escape = r"""([a-wyzA-Z._~!=&\^\-\\?'"]|x(?![0-9a-fA-F]))"""
+ decimal_escape = r"""(\d+)(?!\d)"""
+ hex_escape = r"""(x[0-9a-fA-F]+)(?![0-9a-fA-F])"""
+ bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-9])"""
escape_sequence = r"""(\\("""+simple_escape+'|'+decimal_escape+'|'+hex_escape+'))'
+
+ # This complicated regex with lookahead might be slow for strings, so because all of the valid escapes (including \x) allowed
+ # 0 or more non-escaped characters after the first character, simple_escape+decimal_escape+hex_escape got simplified to
+
+ escape_sequence_start_in_string = r"""(\\[0-9a-zA-Z._~!=&\^\-\\?'"])"""
+
cconst_char = r"""([^'\\\n]|"""+escape_sequence+')'
char_const = "'"+cconst_char+"'"
wchar_const = 'L'+char_const
+ multicharacter_constant = "'"+cconst_char+"{2,4}'"
unmatched_quote = "('"+cconst_char+"*\\n)|('"+cconst_char+"*$)"
bad_char_const = r"""('"""+cconst_char+"""[^'\n]+')|('')|('"""+bad_escape+r"""[^'\n]*')"""
# string literals (K&R2: A.2.6)
- string_char = r"""([^"\\\n]|"""+escape_sequence+')'
+ string_char = r"""([^"\\\n]|"""+escape_sequence_start_in_string+')'
string_literal = '"'+string_char+'*"'
wstring_literal = 'L'+string_literal
- bad_string_literal = '"'+string_char+'*?'+bad_escape+string_char+'*"'
+ bad_string_literal = '"'+string_char+'*'+bad_escape+string_char+'*"'
# floating constants (K&R2: A.2.5.3)
exponent_part = r"""([eE][-+]?[0-9]+)"""
@@ -443,6 +469,10 @@ class CLexer(object):
# Must come before bad_char_const, to prevent it from
# catching valid char constants as invalid
#
+ @TOKEN(multicharacter_constant)
+ def t_INT_CONST_CHAR(self, t):
+ return t
+
@TOKEN(char_const)
def t_CHAR_CONST(self, t):
return t