diff options
author | Tyson Andre <tysonandre775@hotmail.com> | 2019-08-26 17:18:38 -0400 |
---|---|---|
committer | Eli Bendersky <eliben@users.noreply.github.com> | 2019-08-26 14:18:38 -0700 |
commit | 62ee4ba5fbe58f469c72e7b5b02e88584577a147 (patch) | |
tree | 47a09a14d4eae3ccec918e9474f7c6560e8a4795 | |
parent | 5d5904d2538e054356ac01ba9ef965783f73e36b (diff) | |
download | pycparser-62ee4ba5fbe58f469c72e7b5b02e88584577a147.tar.gz |
Fix slow backtracking when parsing strings (no external deps) (#347)
* Fix slow backtracking when parsing strings (no external deps)
Fixes #61
This uses negative lookaheads to avoid ambiguity in how string should be
parsed by the regex.
- https://docs.python.org/2/library/re.html#regular-expression-syntax
- Previously, if it didn't immediately succeed at parsing an escape
sequence such as `\123`, it would have to try `\1`+`23`, `\12` + `3`,
and `\123`, which multiplied the time taken by 3 per additional escape
sequence. This solves that by only allowing `\123`
- The same fix was added for hex escapes.
Also fix a test that relied on the incorrect handling of regexes.
The implementation documentation says that it intends to allow
**decimal** escapes permissively.
* WIP debug
* Fix ambiguity caused by allowing #path directives
Solve this by allowing "\x" when not followed by hex,
in the regular string literal.
In the previous commits,
`\x12` could be parsed both as `\x`+`12` and `\x12`,
which caused exponential options for backtracking.
* Document changes to lexer, remove debug code
* Optimize this for strings
-rw-r--r-- | pycparser/c_lexer.py | 39 | ||||
-rw-r--r-- | tests/test_c_lexer.py | 31 |
2 files changed, 62 insertions, 8 deletions
diff --git a/pycparser/c_lexer.py b/pycparser/c_lexer.py index de8445e..371e996 100644 --- a/pycparser/c_lexer.py +++ b/pycparser/c_lexer.py @@ -19,7 +19,7 @@ class CLexer(object): tokens. The public attribute filename can be set to an initial - filaneme, but the lexer will update it upon #line + filename, but the lexer will update it upon #line directives. """ def __init__(self, error_func, on_lbrace_func, on_rbrace_func, @@ -205,12 +205,37 @@ class CLexer(object): # parse all correct code, even if it means to sometimes parse incorrect # code. # - simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])""" - decimal_escape = r"""(\d+)""" - hex_escape = r"""(x[0-9a-fA-F]+)""" - bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])""" + # The original regexes were taken verbatim from the C syntax definition, + # and were later modified to avoid worst-case exponential running time. + # + # simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])""" + # decimal_escape = r"""(\d+)""" + # hex_escape = r"""(x[0-9a-fA-F]+)""" + # bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])""" + # + # The following modifications were made to avoid the ambiguity that allowed backtracking: + # (https://github.com/eliben/pycparser/issues/61) + # + # - \x was removed from simple_escape, unless it was not followed by a hex digit, to avoid ambiguity with hex_escape. + # - hex_escape allows one or more hex characters, but requires that the next character(if any) is not hex + # - decimal_escape allows one or more decimal characters, but requires that the next character(if any) is not a decimal + # - bad_escape does not allow any decimals (8-9), to avoid conflicting with the permissive decimal_escape. + # + # Without this change, python's `re` module would recursively try parsing each ambiguous escape sequence in multiple ways. + # e.g. `\123` could be parsed as `\1`+`23`, `\12`+`3`, and `\123`. + + simple_escape = r"""([a-wyzA-Z._~!=&\^\-\\?'"]|x(?![0-9a-fA-F]))""" + decimal_escape = r"""(\d+)(?!\d)""" + hex_escape = r"""(x[0-9a-fA-F]+)(?![0-9a-fA-F])""" + bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-9])""" escape_sequence = r"""(\\("""+simple_escape+'|'+decimal_escape+'|'+hex_escape+'))' + + # This complicated regex with lookahead might be slow for strings, so because all of the valid escapes (including \x) allowed + # 0 or more non-escaped characters after the first character, simple_escape+decimal_escape+hex_escape got simplified to + + escape_sequence_start_in_string = r"""(\\[0-9a-zA-Z._~!=&\^\-\\?'"])""" + cconst_char = r"""([^'\\\n]|"""+escape_sequence+')' char_const = "'"+cconst_char+"'" wchar_const = 'L'+char_const @@ -218,10 +243,10 @@ class CLexer(object): bad_char_const = r"""('"""+cconst_char+"""[^'\n]+')|('')|('"""+bad_escape+r"""[^'\n]*')""" # string literals (K&R2: A.2.6) - string_char = r"""([^"\\\n]|"""+escape_sequence+')' + string_char = r"""([^"\\\n]|"""+escape_sequence_start_in_string+')' string_literal = '"'+string_char+'*"' wstring_literal = 'L'+string_literal - bad_string_literal = '"'+string_char+'*?'+bad_escape+string_char+'*"' + bad_string_literal = '"'+string_char+'*'+bad_escape+string_char+'*"' # floating constants (K&R2: A.2.5.3) exponent_part = r"""([eE][-+]?[0-9]+)""" diff --git a/tests/test_c_lexer.py b/tests/test_c_lexer.py index 11c7b26..3a70c18 100644 --- a/tests/test_c_lexer.py +++ b/tests/test_c_lexer.py @@ -116,6 +116,7 @@ class TestCLexerNoErrors(unittest.TestCase): self.assertTokensTypes(r"""'\t'""", ['CHAR_CONST']) self.assertTokensTypes(r"""'\''""", ['CHAR_CONST']) self.assertTokensTypes(r"""'\?'""", ['CHAR_CONST']) + self.assertTokensTypes(r"""'\0'""", ['CHAR_CONST']) self.assertTokensTypes(r"""'\012'""", ['CHAR_CONST']) self.assertTokensTypes(r"""'\x2f'""", ['CHAR_CONST']) self.assertTokensTypes(r"""'\x2f12'""", ['CHAR_CONST']) @@ -149,6 +150,24 @@ class TestCLexerNoErrors(unittest.TestCase): self.assertTokensTypes( '"\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123"', ['STRING_LITERAL']) + # Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line + # directives with Windows paths as filenames (..\..\dir\file) + self.assertTokensTypes( + r'"\x"', + ['STRING_LITERAL']) + self.assertTokensTypes( + r'"\a\b\c\d\e\f\g\h\i\j\k\l\m\n\o\p\q\r\s\t\u\v\w\x\y\z\A\B\C\D\E\F\G\H\I\J\K\L\M\N\O\P\Q\R\S\T\U\V\W\X\Y\Z"', + ['STRING_LITERAL']) + self.assertTokensTypes( + r'"C:\x\fa\x1e\xited"', + ['STRING_LITERAL']) + # The lexer is permissive and allows decimal escapes (not just octal) + self.assertTokensTypes( + '"jx\9"', + ['STRING_LITERAL']) + self.assertTokensTypes( + '"fo\9999999"', + ['STRING_LITERAL']) def test_mess(self): self.assertTokensTypes( @@ -428,14 +447,24 @@ class TestCLexerErrors(unittest.TestCase): def test_char_constants(self): self.assertLexerError("'", ERR_UNMATCHED_QUOTE) self.assertLexerError("'b\n", ERR_UNMATCHED_QUOTE) + self.assertLexerError("'\\xaa\n'", ERR_UNMATCHED_QUOTE) + self.assertLexerError(r"'\12a'", ERR_INVALID_CCONST) + self.assertLexerError(r"'\xabg'", ERR_INVALID_CCONST) + self.assertLexerError("''", ERR_INVALID_CCONST) self.assertLexerError("'jx'", ERR_INVALID_CCONST) self.assertLexerError(r"'\*'", ERR_INVALID_CCONST) def test_string_literals(self): - self.assertLexerError(r'"jx\9"', ERR_STRING_ESCAPE) + self.assertLexerError(r'"jx\`"', ERR_STRING_ESCAPE) self.assertLexerError(r'"hekllo\* on ix"', ERR_STRING_ESCAPE) self.assertLexerError(r'L"hekllo\* on ix"', ERR_STRING_ESCAPE) + # Should not suffer from slow backtracking + self.assertLexerError(r'"\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\`\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123"', ERR_STRING_ESCAPE) + self.assertLexerError(r'"\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\x23\`\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23"', ERR_STRING_ESCAPE) + # Should not suffer from slow backtracking when there's no end quote + self.assertLexerError(r'"\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\`\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\12\123456', ERR_ILLEGAL_CHAR) + self.assertLexerError(r'"\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\`\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x2\x23456', ERR_ILLEGAL_CHAR) def test_preprocessor(self): self.assertLexerError('#line "ka"', ERR_FILENAME_BEFORE_LINE) |