diff options
Diffstat (limited to 'src/parser/trivia.rs')
-rw-r--r-- | src/parser/trivia.rs | 156 |
1 files changed, 156 insertions, 0 deletions
diff --git a/src/parser/trivia.rs b/src/parser/trivia.rs new file mode 100644 index 0000000..a359805 --- /dev/null +++ b/src/parser/trivia.rs @@ -0,0 +1,156 @@ +use std::ops::RangeInclusive; + +use winnow::combinator::alt; +use winnow::combinator::eof; +use winnow::combinator::opt; +use winnow::combinator::repeat; +use winnow::combinator::terminated; +use winnow::prelude::*; +use winnow::token::one_of; +use winnow::token::take_while; + +use crate::parser::prelude::*; + +pub(crate) unsafe fn from_utf8_unchecked<'b>( + bytes: &'b [u8], + safety_justification: &'static str, +) -> &'b str { + if cfg!(debug_assertions) { + // Catch problems more quickly when testing + std::str::from_utf8(bytes).expect(safety_justification) + } else { + std::str::from_utf8_unchecked(bytes) + } +} + +// wschar = ( %x20 / ; Space +// %x09 ) ; Horizontal tab +pub(crate) const WSCHAR: (u8, u8) = (b' ', b'\t'); + +// ws = *wschar +pub(crate) fn ws<'i>(input: &mut Input<'i>) -> PResult<&'i str> { + take_while(0.., WSCHAR) + .map(|b| unsafe { from_utf8_unchecked(b, "`is_wschar` filters out on-ASCII") }) + .parse_next(input) +} + +// non-ascii = %x80-D7FF / %xE000-10FFFF +// - ASCII is 0xxxxxxx +// - First byte for UTF-8 is 11xxxxxx +// - Subsequent UTF-8 bytes are 10xxxxxx +pub(crate) const NON_ASCII: RangeInclusive<u8> = 0x80..=0xff; + +// non-eol = %x09 / %x20-7E / non-ascii +pub(crate) const NON_EOL: (u8, RangeInclusive<u8>, RangeInclusive<u8>) = + (0x09, 0x20..=0x7E, NON_ASCII); + +// comment-start-symbol = %x23 ; # +pub(crate) const COMMENT_START_SYMBOL: u8 = b'#'; + +// comment = comment-start-symbol *non-eol +pub(crate) fn comment<'i>(input: &mut Input<'i>) -> PResult<&'i [u8]> { + (COMMENT_START_SYMBOL, take_while(0.., NON_EOL)) + .recognize() + .parse_next(input) +} + +// newline = ( %x0A / ; LF +// %x0D.0A ) ; CRLF +pub(crate) fn newline(input: &mut Input<'_>) -> PResult<u8> { + alt(( + one_of(LF).value(b'\n'), + (one_of(CR), one_of(LF)).value(b'\n'), + )) + .parse_next(input) +} +pub(crate) const LF: u8 = b'\n'; +pub(crate) const CR: u8 = b'\r'; + +// ws-newline = *( wschar / newline ) +pub(crate) fn ws_newline<'i>(input: &mut Input<'i>) -> PResult<&'i str> { + repeat( + 0.., + alt((newline.value(&b"\n"[..]), take_while(1.., WSCHAR))), + ) + .map(|()| ()) + .recognize() + .map(|b| unsafe { from_utf8_unchecked(b, "`is_wschar` and `newline` filters out on-ASCII") }) + .parse_next(input) +} + +// ws-newlines = newline *( wschar / newline ) +pub(crate) fn ws_newlines<'i>(input: &mut Input<'i>) -> PResult<&'i str> { + (newline, ws_newline) + .recognize() + .map(|b| unsafe { + from_utf8_unchecked(b, "`is_wschar` and `newline` filters out on-ASCII") + }) + .parse_next(input) +} + +// note: this rule is not present in the original grammar +// ws-comment-newline = *( ws-newline-nonempty / comment ) +pub(crate) fn ws_comment_newline<'i>(input: &mut Input<'i>) -> PResult<&'i [u8]> { + repeat( + 0.., + alt(( + repeat( + 1.., + alt((take_while(1.., WSCHAR), newline.value(&b"\n"[..]))), + ) + .map(|()| ()), + comment.value(()), + )), + ) + .map(|()| ()) + .recognize() + .parse_next(input) +} + +// note: this rule is not present in the original grammar +// line-ending = newline / eof +pub(crate) fn line_ending<'i>(input: &mut Input<'i>) -> PResult<&'i str> { + alt((newline.value("\n"), eof.value(""))).parse_next(input) +} + +// note: this rule is not present in the original grammar +// line-trailing = ws [comment] skip-line-ending +pub(crate) fn line_trailing(input: &mut Input<'_>) -> PResult<std::ops::Range<usize>> { + terminated((ws, opt(comment)).span(), line_ending).parse_next(input) +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn trivia() { + let inputs = [ + "", + r#" "#, + r#" +"#, + r#" +# comment + +# comment2 + + +"#, + r#" + "#, + r#"# comment +# comment2 + + + "#, + ]; + for input in inputs { + dbg!(input); + let parsed = ws_comment_newline.parse(new_input(input)); + assert!(parsed.is_ok(), "{:?}", parsed); + let parsed = parsed.unwrap(); + assert_eq!(parsed, input.as_bytes()); + } + } +} |