aboutsummaryrefslogtreecommitdiff
path: root/src/parser/strings.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/parser/strings.rs')
-rw-r--r--src/parser/strings.rs478
1 files changed, 478 insertions, 0 deletions
diff --git a/src/parser/strings.rs b/src/parser/strings.rs
new file mode 100644
index 0000000..26f9cc2
--- /dev/null
+++ b/src/parser/strings.rs
@@ -0,0 +1,478 @@
+use std::borrow::Cow;
+use std::char;
+use std::ops::RangeInclusive;
+
+use winnow::combinator::alt;
+use winnow::combinator::cut_err;
+use winnow::combinator::delimited;
+use winnow::combinator::fail;
+use winnow::combinator::opt;
+use winnow::combinator::peek;
+use winnow::combinator::preceded;
+use winnow::combinator::repeat;
+use winnow::combinator::success;
+use winnow::combinator::terminated;
+use winnow::prelude::*;
+use winnow::stream::Stream;
+use winnow::token::any;
+use winnow::token::none_of;
+use winnow::token::one_of;
+use winnow::token::tag;
+use winnow::token::take_while;
+use winnow::trace::trace;
+
+use crate::parser::errors::CustomError;
+use crate::parser::numbers::HEXDIG;
+use crate::parser::prelude::*;
+use crate::parser::trivia::{from_utf8_unchecked, newline, ws, ws_newlines, NON_ASCII, WSCHAR};
+
+// ;; String
+
+// string = ml-basic-string / basic-string / ml-literal-string / literal-string
+pub(crate) fn string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
+ trace(
+ "string",
+ alt((
+ ml_basic_string,
+ basic_string,
+ ml_literal_string,
+ literal_string.map(Cow::Borrowed),
+ )),
+ )
+ .parse_next(input)
+}
+
+// ;; Basic String
+
+// basic-string = quotation-mark *basic-char quotation-mark
+pub(crate) fn basic_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
+ trace("basic-string", |input: &mut Input<'i>| {
+ let _ = one_of(QUOTATION_MARK).parse_next(input)?;
+
+ let mut c = Cow::Borrowed("");
+ if let Some(ci) = opt(basic_chars).parse_next(input)? {
+ c = ci;
+ }
+ while let Some(ci) = opt(basic_chars).parse_next(input)? {
+ c.to_mut().push_str(&ci);
+ }
+
+ let _ = cut_err(one_of(QUOTATION_MARK))
+ .context(StrContext::Label("basic string"))
+ .parse_next(input)?;
+
+ Ok(c)
+ })
+ .parse_next(input)
+}
+
+// quotation-mark = %x22 ; "
+pub(crate) const QUOTATION_MARK: u8 = b'"';
+
+// basic-char = basic-unescaped / escaped
+fn basic_chars<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
+ alt((
+ // Deviate from the official grammar by batching the unescaped chars so we build a string a
+ // chunk at a time, rather than a `char` at a time.
+ take_while(1.., BASIC_UNESCAPED)
+ .try_map(std::str::from_utf8)
+ .map(Cow::Borrowed),
+ escaped.map(|c| Cow::Owned(String::from(c))),
+ ))
+ .parse_next(input)
+}
+
+// basic-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
+pub(crate) const BASIC_UNESCAPED: (
+ (u8, u8),
+ u8,
+ RangeInclusive<u8>,
+ RangeInclusive<u8>,
+ RangeInclusive<u8>,
+) = (WSCHAR, 0x21, 0x23..=0x5B, 0x5D..=0x7E, NON_ASCII);
+
+// escaped = escape escape-seq-char
+fn escaped(input: &mut Input<'_>) -> PResult<char> {
+ preceded(ESCAPE, escape_seq_char).parse_next(input)
+}
+
+// escape = %x5C ; \
+pub(crate) const ESCAPE: u8 = b'\\';
+
+// escape-seq-char = %x22 ; " quotation mark U+0022
+// escape-seq-char =/ %x5C ; \ reverse solidus U+005C
+// escape-seq-char =/ %x62 ; b backspace U+0008
+// escape-seq-char =/ %x66 ; f form feed U+000C
+// escape-seq-char =/ %x6E ; n line feed U+000A
+// escape-seq-char =/ %x72 ; r carriage return U+000D
+// escape-seq-char =/ %x74 ; t tab U+0009
+// escape-seq-char =/ %x75 4HEXDIG ; uXXXX U+XXXX
+// escape-seq-char =/ %x55 8HEXDIG ; UXXXXXXXX U+XXXXXXXX
+fn escape_seq_char(input: &mut Input<'_>) -> PResult<char> {
+ dispatch! {any;
+ b'b' => success('\u{8}'),
+ b'f' => success('\u{c}'),
+ b'n' => success('\n'),
+ b'r' => success('\r'),
+ b't' => success('\t'),
+ b'u' => cut_err(hexescape::<4>).context(StrContext::Label("unicode 4-digit hex code")),
+ b'U' => cut_err(hexescape::<8>).context(StrContext::Label("unicode 8-digit hex code")),
+ b'\\' => success('\\'),
+ b'"' => success('"'),
+ _ => {
+ cut_err(fail::<_, char, _>)
+ .context(StrContext::Label("escape sequence"))
+ .context(StrContext::Expected(StrContextValue::CharLiteral('b')))
+ .context(StrContext::Expected(StrContextValue::CharLiteral('f')))
+ .context(StrContext::Expected(StrContextValue::CharLiteral('n')))
+ .context(StrContext::Expected(StrContextValue::CharLiteral('r')))
+ .context(StrContext::Expected(StrContextValue::CharLiteral('t')))
+ .context(StrContext::Expected(StrContextValue::CharLiteral('u')))
+ .context(StrContext::Expected(StrContextValue::CharLiteral('U')))
+ .context(StrContext::Expected(StrContextValue::CharLiteral('\\')))
+ .context(StrContext::Expected(StrContextValue::CharLiteral('"')))
+ }
+ }
+ .parse_next(input)
+}
+
+pub(crate) fn hexescape<const N: usize>(input: &mut Input<'_>) -> PResult<char> {
+ take_while(0..=N, HEXDIG)
+ .verify(|b: &[u8]| b.len() == N)
+ .map(|b: &[u8]| unsafe { from_utf8_unchecked(b, "`is_ascii_digit` filters out on-ASCII") })
+ .verify_map(|s| u32::from_str_radix(s, 16).ok())
+ .try_map(|h| char::from_u32(h).ok_or(CustomError::OutOfRange))
+ .parse_next(input)
+}
+
+// ;; Multiline Basic String
+
+// ml-basic-string = ml-basic-string-delim [ newline ] ml-basic-body
+// ml-basic-string-delim
+fn ml_basic_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
+ trace(
+ "ml-basic-string",
+ delimited(
+ ML_BASIC_STRING_DELIM,
+ preceded(opt(newline), cut_err(ml_basic_body)),
+ cut_err(ML_BASIC_STRING_DELIM),
+ )
+ .context(StrContext::Label("multiline basic string")),
+ )
+ .parse_next(input)
+}
+
+// ml-basic-string-delim = 3quotation-mark
+pub(crate) const ML_BASIC_STRING_DELIM: &[u8] = b"\"\"\"";
+
+// ml-basic-body = *mlb-content *( mlb-quotes 1*mlb-content ) [ mlb-quotes ]
+fn ml_basic_body<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
+ let mut c = Cow::Borrowed("");
+ if let Some(ci) = opt(mlb_content).parse_next(input)? {
+ c = ci;
+ }
+ while let Some(ci) = opt(mlb_content).parse_next(input)? {
+ c.to_mut().push_str(&ci);
+ }
+
+ while let Some(qi) = opt(mlb_quotes(none_of(b'\"').value(()))).parse_next(input)? {
+ if let Some(ci) = opt(mlb_content).parse_next(input)? {
+ c.to_mut().push_str(qi);
+ c.to_mut().push_str(&ci);
+ while let Some(ci) = opt(mlb_content).parse_next(input)? {
+ c.to_mut().push_str(&ci);
+ }
+ } else {
+ break;
+ }
+ }
+
+ if let Some(qi) = opt(mlb_quotes(tag(ML_BASIC_STRING_DELIM).value(()))).parse_next(input)? {
+ c.to_mut().push_str(qi);
+ }
+
+ Ok(c)
+}
+
+// mlb-content = mlb-char / newline / mlb-escaped-nl
+// mlb-char = mlb-unescaped / escaped
+fn mlb_content<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
+ alt((
+ // Deviate from the official grammar by batching the unescaped chars so we build a string a
+ // chunk at a time, rather than a `char` at a time.
+ take_while(1.., MLB_UNESCAPED)
+ .try_map(std::str::from_utf8)
+ .map(Cow::Borrowed),
+ // Order changed fromg grammar so `escaped` can more easily `cut_err` on bad escape sequences
+ mlb_escaped_nl.map(|_| Cow::Borrowed("")),
+ escaped.map(|c| Cow::Owned(String::from(c))),
+ newline.map(|_| Cow::Borrowed("\n")),
+ ))
+ .parse_next(input)
+}
+
+// mlb-quotes = 1*2quotation-mark
+fn mlb_quotes<'i>(
+ mut term: impl winnow::Parser<Input<'i>, (), ContextError>,
+) -> impl Parser<Input<'i>, &'i str, ContextError> {
+ move |input: &mut Input<'i>| {
+ let start = input.checkpoint();
+ let res = terminated(b"\"\"", peek(term.by_ref()))
+ .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
+ .parse_next(input);
+
+ match res {
+ Err(winnow::error::ErrMode::Backtrack(_)) => {
+ input.reset(start);
+ terminated(b"\"", peek(term.by_ref()))
+ .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
+ .parse_next(input)
+ }
+ res => res,
+ }
+ }
+}
+
+// mlb-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
+pub(crate) const MLB_UNESCAPED: (
+ (u8, u8),
+ u8,
+ RangeInclusive<u8>,
+ RangeInclusive<u8>,
+ RangeInclusive<u8>,
+) = (WSCHAR, 0x21, 0x23..=0x5B, 0x5D..=0x7E, NON_ASCII);
+
+// mlb-escaped-nl = escape ws newline *( wschar / newline
+// When the last non-whitespace character on a line is a \,
+// it will be trimmed along with all whitespace
+// (including newlines) up to the next non-whitespace
+// character or closing delimiter.
+fn mlb_escaped_nl(input: &mut Input<'_>) -> PResult<()> {
+ repeat(1.., (ESCAPE, ws, ws_newlines))
+ .map(|()| ())
+ .value(())
+ .parse_next(input)
+}
+
+// ;; Literal String
+
+// literal-string = apostrophe *literal-char apostrophe
+pub(crate) fn literal_string<'i>(input: &mut Input<'i>) -> PResult<&'i str> {
+ trace(
+ "literal-string",
+ delimited(
+ APOSTROPHE,
+ cut_err(take_while(0.., LITERAL_CHAR)),
+ cut_err(APOSTROPHE),
+ )
+ .try_map(std::str::from_utf8)
+ .context(StrContext::Label("literal string")),
+ )
+ .parse_next(input)
+}
+
+// apostrophe = %x27 ; ' apostrophe
+pub(crate) const APOSTROPHE: u8 = b'\'';
+
+// literal-char = %x09 / %x20-26 / %x28-7E / non-ascii
+pub(crate) const LITERAL_CHAR: (
+ u8,
+ RangeInclusive<u8>,
+ RangeInclusive<u8>,
+ RangeInclusive<u8>,
+) = (0x9, 0x20..=0x26, 0x28..=0x7E, NON_ASCII);
+
+// ;; Multiline Literal String
+
+// ml-literal-string = ml-literal-string-delim [ newline ] ml-literal-body
+// ml-literal-string-delim
+fn ml_literal_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
+ trace(
+ "ml-literal-string",
+ delimited(
+ (ML_LITERAL_STRING_DELIM, opt(newline)),
+ cut_err(ml_literal_body.map(|t| {
+ if t.contains("\r\n") {
+ Cow::Owned(t.replace("\r\n", "\n"))
+ } else {
+ Cow::Borrowed(t)
+ }
+ })),
+ cut_err(ML_LITERAL_STRING_DELIM),
+ )
+ .context(StrContext::Label("multiline literal string")),
+ )
+ .parse_next(input)
+}
+
+// ml-literal-string-delim = 3apostrophe
+pub(crate) const ML_LITERAL_STRING_DELIM: &[u8] = b"'''";
+
+// ml-literal-body = *mll-content *( mll-quotes 1*mll-content ) [ mll-quotes ]
+fn ml_literal_body<'i>(input: &mut Input<'i>) -> PResult<&'i str> {
+ (
+ repeat(0.., mll_content).map(|()| ()),
+ repeat(
+ 0..,
+ (
+ mll_quotes(none_of(APOSTROPHE).value(())),
+ repeat(1.., mll_content).map(|()| ()),
+ ),
+ )
+ .map(|()| ()),
+ opt(mll_quotes(tag(ML_LITERAL_STRING_DELIM).value(()))),
+ )
+ .recognize()
+ .try_map(std::str::from_utf8)
+ .parse_next(input)
+}
+
+// mll-content = mll-char / newline
+fn mll_content(input: &mut Input<'_>) -> PResult<u8> {
+ alt((one_of(MLL_CHAR), newline)).parse_next(input)
+}
+
+// mll-char = %x09 / %x20-26 / %x28-7E / non-ascii
+const MLL_CHAR: (
+ u8,
+ RangeInclusive<u8>,
+ RangeInclusive<u8>,
+ RangeInclusive<u8>,
+) = (0x9, 0x20..=0x26, 0x28..=0x7E, NON_ASCII);
+
+// mll-quotes = 1*2apostrophe
+fn mll_quotes<'i>(
+ mut term: impl winnow::Parser<Input<'i>, (), ContextError>,
+) -> impl Parser<Input<'i>, &'i str, ContextError> {
+ move |input: &mut Input<'i>| {
+ let start = input.checkpoint();
+ let res = terminated(b"''", peek(term.by_ref()))
+ .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
+ .parse_next(input);
+
+ match res {
+ Err(winnow::error::ErrMode::Backtrack(_)) => {
+ input.reset(start);
+ terminated(b"'", peek(term.by_ref()))
+ .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
+ .parse_next(input)
+ }
+ res => res,
+ }
+ }
+}
+
+#[cfg(test)]
+mod test {
+ use super::*;
+
+ #[test]
+ fn basic_string() {
+ let input =
+ r#""I'm a string. \"You can quote me\". Name\tJos\u00E9\nLocation\tSF. \U0002070E""#;
+ let expected = "I\'m a string. \"You can quote me\". Name\tJosé\nLocation\tSF. \u{2070E}";
+ let parsed = string.parse(new_input(input));
+ assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
+ }
+
+ #[test]
+ fn ml_basic_string() {
+ let cases = [
+ (
+ r#""""
+Roses are red
+Violets are blue""""#,
+ r#"Roses are red
+Violets are blue"#,
+ ),
+ (r#"""" \""" """"#, " \"\"\" "),
+ (r#"""" \\""""#, " \\"),
+ ];
+
+ for &(input, expected) in &cases {
+ let parsed = string.parse(new_input(input));
+ assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
+ }
+
+ let invalid_cases = [r#"""" """#, r#"""" \""""#];
+
+ for input in &invalid_cases {
+ let parsed = string.parse(new_input(input));
+ assert!(parsed.is_err());
+ }
+ }
+
+ #[test]
+ fn ml_basic_string_escape_ws() {
+ let inputs = [
+ r#""""
+The quick brown \
+
+
+ fox jumps over \
+ the lazy dog.""""#,
+ r#""""\
+ The quick brown \
+ fox jumps over \
+ the lazy dog.\
+ """"#,
+ ];
+ for input in &inputs {
+ let expected = "The quick brown fox jumps over the lazy dog.";
+ let parsed = string.parse(new_input(input));
+ assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
+ }
+ let empties = [
+ r#""""\
+ """"#,
+ r#""""
+\
+ \
+""""#,
+ ];
+ for input in &empties {
+ let expected = "";
+ let parsed = string.parse(new_input(input));
+ assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
+ }
+ }
+
+ #[test]
+ fn literal_string() {
+ let inputs = [
+ r#"'C:\Users\nodejs\templates'"#,
+ r#"'\\ServerX\admin$\system32\'"#,
+ r#"'Tom "Dubs" Preston-Werner'"#,
+ r#"'<\i\c*\s*>'"#,
+ ];
+
+ for input in &inputs {
+ let expected = &input[1..input.len() - 1];
+ let parsed = string.parse(new_input(input));
+ assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
+ }
+ }
+
+ #[test]
+ fn ml_literal_string() {
+ let inputs = [
+ r#"'''I [dw]on't need \d{2} apples'''"#,
+ r#"''''one_quote''''"#,
+ ];
+ for input in &inputs {
+ let expected = &input[3..input.len() - 3];
+ let parsed = string.parse(new_input(input));
+ assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
+ }
+
+ let input = r#"'''
+The first newline is
+trimmed in raw strings.
+ All other whitespace
+ is preserved.
+'''"#;
+ let expected = &input[4..input.len() - 3];
+ let parsed = string.parse(new_input(input));
+ assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
+ }
+}