diff options
Diffstat (limited to 'examples/ndjson/parser.rs')
-rw-r--r-- | examples/ndjson/parser.rs | 338 |
1 files changed, 338 insertions, 0 deletions
diff --git a/examples/ndjson/parser.rs b/examples/ndjson/parser.rs new file mode 100644 index 0000000..aaa5c93 --- /dev/null +++ b/examples/ndjson/parser.rs @@ -0,0 +1,338 @@ +use std::collections::HashMap; +use std::str; + +use winnow::prelude::*; +use winnow::{ + ascii::float, + ascii::line_ending, + combinator::alt, + combinator::cut_err, + combinator::{delimited, preceded, separated_pair, terminated}, + combinator::{fold_repeat, separated0}, + error::{AddContext, ParserError}, + stream::Partial, + token::{any, none_of, take, take_while}, +}; + +#[derive(Debug, PartialEq, Clone)] +pub enum JsonValue { + Null, + Boolean(bool), + Str(String), + Num(f64), + Array(Vec<JsonValue>), + Object(HashMap<String, JsonValue>), +} + +/// Use `Partial` to cause `ErrMode::Incomplete` while parsing +pub type Stream<'i> = Partial<&'i str>; + +pub fn ndjson<'i, E: ParserError<Stream<'i>> + AddContext<Stream<'i>, &'static str>>( + input: &mut Stream<'i>, +) -> PResult<Option<JsonValue>, E> { + alt(( + terminated(delimited(ws, json_value, ws), line_ending).map(Some), + line_ending.value(None), + )) + .parse_next(input) +} + +// --Besides `WS`, same as a regular json parser ---------------------------- + +/// `alt` is a combinator that tries multiple parsers one by one, until +/// one of them succeeds +fn json_value<'i, E: ParserError<Stream<'i>> + AddContext<Stream<'i>, &'static str>>( + input: &mut Stream<'i>, +) -> PResult<JsonValue, E> { + // `alt` combines the each value parser. It returns the result of the first + // successful parser, or an error + alt(( + null.value(JsonValue::Null), + boolean.map(JsonValue::Boolean), + string.map(JsonValue::Str), + float.map(JsonValue::Num), + array.map(JsonValue::Array), + object.map(JsonValue::Object), + )) + .parse_next(input) +} + +/// `tag(string)` generates a parser that recognizes the argument string. +/// +/// This also shows returning a sub-slice of the original input +fn null<'i, E: ParserError<Stream<'i>>>(input: &mut Stream<'i>) -> PResult<&'i str, E> { + // This is a parser that returns `"null"` if it sees the string "null", and + // an error otherwise + "null".parse_next(input) +} + +/// We can combine `tag` with other functions, like `value` which returns a given constant value on +/// success. +fn boolean<'i, E: ParserError<Stream<'i>>>(input: &mut Stream<'i>) -> PResult<bool, E> { + // This is a parser that returns `true` if it sees the string "true", and + // an error otherwise + let parse_true = "true".value(true); + + // This is a parser that returns `false` if it sees the string "false", and + // an error otherwise + let parse_false = "false".value(false); + + alt((parse_true, parse_false)).parse_next(input) +} + +/// This parser gathers all `char`s up into a `String`with a parse to recognize the double quote +/// character, before the string (using `preceded`) and after the string (using `terminated`). +fn string<'i, E: ParserError<Stream<'i>> + AddContext<Stream<'i>, &'static str>>( + input: &mut Stream<'i>, +) -> PResult<String, E> { + preceded( + '\"', + // `cut_err` transforms an `ErrMode::Backtrack(e)` to `ErrMode::Cut(e)`, signaling to + // combinators like `alt` that they should not try other parsers. We were in the + // right branch (since we found the `"` character) but encountered an error when + // parsing the string + cut_err(terminated( + fold_repeat(0.., character, String::new, |mut string, c| { + string.push(c); + string + }), + '\"', + )), + ) + // `context` lets you add a static string to errors to provide more information in the + // error chain (to indicate which parser had an error) + .context("string") + .parse_next(input) +} + +/// You can mix the above declarative parsing with an imperative style to handle more unique cases, +/// like escaping +fn character<'i, E: ParserError<Stream<'i>>>(input: &mut Stream<'i>) -> PResult<char, E> { + let c = none_of('"').parse_next(input)?; + if c == '\\' { + alt(( + any.verify_map(|c| { + Some(match c { + '"' | '\\' | '/' => c, + 'b' => '\x08', + 'f' => '\x0C', + 'n' => '\n', + 'r' => '\r', + 't' => '\t', + _ => return None, + }) + }), + preceded('u', unicode_escape), + )) + .parse_next(input) + } else { + Ok(c) + } +} + +fn unicode_escape<'i, E: ParserError<Stream<'i>>>(input: &mut Stream<'i>) -> PResult<char, E> { + alt(( + // Not a surrogate + u16_hex + .verify(|cp| !(0xD800..0xE000).contains(cp)) + .map(|cp| cp as u32), + // See https://en.wikipedia.org/wiki/UTF-16#Code_points_from_U+010000_to_U+10FFFF for details + separated_pair(u16_hex, "\\u", u16_hex) + .verify(|(high, low)| (0xD800..0xDC00).contains(high) && (0xDC00..0xE000).contains(low)) + .map(|(high, low)| { + let high_ten = (high as u32) - 0xD800; + let low_ten = (low as u32) - 0xDC00; + (high_ten << 10) + low_ten + 0x10000 + }), + )) + .verify_map( + // Could be probably replaced with .unwrap() or _unchecked due to the verify checks + std::char::from_u32, + ) + .parse_next(input) +} + +fn u16_hex<'i, E: ParserError<Stream<'i>>>(input: &mut Stream<'i>) -> PResult<u16, E> { + take(4usize) + .verify_map(|s| u16::from_str_radix(s, 16).ok()) + .parse_next(input) +} + +/// Some combinators, like `separated0` or `many0`, will call a parser repeatedly, +/// accumulating results in a `Vec`, until it encounters an error. +/// If you want more control on the parser application, check out the `iterator` +/// combinator (cf `examples/iterator.rs`) +fn array<'i, E: ParserError<Stream<'i>> + AddContext<Stream<'i>, &'static str>>( + input: &mut Stream<'i>, +) -> PResult<Vec<JsonValue>, E> { + preceded( + ('[', ws), + cut_err(terminated(separated0(json_value, (ws, ',', ws)), (ws, ']'))), + ) + .context("array") + .parse_next(input) +} + +fn object<'i, E: ParserError<Stream<'i>> + AddContext<Stream<'i>, &'static str>>( + input: &mut Stream<'i>, +) -> PResult<HashMap<String, JsonValue>, E> { + preceded( + ('{', ws), + cut_err(terminated(separated0(key_value, (ws, ',', ws)), (ws, '}'))), + ) + .context("object") + .parse_next(input) +} + +fn key_value<'i, E: ParserError<Stream<'i>> + AddContext<Stream<'i>, &'static str>>( + input: &mut Stream<'i>, +) -> PResult<(String, JsonValue), E> { + separated_pair(string, cut_err((ws, ':', ws)), json_value).parse_next(input) +} + +/// Parser combinators are constructed from the bottom up: +/// first we write parsers for the smallest elements (here a space character), +/// then we'll combine them in larger parsers +fn ws<'i, E: ParserError<Stream<'i>>>(input: &mut Stream<'i>) -> PResult<&'i str, E> { + // Combinators like `take_while` return a function. That function is the + // parser,to which we can pass the input + take_while(0.., WS).parse_next(input) +} + +const WS: &[char] = &[' ', '\t']; + +#[cfg(test)] +mod test { + #[allow(clippy::useless_attribute)] + #[allow(dead_code)] // its dead for benches + use super::*; + + #[allow(clippy::useless_attribute)] + #[allow(dead_code)] // its dead for benches + type Error<'i> = winnow::error::InputError<Partial<&'i str>>; + + #[test] + fn json_string() { + assert_eq!( + string::<Error<'_>>.parse_peek(Partial::new("\"\"")), + Ok((Partial::new(""), "".to_string())) + ); + assert_eq!( + string::<Error<'_>>.parse_peek(Partial::new("\"abc\"")), + Ok((Partial::new(""), "abc".to_string())) + ); + assert_eq!( + string::<Error<'_>>.parse_peek(Partial::new( + "\"abc\\\"\\\\\\/\\b\\f\\n\\r\\t\\u0001\\u2014\u{2014}def\"" + )), + Ok(( + Partial::new(""), + "abc\"\\/\x08\x0C\n\r\t\x01ββdef".to_string() + )), + ); + assert_eq!( + string::<Error<'_>>.parse_peek(Partial::new("\"\\uD83D\\uDE10\"")), + Ok((Partial::new(""), "π".to_string())) + ); + + assert!(string::<Error<'_>>.parse_peek(Partial::new("\"")).is_err()); + assert!(string::<Error<'_>> + .parse_peek(Partial::new("\"abc")) + .is_err()); + assert!(string::<Error<'_>> + .parse_peek(Partial::new("\"\\\"")) + .is_err()); + assert!(string::<Error<'_>> + .parse_peek(Partial::new("\"\\u123\"")) + .is_err()); + assert!(string::<Error<'_>> + .parse_peek(Partial::new("\"\\uD800\"")) + .is_err()); + assert!(string::<Error<'_>> + .parse_peek(Partial::new("\"\\uD800\\uD800\"")) + .is_err()); + assert!(string::<Error<'_>> + .parse_peek(Partial::new("\"\\uDC00\"")) + .is_err()); + } + + #[test] + fn json_object() { + use JsonValue::{Num, Object, Str}; + + let input = r#"{"a":42,"b":"x"} +"#; + + let expected = Object( + vec![ + ("a".to_string(), Num(42.0)), + ("b".to_string(), Str("x".to_string())), + ] + .into_iter() + .collect(), + ); + + assert_eq!( + ndjson::<Error<'_>>.parse_peek(Partial::new(input)), + Ok((Partial::new(""), Some(expected))) + ); + } + + #[test] + fn json_array() { + use JsonValue::{Array, Num, Str}; + + let input = r#"[42,"x"] +"#; + + let expected = Array(vec![Num(42.0), Str("x".to_string())]); + + assert_eq!( + ndjson::<Error<'_>>.parse_peek(Partial::new(input)), + Ok((Partial::new(""), Some(expected))) + ); + } + + #[test] + fn json_whitespace() { + use JsonValue::{Array, Boolean, Null, Num, Object, Str}; + + let input = r#" { "null" : null, "true" :true , "false": false , "number" : 123e4 , "string" : " abc 123 " , "array" : [ false , 1 , "two" ] , "object" : { "a" : 1.0 , "b" : "c" } , "empty_array" : [ ] , "empty_object" : { } } +"#; + + assert_eq!( + ndjson::<Error<'_>>.parse_peek(Partial::new(input)), + Ok(( + Partial::new(""), + Some(Object( + vec![ + ("null".to_string(), Null), + ("true".to_string(), Boolean(true)), + ("false".to_string(), Boolean(false)), + ("number".to_string(), Num(123e4)), + ("string".to_string(), Str(" abc 123 ".to_string())), + ( + "array".to_string(), + Array(vec![Boolean(false), Num(1.0), Str("two".to_string())]) + ), + ( + "object".to_string(), + Object( + vec![ + ("a".to_string(), Num(1.0)), + ("b".to_string(), Str("c".to_string())), + ] + .into_iter() + .collect() + ) + ), + ("empty_array".to_string(), Array(vec![]),), + ("empty_object".to_string(), Object(HashMap::new()),), + ] + .into_iter() + .collect() + )) + )) + ); + } +} |