From 4152c49a9432c5e51e891f1c34b7e20446496aea Mon Sep 17 00:00:00 2001
From: Jeff Vander Stoep <jeffv@google.com>
Date: Thu, 9 Mar 2023 10:32:38 +0100
Subject: Import protobuf-support 3.2.0

Test: Treehugger
Bug: 270895633
Change-Id: I0e7e700aaa7eab95a91757ea147b08b205db2d0c
---
 .cargo_vcs_info.json         |   6 +
 Android.bp                   |  24 ++
 Cargo.toml                   |  35 +++
 Cargo.toml.orig              |  24 ++
 LICENSE                      |  19 ++
 LICENSE.txt                  |  19 ++
 METADATA                     |  19 ++
 MODULE_LICENSE_MIT           |   0
 OWNERS                       |   1 +
 README.md                    |   8 +
 cargo2android.json           |  10 +
 src/json_name.rs             |  19 ++
 src/lexer/float.rs           |  56 ++++
 src/lexer/int.rs             |  12 +
 src/lexer/json_number_lit.rs |  10 +
 src/lexer/lexer_impl.rs      | 712 +++++++++++++++++++++++++++++++++++++++++++
 src/lexer/loc.rs             |  28 ++
 src/lexer/mod.rs             |  12 +
 src/lexer/num_lit.rs         |   5 +
 src/lexer/parser_language.rs |  10 +
 src/lexer/str_lit.rs         |  77 +++++
 src/lexer/token.rs           |  47 +++
 src/lexer/tokenizer.rs       | 330 ++++++++++++++++++++
 src/lib.rs                   |   9 +
 src/text_format.rs           |  75 +++++
 src/toposort.rs              | 119 ++++++++
 26 files changed, 1686 insertions(+)
 create mode 100644 .cargo_vcs_info.json
 create mode 100644 Android.bp
 create mode 100644 Cargo.toml
 create mode 100644 Cargo.toml.orig
 create mode 100644 LICENSE
 create mode 100644 LICENSE.txt
 create mode 100644 METADATA
 create mode 100644 MODULE_LICENSE_MIT
 create mode 100644 OWNERS
 create mode 100644 README.md
 create mode 100644 cargo2android.json
 create mode 100644 src/json_name.rs
 create mode 100644 src/lexer/float.rs
 create mode 100644 src/lexer/int.rs
 create mode 100644 src/lexer/json_number_lit.rs
 create mode 100644 src/lexer/lexer_impl.rs
 create mode 100644 src/lexer/loc.rs
 create mode 100644 src/lexer/mod.rs
 create mode 100644 src/lexer/num_lit.rs
 create mode 100644 src/lexer/parser_language.rs
 create mode 100644 src/lexer/str_lit.rs
 create mode 100644 src/lexer/token.rs
 create mode 100644 src/lexer/tokenizer.rs
 create mode 100644 src/lib.rs
 create mode 100644 src/text_format.rs
 create mode 100644 src/toposort.rs

diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json
new file mode 100644
index 0000000..1981be8
--- /dev/null
+++ b/.cargo_vcs_info.json
@@ -0,0 +1,6 @@
+{
+  "git": {
+    "sha1": "7155092f3df112159d55132081937e1fe5c30490"
+  },
+  "path_in_vcs": "protobuf-support"
+}
\ No newline at end of file
diff --git a/Android.bp b/Android.bp
new file mode 100644
index 0000000..678d57a
--- /dev/null
+++ b/Android.bp
@@ -0,0 +1,24 @@
+// This file is generated by cargo2android.py --config cargo2android.json.
+// Do not modify this file as changes will be overridden on upgrade.
+
+
+
+rust_library {
+    name: "libprotobuf_support",
+    host_supported: true,
+    crate_name: "protobuf_support",
+    cargo_env_compat: true,
+    cargo_pkg_version: "3.2.0",
+    srcs: ["src/lib.rs"],
+    edition: "2021",
+    rustlibs: [
+        "libthiserror",
+    ],
+    apex_available: [
+        "//apex_available:platform",
+        "//apex_available:anyapex",
+    ],
+    product_available: true,
+    vendor_available: true,
+    min_sdk_version: "29",
+}
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..0f7368b
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,35 @@
+# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
+#
+# When uploading crates to the registry Cargo will automatically
+# "normalize" Cargo.toml files for maximal compatibility
+# with all versions of Cargo and also rewrite `path` dependencies
+# to registry (e.g., crates.io) dependencies.
+#
+# If you are reading this file be aware that the original Cargo.toml
+# will likely look very different (and much more reasonable).
+# See Cargo.toml.orig for the original contents.
+
+[package]
+edition = "2021"
+name = "protobuf-support"
+version = "3.2.0"
+authors = ["Stepan Koltsov <stepan.koltsov@gmail.com>"]
+description = """
+Code supporting protobuf implementation. None of code in this crate is public API.
+"""
+homepage = "https://github.com/stepancheg/rust-protobuf/"
+documentation = "https://github.com/stepancheg/rust-protobuf/blob/master/README.md"
+readme = "README.md"
+license = "MIT"
+repository = "https://github.com/stepancheg/rust-protobuf/"
+
+[package.metadata.docs.rs]
+all-features = true
+
+[lib]
+bench = false
+
+[dependencies.thiserror]
+version = "1.0.30"
+
+[features]
diff --git a/Cargo.toml.orig b/Cargo.toml.orig
new file mode 100644
index 0000000..ee6fc99
--- /dev/null
+++ b/Cargo.toml.orig
@@ -0,0 +1,24 @@
+[package]
+
+name = "protobuf-support"
+version = "3.2.0"
+authors = ["Stepan Koltsov <stepan.koltsov@gmail.com>"]
+edition = "2021"
+license = "MIT"
+homepage = "https://github.com/stepancheg/rust-protobuf/"
+repository = "https://github.com/stepancheg/rust-protobuf/"
+documentation = "https://github.com/stepancheg/rust-protobuf/blob/master/README.md"
+description = """
+Code supporting protobuf implementation. None of code in this crate is public API.
+"""
+
+[lib]
+bench = false
+
+[features]
+
+[dependencies]
+thiserror = "1.0.30"
+
+[package.metadata.docs.rs]
+all-features = true
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..acce639
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,19 @@
+Copyright (c) 2019 Stepan Koltsov
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
+OR OTHER DEALINGS IN THE SOFTWARE.
\ No newline at end of file
diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 0000000..acce639
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,19 @@
+Copyright (c) 2019 Stepan Koltsov
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
+OR OTHER DEALINGS IN THE SOFTWARE.
\ No newline at end of file
diff --git a/METADATA b/METADATA
new file mode 100644
index 0000000..517ef56
--- /dev/null
+++ b/METADATA
@@ -0,0 +1,19 @@
+name: "protobuf-support"
+description: "()"
+third_party {
+  url {
+    type: HOMEPAGE
+    value: "https://crates.io/crates/protobuf-support"
+  }
+  url {
+    type: ARCHIVE
+    value: "https://static.crates.io/crates/protobuf-support/protobuf-support-3.2.0.crate"
+  }
+  version: "3.2.0"
+  license_type: NOTICE
+  last_upgrade_date {
+    year: 2023
+    month: 2
+    day: 27
+  }
+}
diff --git a/MODULE_LICENSE_MIT b/MODULE_LICENSE_MIT
new file mode 100644
index 0000000..e69de29
diff --git a/OWNERS b/OWNERS
new file mode 100644
index 0000000..45dc4dd
--- /dev/null
+++ b/OWNERS
@@ -0,0 +1 @@
+include platform/prebuilts/rust:master:/OWNERS
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..88bdebb
--- /dev/null
+++ b/README.md
@@ -0,0 +1,8 @@
+<!-- cargo-sync-readme start -->
+
+# Supporting code for protobuf crates
+
+Code in this crate is used in protobuf crates like `protobuf` or `protobuf-parse`.
+None of code in this crate has public API.
+
+<!-- cargo-sync-readme end -->
diff --git a/cargo2android.json b/cargo2android.json
new file mode 100644
index 0000000..26ba70d
--- /dev/null
+++ b/cargo2android.json
@@ -0,0 +1,10 @@
+{
+  "apex-available": [
+    "//apex_available:platform",
+    "//apex_available:anyapex"
+  ],
+  "min-sdk-version": "29",
+  "dependencies": true,
+  "device": true,
+  "run": true
+}
diff --git a/src/json_name.rs b/src/json_name.rs
new file mode 100644
index 0000000..f5c9364
--- /dev/null
+++ b/src/json_name.rs
@@ -0,0 +1,19 @@
+/// Implementation must match exactly
+/// `ToJsonName()` function in C++ `descriptor.cc`.
+pub fn json_name(input: &str) -> String {
+    let mut capitalize_next = false;
+    let mut result = String::with_capacity(input.len());
+
+    for c in input.chars() {
+        if c == '_' {
+            capitalize_next = true;
+        } else if capitalize_next {
+            result.extend(c.to_uppercase());
+            capitalize_next = false;
+        } else {
+            result.push(c);
+        }
+    }
+
+    result
+}
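The function above mirrors C++ `ToJsonName()`: each underscore is dropped and the character that follows it is upper-cased. A minimal sketch of the resulting mapping, assuming the crate is consumed under the crate name `protobuf_support` from the Android.bp above (the caller below is illustrative, not part of this patch):

    use protobuf_support::json_name::json_name;

    fn main() {
        // Underscores are removed and the following character is capitalized.
        assert_eq!("fooBarBaz", json_name("foo_bar_baz"));
        // A leading underscore capitalizes the first character.
        assert_eq!("FooBar", json_name("_foo_bar"));
        // A trailing underscore is simply dropped.
        assert_eq!("foo", json_name("foo_"));
    }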
diff --git a/src/lexer/float.rs b/src/lexer/float.rs
new file mode 100644
index 0000000..f09c101
--- /dev/null
+++ b/src/lexer/float.rs
@@ -0,0 +1,56 @@
+#[derive(Debug)]
+pub enum ProtobufFloatParseError {
+    EmptyString,
+    CannotParseFloat,
+}
+
+pub type ProtobufFloatParseResult = Result<f64, ProtobufFloatParseError>;
+
+pub const PROTOBUF_NAN: &str = "nan";
+pub const PROTOBUF_INF: &str = "inf";
+
+/// Format float as in protobuf `.proto` files
+pub fn format_protobuf_float(f: f64) -> String {
+    if f.is_nan() {
+        PROTOBUF_NAN.to_owned()
+    } else if f.is_infinite() {
+        if f > 0.0 {
+            format!("{}", PROTOBUF_INF)
+        } else {
+            format!("-{}", PROTOBUF_INF)
+        }
+    } else {
+        // TODO: make sure doesn't lose precision
+        format!("{}", f)
+    }
+}
+
+/// Parse float from `.proto` format
+pub fn parse_protobuf_float(s: &str) -> ProtobufFloatParseResult {
+    if s.is_empty() {
+        return Err(ProtobufFloatParseError::EmptyString);
+    }
+    if s == PROTOBUF_NAN {
+        return Ok(f64::NAN);
+    }
+    if s == PROTOBUF_INF || s == format!("+{}", PROTOBUF_INF) {
+        return Ok(f64::INFINITY);
+    }
+    if s == format!("-{}", PROTOBUF_INF) {
+        return Ok(f64::NEG_INFINITY);
+    }
+    match s.parse() {
+        Ok(f) => Ok(f),
+        Err(_) => Err(ProtobufFloatParseError::CannotParseFloat),
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn test_format_protobuf_float() {
+        assert_eq!("10", format_protobuf_float(10.0));
+    }
+}
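The round-trip behavior of the two functions above, as a sketch (again assuming the `protobuf_support` crate name; the literal values follow the rules in `parse_protobuf_float`):

    use protobuf_support::lexer::float::{format_protobuf_float, parse_protobuf_float};

    fn main() {
        // Special values use the protobuf spellings "inf" and "nan".
        assert_eq!("inf", format_protobuf_float(f64::INFINITY));
        assert_eq!("-inf", format_protobuf_float(f64::NEG_INFINITY));
        assert!(parse_protobuf_float("nan").unwrap().is_nan());
        // "+inf" is accepted on input but never produced on output.
        assert_eq!(f64::INFINITY, parse_protobuf_float("+inf").unwrap());
        // Everything else falls through to Rust's standard float parser.
        assert_eq!(12.5, parse_protobuf_float("12.5").unwrap());
    }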
diff --git a/src/lexer/int.rs b/src/lexer/int.rs
new file mode 100644
index 0000000..676c1ba
--- /dev/null
+++ b/src/lexer/int.rs
@@ -0,0 +1,12 @@
+pub struct Overflow;
+
+/// Negate `u64` checking for overflow.
+pub fn neg(value: u64) -> Result<i64, Overflow> {
+    if value <= 0x7fff_ffff_ffff_ffff {
+        Ok(-(value as i64))
+    } else if value == 0x8000_0000_0000_0000 {
+        Ok(-0x8000_0000_0000_0000)
+    } else {
+        Err(Overflow)
+    }
+}
diff --git a/src/lexer/json_number_lit.rs b/src/lexer/json_number_lit.rs
new file mode 100644
index 0000000..1323517
--- /dev/null
+++ b/src/lexer/json_number_lit.rs
@@ -0,0 +1,10 @@
+use std::fmt;
+
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct JsonNumberLit(pub String);
+
+impl fmt::Display for JsonNumberLit {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        fmt::Display::fmt(&self.0, f)
+    }
+}
diff --git a/src/lexer/lexer_impl.rs b/src/lexer/lexer_impl.rs
new file mode 100644
index 0000000..4990c8f
--- /dev/null
+++ b/src/lexer/lexer_impl.rs
@@ -0,0 +1,712 @@
+use std::char;
+use std::convert::TryFrom;
+use std::num::ParseFloatError;
+use std::num::ParseIntError;
+
+use crate::lexer::float;
+use crate::lexer::float::ProtobufFloatParseError;
+use crate::lexer::json_number_lit::JsonNumberLit;
+use crate::lexer::loc::Loc;
+use crate::lexer::loc::FIRST_COL;
+use crate::lexer::parser_language::ParserLanguage;
+use crate::lexer::str_lit::StrLit;
+use crate::lexer::str_lit::StrLitDecodeError;
+use crate::lexer::token::Token;
+use crate::lexer::token::TokenWithLocation;
+
+#[derive(Debug, thiserror::Error)]
+pub enum LexerError {
+    // TODO: something better than this
+    #[error("Incorrect input")]
+    IncorrectInput,
+    #[error("Unexpected EOF")]
+    UnexpectedEof,
+    #[error("Expecting char: {:?}", .0)]
+    ExpectChar(char),
+    #[error("Parse int error")]
+    ParseIntError,
+    #[error("Parse float error")]
+    ParseFloatError,
+    // TODO: how it is different from ParseFloatError?
+    #[error("Incorrect float literal")]
+    IncorrectFloatLit,
+    #[error("Incorrect JSON escape")]
+    IncorrectJsonEscape,
+    #[error("Incorrect JSON number")]
+    IncorrectJsonNumber,
+    #[error("Incorrect Unicode character")]
+    IncorrectUnicodeChar,
+    #[error("Expecting hex digit")]
+    ExpectHexDigit,
+    #[error("Expecting oct digit")]
+    ExpectOctDigit,
+    #[error("Expecting dec digit")]
+    ExpectDecDigit,
+    #[error(transparent)]
+    StrLitDecodeError(#[from] StrLitDecodeError),
+    #[error("Expecting identifier")]
+    ExpectedIdent,
+}
+
+pub type LexerResult<T> = Result<T, LexerError>;
+
+impl From<ParseIntError> for LexerError {
+    fn from(_: ParseIntError) -> Self {
+        LexerError::ParseIntError
+    }
+}
+
+impl From<ParseFloatError> for LexerError {
+    fn from(_: ParseFloatError) -> Self {
+        LexerError::ParseFloatError
+    }
+}
+
+impl From<ProtobufFloatParseError> for LexerError {
+    fn from(_: ProtobufFloatParseError) -> Self {
+        LexerError::IncorrectFloatLit
+    }
+}
+
+#[derive(Copy, Clone)]
+pub struct Lexer<'a> {
+    language: ParserLanguage,
+    input: &'a str,
+    pos: usize,
+    pub loc: Loc,
+}
+
+fn is_letter(c: char) -> bool {
+    c.is_alphabetic() || c == '_'
+}
+
+impl<'a> Lexer<'a> {
+    pub fn new(input: &'a str, language: ParserLanguage) -> Lexer<'a> {
+        Lexer {
+            language,
+            input,
+            pos: 0,
+            loc: Loc::start(),
+        }
+    }
+
+    /// No more chars
+    pub fn eof(&self) -> bool {
+        self.pos == self.input.len()
+    }
+
+    /// Remaining chars
+    fn rem_chars(&self) -> &'a str {
+        &self.input[self.pos..]
+    }
+
+    pub fn lookahead_char_is<P: FnOnce(char) -> bool>(&self, p: P) -> bool {
+        self.lookahead_char().map_or(false, p)
+    }
+
+    fn lookahead_char_is_in(&self, alphabet: &str) -> bool {
+        self.lookahead_char_is(|c| alphabet.contains(c))
+    }
+
+    fn next_char_opt(&mut self) -> Option<char> {
+        let rem = self.rem_chars();
+        if rem.is_empty() {
+            None
+        } else {
+            let mut char_indices = rem.char_indices();
+            let (_, c) = char_indices.next().unwrap();
+            let c_len = char_indices.next().map(|(len, _)| len).unwrap_or(rem.len());
+            self.pos += c_len;
+            if c == '\n' {
+                self.loc.line += 1;
+                self.loc.col = FIRST_COL;
+            } else {
+                self.loc.col += 1;
+            }
+            Some(c)
+        }
+    }
+
+    fn next_char(&mut self) -> LexerResult<char> {
+        self.next_char_opt().ok_or(LexerError::UnexpectedEof)
+    }
+
+    /// Skip whitespaces
+    fn skip_whitespaces(&mut self) {
+        self.take_while(|c| c.is_whitespace());
+    }
+
+    fn skip_c_comment(&mut self) -> LexerResult<()> {
+        if self.skip_if_lookahead_is_str("/*") {
+            let end = "*/";
+            match self.rem_chars().find(end) {
+                None => Err(LexerError::UnexpectedEof),
+                Some(len) => {
+                    let new_pos = self.pos + len + end.len();
+                    self.skip_to_pos(new_pos);
+                    Ok(())
+                }
+            }
+        } else {
+            Ok(())
+        }
+    }
+
+    fn skip_cpp_comment(&mut self) {
+        if self.skip_if_lookahead_is_str("//") {
+            loop {
+                match self.next_char_opt() {
+                    Some('\n') | None => break,
+                    _ => {}
+                }
+            }
+        }
+    }
+
+    fn skip_sh_comment(&mut self) {
+        if self.skip_if_lookahead_is_str("#") {
+            loop {
+                match self.next_char_opt() {
+                    Some('\n') | None => break,
+                    _ => {}
+                }
+            }
+        }
+    }
+
+    fn skip_comment(&mut self) -> LexerResult<()> {
+        match self.language {
+            ParserLanguage::Proto => {
+                self.skip_c_comment()?;
+                self.skip_cpp_comment();
+            }
+            ParserLanguage::TextFormat => {
+                self.skip_sh_comment();
+            }
+            ParserLanguage::Json => {}
+        }
+        Ok(())
+    }
+
+    pub fn skip_ws(&mut self) -> LexerResult<()> {
+        loop {
+            let pos = self.pos;
+            self.skip_whitespaces();
+            self.skip_comment()?;
+            if pos == self.pos {
+                // Did not advance
+                return Ok(());
+            }
+        }
+    }
+
+    pub fn take_while<F>(&mut self, f: F) -> &'a str
+    where
+        F: Fn(char) -> bool,
+    {
+        let start = self.pos;
+        while self.lookahead_char().map(&f) == Some(true) {
+            self.next_char_opt().unwrap();
+        }
+        let end = self.pos;
+        &self.input[start..end]
+    }
+
+    fn lookahead_char(&self) -> Option<char> {
+        self.clone().next_char_opt()
+    }
+
+    fn lookahead_is_str(&self, s: &str) -> bool {
+        self.rem_chars().starts_with(s)
+    }
+
+    fn skip_if_lookahead_is_str(&mut self, s: &str) -> bool {
+        if self.lookahead_is_str(s) {
+            let new_pos = self.pos + s.len();
+            self.skip_to_pos(new_pos);
+            true
+        } else {
+            false
+        }
+    }
+
+    fn next_char_if<P>(&mut self, p: P) -> Option<char>
+    where
+        P: FnOnce(char) -> bool,
+    {
+        let mut clone = self.clone();
+        match clone.next_char_opt() {
+            Some(c) if p(c) => {
+                *self = clone;
+                Some(c)
+            }
+            _ => None,
+        }
+    }
+
+    pub fn next_char_if_eq(&mut self, expect: char) -> bool {
+        self.next_char_if(|c| c == expect) != None
+    }
+
+    fn next_char_if_in(&mut self, alphabet: &str) -> Option<char> {
+        for c in alphabet.chars() {
+            if self.next_char_if_eq(c) {
+                return Some(c);
+            }
+        }
+        None
+    }
+
+    fn next_char_expect_eq(&mut self, expect: char) -> LexerResult<()> {
+        if self.next_char_if_eq(expect) {
+            Ok(())
+        } else {
+            Err(LexerError::ExpectChar(expect))
+        }
+    }
+
+    fn next_char_expect<P>(&mut self, expect: P, err: LexerError) -> LexerResult<char>
+    where
+        P: FnOnce(char) -> bool,
+    {
+        self.next_char_if(expect).ok_or(err)
+    }
+
+    // str functions
+
+    /// properly update line and column
+    fn skip_to_pos(&mut self, new_pos: usize) -> &'a str {
+        assert!(new_pos >= self.pos);
+        assert!(new_pos <= self.input.len());
+        let pos = self.pos;
+        while self.pos != new_pos {
+            self.next_char_opt().unwrap();
+        }
+        &self.input[pos..new_pos]
+    }
+
+    // Protobuf grammar
+
+    // char functions
+
+    // letter = "A" … "Z" | "a" … "z"
+    // https://github.com/google/protobuf/issues/4565
+    fn next_letter_opt(&mut self) -> Option<char> {
+        self.next_char_if(is_letter)
+    }
+
+    // capitalLetter = "A" … "Z"
+    fn _next_capital_letter_opt(&mut self) -> Option<char> {
+        self.next_char_if(|c| c >= 'A' && c <= 'Z')
+    }
+
+    fn next_ident_part(&mut self) -> Option<char> {
+        self.next_char_if(|c| c.is_ascii_alphanumeric() || c == '_')
+    }
+
+    // Identifiers
+
+    // ident = letter { letter | decimalDigit | "_" }
+    fn next_ident_opt(&mut self) -> LexerResult<Option<String>> {
+        if let Some(c) = self.next_letter_opt() {
+            let mut ident = String::new();
+            ident.push(c);
+            while let Some(c) = self.next_ident_part() {
+                ident.push(c);
+            }
+            Ok(Some(ident))
+        } else {
+            Ok(None)
+        }
+    }
+
+    // Integer literals
+
+    // hexLit = "0" ( "x" | "X" ) hexDigit { hexDigit }
+    fn next_hex_lit_opt(&mut self) -> LexerResult<Option<u64>> {
+        Ok(
+            if self.skip_if_lookahead_is_str("0x") || self.skip_if_lookahead_is_str("0X") {
+                let s = self.take_while(|c| c.is_ascii_hexdigit());
+                Some(u64::from_str_radix(s, 16)? as u64)
+            } else {
+                None
+            },
+        )
+    }
+
+    // decimalLit = ( "1" … "9" ) { decimalDigit }
+    // octalLit = "0" { octalDigit }
+    fn next_decimal_octal_lit_opt(&mut self) -> LexerResult<Option<u64>> {
+        // do not advance on number parse error
+        let mut clone = self.clone();
+
+        let pos = clone.pos;
+
+        Ok(if clone.next_char_if(|c| c.is_ascii_digit()) != None {
+            clone.take_while(|c| c.is_ascii_digit());
+            let value = clone.input[pos..clone.pos].parse()?;
+            *self = clone;
+            Some(value)
+        } else {
+            None
+        })
+    }
+
+    // hexDigit = "0" … "9" | "A" … "F" | "a" … "f"
+    fn next_hex_digit(&mut self) -> LexerResult<u32> {
+        let mut clone = self.clone();
+        let r = match clone.next_char()? {
+            c if c >= '0' && c <= '9' => c as u32 - b'0' as u32,
+            c if c >= 'A' && c <= 'F' => c as u32 - b'A' as u32 + 10,
+            c if c >= 'a' && c <= 'f' => c as u32 - b'a' as u32 + 10,
+            _ => return Err(LexerError::ExpectHexDigit),
+        };
+        *self = clone;
+        Ok(r)
+    }
+
+    // octalDigit = "0" … "7"
+    fn next_octal_digit(&mut self) -> LexerResult<u32> {
+        self.next_char_expect(|c| c >= '0' && c <= '9', LexerError::ExpectOctDigit)
+            .map(|c| c as u32 - '0' as u32)
+    }
+
+    // decimalDigit = "0" … "9"
+    fn next_decimal_digit(&mut self) -> LexerResult<u32> {
+        self.next_char_expect(|c| c >= '0' && c <= '9', LexerError::ExpectDecDigit)
+            .map(|c| c as u32 - '0' as u32)
+    }
+
+    // decimals = decimalDigit { decimalDigit }
+    fn next_decimal_digits(&mut self) -> LexerResult<()> {
+        self.next_decimal_digit()?;
+        self.take_while(|c| c >= '0' && c <= '9');
+        Ok(())
+    }
+
+    // intLit = decimalLit | octalLit | hexLit
+    pub fn next_int_lit_opt(&mut self) -> LexerResult<Option<u64>> {
+        assert_ne!(ParserLanguage::Json, self.language);
+
+        self.skip_ws()?;
+        if let Some(i) = self.next_hex_lit_opt()? {
+            return Ok(Some(i));
+        }
+        if let Some(i) = self.next_decimal_octal_lit_opt()? {
+            return Ok(Some(i));
+        }
+        Ok(None)
+    }
+
+    // Floating-point literals
+
+    // exponent = ( "e" | "E" ) [ "+" | "-" ] decimals
+    fn next_exponent_opt(&mut self) -> LexerResult<Option<()>> {
+        if self.next_char_if_in("eE") != None {
+            self.next_char_if_in("+-");
+            self.next_decimal_digits()?;
+            Ok(Some(()))
+        } else {
+            Ok(None)
+        }
+    }
+
+    // floatLit = ( decimals "." [ decimals ] [ exponent ] | decimals exponent | "."decimals [ exponent ] ) | "inf" | "nan"
+    fn next_float_lit(&mut self) -> LexerResult<()> {
+        assert_ne!(ParserLanguage::Json, self.language);
+
+        // "inf" and "nan" are handled as part of ident
+        if self.next_char_if_eq('.') {
+            self.next_decimal_digits()?;
+            self.next_exponent_opt()?;
+        } else {
+            self.next_decimal_digits()?;
+            if self.next_char_if_eq('.') {
+                self.next_decimal_digits()?;
+                self.next_exponent_opt()?;
+            } else {
+                if self.next_exponent_opt()? == None {
+                    return Err(LexerError::IncorrectFloatLit);
+                }
+            }
+        }
+        Ok(())
+    }
+
+    // String literals
+
+    // charValue = hexEscape | octEscape | charEscape | /[^\0\n\\]/
+    // hexEscape = '\' ( "x" | "X" ) hexDigit hexDigit
+    // https://github.com/google/protobuf/issues/4560
+    // octEscape = '\' octalDigit octalDigit octalDigit
+    // charEscape = '\' ( "a" | "b" | "f" | "n" | "r" | "t" | "v" | '\' | "'" | '"' )
+    // quote = "'" | '"'
+    pub fn next_byte_value(&mut self) -> LexerResult<u8> {
+        match self.next_char()? {
+            '\\' => {
+                match self.next_char()? {
+                    '\'' => Ok(b'\''),
+                    '"' => Ok(b'"'),
+                    '\\' => Ok(b'\\'),
+                    'a' => Ok(b'\x07'),
+                    'b' => Ok(b'\x08'),
+                    'f' => Ok(b'\x0c'),
+                    'n' => Ok(b'\n'),
+                    'r' => Ok(b'\r'),
+                    't' => Ok(b'\t'),
+                    'v' => Ok(b'\x0b'),
+                    'x' => {
+                        let d1 = self.next_hex_digit()? as u8;
+                        let d2 = self.next_hex_digit()? as u8;
+                        Ok(((d1 << 4) | d2) as u8)
+                    }
+                    d if d >= '0' && d <= '7' => {
+                        let mut r = d as u8 - b'0';
+                        for _ in 0..2 {
+                            match self.next_octal_digit() {
+                                Err(_) => break,
+                                Ok(d) => r = (r << 3) + d as u8,
+                            }
+                        }
+                        Ok(r)
+                    }
+                    // https://github.com/google/protobuf/issues/4562
+                    // TODO: overflow
+                    c => Ok(c as u8),
+                }
+            }
+            '\n' | '\0' => Err(LexerError::IncorrectInput),
+            // TODO: check overflow
+            c => Ok(c as u8),
+        }
+    }
+
+    fn char_try_from(i: u32) -> LexerResult<char> {
+        char::try_from(i).map_err(|_| LexerError::IncorrectUnicodeChar)
+    }
+
+    pub fn next_json_char_value(&mut self) -> LexerResult<char> {
+        match self.next_char()? {
+            '\\' => match self.next_char()? {
+                '"' => Ok('"'),
+                '\'' => Ok('\''),
+                '\\' => Ok('\\'),
+                '/' => Ok('/'),
+                'b' => Ok('\x08'),
+                'f' => Ok('\x0c'),
+                'n' => Ok('\n'),
+                'r' => Ok('\r'),
+                't' => Ok('\t'),
+                'u' => {
+                    let mut v = 0;
+                    for _ in 0..4 {
+                        let digit = self.next_hex_digit()?;
+                        v = v * 16 + digit;
+                    }
+                    Self::char_try_from(v)
+                }
+                _ => Err(LexerError::IncorrectJsonEscape),
+            },
+            c => Ok(c),
+        }
+    }
+
+    // https://github.com/google/protobuf/issues/4564
+    // strLit = ( "'" { charValue } "'" ) | ( '"' { charValue } '"' )
+    fn next_str_lit_raw(&mut self) -> LexerResult<String> {
+        let mut raw = String::new();
+
+        let mut first = true;
+        loop {
+            if !first {
+                self.skip_ws()?;
+            }
+
+            let start = self.pos;
+
+            let q = match self.next_char_if_in("'\"") {
+                Some(q) => q,
+                None if !first => break,
+                None => return Err(LexerError::IncorrectInput),
+            };
+            first = false;
+            while self.lookahead_char() != Some(q) {
+                self.next_byte_value()?;
+            }
+            self.next_char_expect_eq(q)?;
+
+            raw.push_str(&self.input[start + 1..self.pos - 1]);
+        }
+        Ok(raw)
+    }
+
+    fn next_str_lit_raw_opt(&mut self) -> LexerResult<Option<String>> {
+        if self.lookahead_char_is_in("'\"") {
+            Ok(Some(self.next_str_lit_raw()?))
+        } else {
+            Ok(None)
+        }
+    }
+
+    /// Parse next token as JSON number
+    fn next_json_number_opt(&mut self) -> LexerResult<Option<JsonNumberLit>> {
+        assert_eq!(ParserLanguage::Json, self.language);
+
+        fn is_digit(c: char) -> bool {
+            c >= '0' && c <= '9'
+        }
+
+        fn is_digit_1_9(c: char) -> bool {
+            c >= '1' && c <= '9'
+        }
+
+        if !self.lookahead_char_is_in("-0123456789") {
+            return Ok(None);
+        }
+
+        let mut s = String::new();
+        if self.next_char_if_eq('-') {
+            s.push('-');
+        }
+
+        if self.next_char_if_eq('0') {
+            s.push('0');
+        } else {
+            s.push(self.next_char_expect(is_digit_1_9, LexerError::IncorrectJsonNumber)?);
+            while let Some(c) = self.next_char_if(is_digit) {
+                s.push(c);
+            }
+        }
+
+        if self.next_char_if_eq('.') {
+            s.push('.');
+            s.push(self.next_char_expect(is_digit, LexerError::IncorrectJsonNumber)?);
+            while let Some(c) = self.next_char_if(is_digit) {
+                s.push(c);
+            }
+        }
+
+        if let Some(c) = self.next_char_if_in("eE") {
+            s.push(c);
+            if let Some(c) = self.next_char_if_in("+-") {
+                s.push(c);
+            }
+            s.push(self.next_char_expect(is_digit, LexerError::IncorrectJsonNumber)?);
+            while let Some(c) = self.next_char_if(is_digit) {
+                s.push(c);
+            }
+        }
+
+        Ok(Some(JsonNumberLit(s)))
+    }
+
+    fn next_token_inner(&mut self) -> LexerResult<Token> {
+        if self.language == ParserLanguage::Json {
+            if let Some(v) = self.next_json_number_opt()? {
+                return Ok(Token::JsonNumber(v));
+            }
+        }
+
+        if let Some(ident) = self.next_ident_opt()? {
+            let token = if self.language != ParserLanguage::Json && ident == float::PROTOBUF_NAN {
+                Token::FloatLit(f64::NAN)
+            } else if self.language != ParserLanguage::Json && ident == float::PROTOBUF_INF {
+                Token::FloatLit(f64::INFINITY)
+            } else {
+                Token::Ident(ident.to_owned())
+            };
+            return Ok(token);
+        }
+
+        if self.language != ParserLanguage::Json {
+            let mut clone = self.clone();
+            let pos = clone.pos;
+            if let Ok(_) = clone.next_float_lit() {
+                let f = float::parse_protobuf_float(&self.input[pos..clone.pos])?;
+                *self = clone;
+                return Ok(Token::FloatLit(f));
+            }
+
+            if let Some(lit) = self.next_int_lit_opt()? {
+                return Ok(Token::IntLit(lit));
+            }
+        }
+
+        if let Some(escaped) = self.next_str_lit_raw_opt()? {
+            return Ok(Token::StrLit(StrLit { escaped }));
+        }
+
+        // This branch must be after str lit
+        if let Some(c) = self.next_char_if(|c| c.is_ascii_punctuation()) {
+            return Ok(Token::Symbol(c));
+        }
+
+        if let Some(ident) = self.next_ident_opt()? {
+            return Ok(Token::Ident(ident));
+        }
+
+        Err(LexerError::IncorrectInput)
+    }
+
+    pub fn next_token(&mut self) -> LexerResult<Option<TokenWithLocation>> {
+        self.skip_ws()?;
+        let loc = self.loc;
+
+        Ok(if self.eof() {
+            None
+        } else {
+            let token = self.next_token_inner()?;
+            // Skip whitespace here to update location
+            // to the beginning of the next token
+            self.skip_ws()?;
+            Some(TokenWithLocation { token, loc })
+        })
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    fn lex<P, R>(input: &str, parse_what: P) -> R
+    where
+        P: FnOnce(&mut Lexer) -> LexerResult<R>,
+    {
+        let mut lexer = Lexer::new(input, ParserLanguage::Proto);
+        let r = parse_what(&mut lexer).expect(&format!("lexer failed at {}", lexer.loc));
+        assert!(lexer.eof(), "check eof failed at {}", lexer.loc);
+        r
+    }
+
+    fn lex_opt<P, R>(input: &str, parse_what: P) -> R
+    where
+        P: FnOnce(&mut Lexer) -> LexerResult<Option<R>>,
+    {
+        let mut lexer = Lexer::new(input, ParserLanguage::Proto);
+        let o = parse_what(&mut lexer).expect(&format!("lexer failed at {}", lexer.loc));
+        let r = o.expect(&format!("lexer returned none at {}", lexer.loc));
+        assert!(lexer.eof(), "check eof failed at {}", lexer.loc);
+        r
+    }
+
+    #[test]
+    fn test_lexer_int_lit() {
+        let msg = r#"10"#;
+        let mess = lex_opt(msg, |p| p.next_int_lit_opt());
+        assert_eq!(10, mess);
+    }
+
+    #[test]
+    fn test_lexer_float_lit() {
+        let msg = r#"12.3"#;
+        let mess = lex(msg, |p| p.next_token_inner());
+        assert_eq!(Token::FloatLit(12.3), mess);
+    }
+
+    #[test]
+    fn test_lexer_float_lit_leading_zeros_in_exp() {
+        let msg = r#"1e00009"#;
+        let mess = lex(msg, |p| p.next_token_inner());
+        assert_eq!(Token::FloatLit(1_000_000_000.0), mess);
+    }
+}
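Putting the lexer together: a sketch that drains `next_token` on a `.proto`-style input (the caller is hypothetical; `Lexer`, `ParserLanguage`, and `TokenWithLocation` are the public items defined in this patch):

    use protobuf_support::lexer::lexer_impl::Lexer;
    use protobuf_support::lexer::parser_language::ParserLanguage;

    fn main() {
        let mut lexer = Lexer::new("foo = 42; // trailing comment", ParserLanguage::Proto);
        // `next_token` skips whitespace and comments and yields
        // `Ok(None)` at end of input.
        while let Some(t) = lexer.next_token().expect("lex error") {
            println!("{} at {}", t.token.format(), t.loc);
        }
        // Prints: `foo at 1:1`, `= at 1:5`, `42 at 1:7`, `; at 1:9`.
    }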
diff --git a/src/lexer/loc.rs b/src/lexer/loc.rs
new file mode 100644
index 0000000..ea3fc1a
--- /dev/null
+++ b/src/lexer/loc.rs
@@ -0,0 +1,28 @@
+use std::fmt;
+
+pub const FIRST_LINE: u32 = 1;
+pub const FIRST_COL: u32 = 1;
+
+/// Location in file
+#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd)]
+pub struct Loc {
+    /// 1-based
+    pub line: u32,
+    /// 1-based
+    pub col: u32,
+}
+
+impl fmt::Display for Loc {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "{}:{}", self.line, self.col)
+    }
+}
+
+impl Loc {
+    pub fn start() -> Loc {
+        Loc {
+            line: FIRST_LINE,
+            col: FIRST_COL,
+        }
+    }
+}
diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs
new file mode 100644
index 0000000..bde64f5
--- /dev/null
+++ b/src/lexer/mod.rs
@@ -0,0 +1,12 @@
+//! Implementation of lexer for both protobuf parser and for text format parser.
+
+pub mod float;
+pub mod int;
+pub mod json_number_lit;
+pub mod lexer_impl;
+pub mod loc;
+pub mod num_lit;
+pub mod parser_language;
+pub mod str_lit;
+pub mod token;
+pub mod tokenizer;
diff --git a/src/lexer/num_lit.rs b/src/lexer/num_lit.rs
new file mode 100644
index 0000000..cc64cc4
--- /dev/null
+++ b/src/lexer/num_lit.rs
@@ -0,0 +1,5 @@
+#[derive(Copy, Clone)]
+pub enum NumLit {
+    U64(u64),
+    F64(f64),
+}
diff --git a/src/lexer/parser_language.rs b/src/lexer/parser_language.rs
new file mode 100644
index 0000000..e356571
--- /dev/null
+++ b/src/lexer/parser_language.rs
@@ -0,0 +1,10 @@
+/// We use the same lexer/tokenizer for all parsers for simplicity
+#[derive(Copy, Clone, Debug, Eq, PartialEq)]
+pub enum ParserLanguage {
+    // `.proto` files
+    Proto,
+    // Protobuf text format
+    TextFormat,
+    // JSON
+    Json,
+}
diff --git a/src/lexer/str_lit.rs b/src/lexer/str_lit.rs
new file mode 100644
index 0000000..0e51a16
--- /dev/null
+++ b/src/lexer/str_lit.rs
@@ -0,0 +1,77 @@
+use std::fmt;
+use std::string::FromUtf8Error;
+
+use crate::lexer::lexer_impl::Lexer;
+use crate::lexer::parser_language::ParserLanguage;
+
+#[derive(Debug, thiserror::Error)]
+pub enum StrLitDecodeError {
+    #[error(transparent)]
+    FromUtf8Error(#[from] FromUtf8Error),
+    #[error("String literal decode error")]
+    OtherError,
+}
+
+pub type StrLitDecodeResult<T> = Result<T, StrLitDecodeError>;
+
+/// String literal, both `string` and `bytes`.
+#[derive(Clone, Eq, PartialEq, Debug)]
+pub struct StrLit {
+    pub escaped: String,
+}
+
+impl fmt::Display for StrLit {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "\"{}\"", &self.escaped)
+    }
+}
+
+impl StrLit {
+    /// May fail if not valid UTF8
+    pub fn decode_utf8(&self) -> StrLitDecodeResult<String> {
+        let mut lexer = Lexer::new(&self.escaped, ParserLanguage::Json);
+        let mut r = Vec::new();
+        while !lexer.eof() {
+            r.push(
+                lexer
+                    .next_byte_value()
+                    .map_err(|_| StrLitDecodeError::OtherError)?,
+            );
+        }
+        Ok(String::from_utf8(r)?)
+    }
+
+    pub fn decode_bytes(&self) -> StrLitDecodeResult<Vec<u8>> {
+        let mut lexer = Lexer::new(&self.escaped, ParserLanguage::Json);
+        let mut r = Vec::new();
+        while !lexer.eof() {
+            r.push(
+                lexer
+                    .next_byte_value()
+                    .map_err(|_| StrLitDecodeError::OtherError)?,
+            );
+        }
+        Ok(r)
+    }
+
+    pub fn quoted(&self) -> String {
+        format!("\"{}\"", self.escaped)
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use crate::lexer::str_lit::StrLit;
+
+    #[test]
+    fn decode_utf8() {
+        assert_eq!(
+            "\u{1234}".to_owned(),
+            StrLit {
+                escaped: "\\341\\210\\264".to_owned()
+            }
+            .decode_utf8()
+            .unwrap()
+        )
+    }
+}
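A `StrLit` stores the still-escaped source text; decoding is deferred until `decode_utf8` or `decode_bytes` is called, so a bad escape only surfaces then. A small sketch of the octal-escape path (illustrative values):

    use protobuf_support::lexer::str_lit::StrLit;

    fn main() {
        // "\303\251" are the octal escapes for the two UTF-8 bytes of U+00E9.
        let lit = StrLit {
            escaped: "caf\\303\\251".to_owned(),
        };
        assert_eq!("café", lit.decode_utf8().unwrap());
        assert_eq!(b"caf\xc3\xa9".to_vec(), lit.decode_bytes().unwrap());
    }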
diff --git a/src/lexer/token.rs b/src/lexer/token.rs
new file mode 100644
index 0000000..b20aba6
--- /dev/null
+++ b/src/lexer/token.rs
@@ -0,0 +1,47 @@
+use crate::lexer::json_number_lit::JsonNumberLit;
+use crate::lexer::lexer_impl::LexerError;
+use crate::lexer::lexer_impl::LexerResult;
+use crate::lexer::loc::Loc;
+use crate::lexer::num_lit::NumLit;
+use crate::lexer::str_lit::StrLit;
+
+#[derive(Clone, Debug, PartialEq)]
+pub enum Token {
+    Ident(String),
+    Symbol(char),
+    // Protobuf tokenizer has separate tokens for int and float.
+    // Tokens do not include sign.
+    IntLit(u64),
+    FloatLit(f64),
+    JsonNumber(JsonNumberLit),
+    // including quotes
+    StrLit(StrLit),
+}
+
+impl Token {
+    /// Back to original
+    pub fn format(&self) -> String {
+        match self {
+            &Token::Ident(ref s) => s.clone(),
+            &Token::Symbol(c) => c.to_string(),
+            &Token::IntLit(ref i) => i.to_string(),
+            &Token::StrLit(ref s) => s.quoted(),
+            &Token::FloatLit(ref f) => f.to_string(),
+            &Token::JsonNumber(ref f) => f.to_string(),
+        }
+    }
+
+    pub fn to_num_lit(&self) -> LexerResult<NumLit> {
+        match self {
+            &Token::IntLit(i) => Ok(NumLit::U64(i)),
+            &Token::FloatLit(f) => Ok(NumLit::F64(f)),
+            _ => Err(LexerError::IncorrectInput),
+        }
+    }
+}
+
+#[derive(Clone)]
+pub struct TokenWithLocation {
+    pub token: Token,
+    pub loc: Loc,
+}
diff --git a/src/lexer/tokenizer.rs b/src/lexer/tokenizer.rs
new file mode 100644
index 0000000..c5e84a0
--- /dev/null
+++ b/src/lexer/tokenizer.rs
@@ -0,0 +1,330 @@
+use crate::lexer::lexer_impl::Lexer;
+use crate::lexer::lexer_impl::LexerError;
+use crate::lexer::loc::Loc;
+use crate::lexer::parser_language::ParserLanguage;
+use crate::lexer::str_lit::StrLit;
+use crate::lexer::str_lit::StrLitDecodeError;
+use crate::lexer::token::Token;
+use crate::lexer::token::TokenWithLocation;
+
+#[derive(Debug, thiserror::Error)]
+pub enum TokenizerError {
+    #[error(transparent)]
+    LexerError(#[from] LexerError),
+    #[error(transparent)]
+    StrLitDecodeError(#[from] StrLitDecodeError),
+    #[error("Internal tokenizer error")]
+    InternalError,
+    // TODO: too broad
+    #[error("Incorrect input")]
+    IncorrectInput,
+    #[error("Not allowed in this context: {0}")]
+    NotAllowedInThisContext(&'static str),
+    #[error("Unexpected end of input")]
+    UnexpectedEof,
+    #[error("Expecting string literal")]
+    ExpectStrLit,
+    #[error("Expecting int literal")]
+    ExpectIntLit,
+    #[error("Expecting float literal")]
+    ExpectFloatLit,
+    #[error("Expecting identifier")]
+    ExpectIdent,
+    #[error("Expecting identifier `{}`", .0)]
+    ExpectNamedIdent(String),
+    #[error("While parsing {}, expecting char `{}`", .1, .0)]
+    ExpectChar(char, &'static str),
+    #[error("Expecting any char of: {}", .0.iter().map(|c| format!("`{}`", c)).collect::<Vec<_>>().join(", "))]
+    ExpectAnyChar(Vec<char>),
+}
+
+pub type TokenizerResult<R> = Result<R, TokenizerError>;
+
+#[derive(Clone)]
+pub struct Tokenizer<'a> {
+    lexer: Lexer<'a>,
+    next_token: Option<TokenWithLocation>,
+    last_token_loc: Option<Loc>,
+}
+
+impl<'a> Tokenizer<'a> {
+    pub fn new(input: &'a str, comment_style: ParserLanguage) -> Tokenizer<'a> {
+        Tokenizer {
+            lexer: Lexer::new(input, comment_style),
+            next_token: None,
+            last_token_loc: None,
+        }
+    }
+
+    pub fn loc(&self) -> Loc {
+        // After lookahead return the location of the next token
+        self.next_token
+            .as_ref()
+            .map(|t| t.loc.clone())
+            // After token consumed return the location of that token
+            .or(self.last_token_loc.clone())
+            // Otherwise return the position of lexer
+            .unwrap_or(self.lexer.loc)
+    }
+
+    pub fn lookahead_loc(&mut self) -> Loc {
+        drop(self.lookahead());
+        // TODO: does not handle EOF properly
+        self.loc()
+    }
+
+    fn lookahead(&mut self) -> TokenizerResult<Option<&Token>> {
+        Ok(match self.next_token {
+            Some(ref token) => Some(&token.token),
+            None => {
+                self.next_token = self.lexer.next_token()?;
+                self.last_token_loc = self.next_token.as_ref().map(|t| t.loc.clone());
+                match self.next_token {
+                    Some(ref token) => Some(&token.token),
+                    None => None,
+                }
+            }
+        })
+    }
+
+    pub fn lookahead_some(&mut self) -> TokenizerResult<&Token> {
+        match self.lookahead()? {
+            Some(token) => Ok(token),
+            None => Err(TokenizerError::UnexpectedEof),
+        }
+    }
+
+    fn next(&mut self) -> TokenizerResult<Option<Token>> {
+        self.lookahead()?;
+        Ok(self
+            .next_token
+            .take()
+            .map(|TokenWithLocation { token, .. }| token))
+    }
+
+    pub fn next_some(&mut self) -> TokenizerResult<Token> {
+        match self.next()? {
+            Some(token) => Ok(token),
+            None => Err(TokenizerError::UnexpectedEof),
+        }
+    }
+
+    /// Can be called only after lookahead, otherwise it's error
+    pub fn advance(&mut self) -> TokenizerResult<Token> {
+        self.next_token
+            .take()
+            .map(|TokenWithLocation { token, .. }| token)
+            .ok_or(TokenizerError::InternalError)
+    }
+
+    /// No more tokens
+    pub fn syntax_eof(&mut self) -> TokenizerResult<bool> {
+        Ok(self.lookahead()?.is_none())
+    }
+
+    pub fn next_token_if_map<P, R>(&mut self, p: P) -> TokenizerResult<Option<R>>
+    where
+        P: FnOnce(&Token) -> Option<R>,
+    {
+        self.lookahead()?;
+        let v = match self.next_token {
+            Some(ref token) => match p(&token.token) {
+                Some(v) => v,
+                None => return Ok(None),
+            },
+            _ => return Ok(None),
+        };
+        self.next_token = None;
+        Ok(Some(v))
+    }
+
+    pub fn next_token_check_map<P, R, E>(&mut self, p: P) -> Result<R, E>
+    where
+        P: FnOnce(&Token) -> Result<R, E>,
+        E: From<TokenizerError>,
+    {
+        self.lookahead()?;
+        let r = match self.next_token {
+            Some(ref token) => p(&token.token)?,
+            None => return Err(TokenizerError::UnexpectedEof.into()),
+        };
+        self.next_token = None;
+        Ok(r)
+    }
+
+    fn next_token_if<P>(&mut self, p: P) -> TokenizerResult<Option<Token>>
+    where
+        P: FnOnce(&Token) -> bool,
+    {
+        self.next_token_if_map(|token| if p(token) { Some(token.clone()) } else { None })
+    }
+
+    pub fn next_ident_if_in(&mut self, idents: &[&str]) -> TokenizerResult<Option<String>> {
+        let v = match self.lookahead()? {
+            Some(&Token::Ident(ref next)) => {
+                if idents.into_iter().find(|&i| i == next).is_some() {
+                    next.clone()
+                } else {
+                    return Ok(None);
+                }
+            }
+            _ => return Ok(None),
+        };
+        self.advance()?;
+        Ok(Some(v))
+    }
+
+    pub fn next_ident_if_eq(&mut self, word: &str) -> TokenizerResult<bool> {
+        Ok(self.next_ident_if_in(&[word])? != None)
+    }
+
+    pub fn next_ident_expect_eq(&mut self, word: &str) -> TokenizerResult<()> {
+        if self.next_ident_if_eq(word)? {
+            Ok(())
+        } else {
+            Err(TokenizerError::ExpectNamedIdent(word.to_owned()))
+        }
+    }
+
+    pub fn next_ident_if_eq_error(&mut self, word: &'static str) -> TokenizerResult<()> {
+        if self.clone().next_ident_if_eq(word)? {
+            // TODO: which context?
+            return Err(TokenizerError::NotAllowedInThisContext(word));
+        }
+        Ok(())
+    }
+
+    pub fn next_symbol_if_eq(&mut self, symbol: char) -> TokenizerResult<bool> {
+        Ok(self.next_token_if(|token| match token {
+            &Token::Symbol(c) if c == symbol => true,
+            _ => false,
+        })? != None)
+    }
+
+    pub fn next_symbol_expect_eq(
+        &mut self,
+        symbol: char,
+        desc: &'static str,
+    ) -> TokenizerResult<()> {
+        if self.lookahead_is_symbol(symbol)? {
+            self.advance()?;
+            Ok(())
+        } else {
+            Err(TokenizerError::ExpectChar(symbol, desc))
+        }
+    }
+
+    pub fn next_symbol_expect_eq_oneof(&mut self, symbols: &[char]) -> TokenizerResult<char> {
+        for symbol in symbols {
+            if let Ok(()) = self.next_symbol_expect_eq(*symbol, "ignored") {
+                return Ok(*symbol);
+            }
+        }
+        Err(TokenizerError::ExpectAnyChar(symbols.to_owned()))
+    }
+
+    pub fn lookahead_is_str_lit(&mut self) -> TokenizerResult<bool> {
+        Ok(match self.lookahead()? {
+            Some(&Token::StrLit(..)) => true,
+            _ => false,
+        })
+    }
+
+    pub fn lookahead_is_int_lit(&mut self) -> TokenizerResult<bool> {
+        Ok(match self.lookahead()? {
+            Some(&Token::IntLit(..)) => true,
+            _ => false,
+        })
+    }
+
+    pub fn lookahead_is_json_number(&mut self) -> TokenizerResult<bool> {
+        Ok(match self.lookahead()? {
+            Some(&Token::JsonNumber(..)) => true,
+            _ => false,
+        })
+    }
+
+    pub fn lookahead_if_symbol(&mut self) -> TokenizerResult<Option<char>> {
+        Ok(match self.lookahead()? {
+            Some(&Token::Symbol(c)) => Some(c),
+            _ => None,
+        })
+    }
+
+    pub fn lookahead_is_symbol(&mut self, symbol: char) -> TokenizerResult<bool> {
+        Ok(self.lookahead_if_symbol()? == Some(symbol))
+    }
+
+    pub fn lookahead_is_ident(&mut self, ident: &str) -> TokenizerResult<bool> {
+        Ok(match self.lookahead()? {
+            Some(Token::Ident(i)) => i == ident,
+            _ => false,
+        })
+    }
+
+    pub fn next_ident(&mut self) -> TokenizerResult<String> {
+        self.next_token_check_map(|token| match token {
+            &Token::Ident(ref ident) => Ok(ident.clone()),
+            _ => Err(TokenizerError::ExpectIdent),
+        })
+    }
+
+    pub fn next_str_lit(&mut self) -> TokenizerResult<StrLit> {
+        self.next_token_check_map(|token| match token {
+            &Token::StrLit(ref str_lit) => Ok(str_lit.clone()),
+            _ => Err(TokenizerError::ExpectStrLit),
+        })
+    }
+
+    pub fn next_int_lit(&mut self) -> TokenizerResult<u64> {
+        self.next_token_check_map(|token| match token {
+            &Token::IntLit(v) => Ok(v),
+            _ => Err(TokenizerError::ExpectIntLit),
+        })
+    }
+
+    pub fn next_float_lit(&mut self) -> TokenizerResult<f64> {
+        self.next_token_check_map(|token| match token {
+            &Token::FloatLit(v) => Ok(v),
+            _ => Err(TokenizerError::ExpectFloatLit),
+        })
+    }
+}
+
+#[cfg(test)]
+mod test {
+
+    use super::*;
+
+    fn tokenize<P, R>(input: &str, what: P) -> R
+    where
+        P: FnOnce(&mut Tokenizer) -> TokenizerResult<R>,
+    {
+        let mut tokenizer = Tokenizer::new(input, ParserLanguage::Proto);
+        let r = what(&mut tokenizer).expect(&format!("parse failed at {}", tokenizer.loc()));
+        let eof = tokenizer
+            .syntax_eof()
+            .expect(&format!("check eof failed at {}", tokenizer.loc()));
+        assert!(eof, "{}", tokenizer.loc());
+        r
+    }
+
+    #[test]
+    fn test_ident() {
+        let msg = r#" aabb_c "#;
+        let mess = tokenize(msg, |p| p.next_ident().map(|s| s.to_owned()));
+        assert_eq!("aabb_c", mess);
+    }
+
+    #[test]
+    fn test_str_lit() {
+        let msg = r#" "a\nb" "#;
+        let mess = tokenize(msg, |p| p.next_str_lit());
+        assert_eq!(
+            StrLit {
+                escaped: r#"a\nb"#.to_owned()
+            },
+            mess
+        );
+    }
+}
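The tokenizer layers one-token lookahead and typed accessors over the lexer. A sketch of parsing a tiny statement with the methods above (the input is hypothetical, not from the test suite):

    use protobuf_support::lexer::parser_language::ParserLanguage;
    use protobuf_support::lexer::tokenizer::Tokenizer;

    fn main() {
        let mut t = Tokenizer::new("option name = 17;", ParserLanguage::Proto);
        t.next_ident_expect_eq("option").unwrap();
        let name = t.next_ident().unwrap();
        t.next_symbol_expect_eq('=', "option").unwrap();
        let value = t.next_int_lit().unwrap();
        t.next_symbol_expect_eq(';', "option").unwrap();
        assert!(t.syntax_eof().unwrap());
        println!("{} = {}", name, value); // name = 17
    }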
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..63c2a8d
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,9 @@
+//! # Supporting code for protobuf crates
+//!
+//! Code in this crate is used in protobuf crates like `protobuf` or `protobuf-parse`.
+//! None of code in this crate has public API.
+
+pub mod json_name;
+pub mod lexer;
+pub mod text_format;
+pub mod toposort;
diff --git a/src/text_format.rs b/src/text_format.rs
new file mode 100644
index 0000000..f49f4c2
--- /dev/null
+++ b/src/text_format.rs
@@ -0,0 +1,75 @@
+pub fn escape_bytes_to(bytes: &[u8], buf: &mut String) {
+    for &c in bytes {
+        match c {
+            b'\n' => buf.push_str(r"\n"),
+            b'\r' => buf.push_str(r"\r"),
+            b'\t' => buf.push_str(r"\t"),
+            b'\'' => buf.push_str("\\\'"),
+            b'"' => buf.push_str("\\\""),
+            b'\\' => buf.push_str(r"\\"),
+            b'\x20'..=b'\x7e' => buf.push(c as char),
+            _ => {
+                buf.push('\\');
+                buf.push((b'0' + (c >> 6)) as char);
+                buf.push((b'0' + ((c >> 3) & 7)) as char);
+                buf.push((b'0' + (c & 7)) as char);
+            }
+        }
+    }
+}
+
+pub fn quote_bytes_to(bytes: &[u8], buf: &mut String) {
+    buf.push('"');
+    escape_bytes_to(bytes, buf);
+    buf.push('"');
+}
+
+#[cfg(test)]
+mod test {
+    use crate::lexer::str_lit::StrLit;
+    use crate::text_format::escape_bytes_to;
+
+    fn escape(data: &[u8]) -> String {
+        let mut s = String::with_capacity(data.len() * 4);
+        escape_bytes_to(data, &mut s);
+        s
+    }
+
+    fn unescape_string(escaped: &str) -> Vec<u8> {
+        StrLit {
+            escaped: escaped.to_owned(),
+        }
+        .decode_bytes()
+        .expect("decode_bytes")
+    }
+
+    fn test_escape_unescape(text: &str, escaped: &str) {
+        assert_eq!(text.as_bytes(), &unescape_string(escaped)[..]);
+        assert_eq!(escaped, &escape(text.as_bytes())[..]);
+    }
+
+    #[test]
+    fn test_print_to_bytes() {
+        assert_eq!("ab", escape(b"ab"));
+        assert_eq!("a\\\\023", escape(b"a\\023"));
+        assert_eq!("a\\r\\n\\t \\'\\\"\\\\", escape(b"a\r\n\t '\"\\"));
+        assert_eq!("\\344\\275\\240\\345\\245\\275", escape("你好".as_bytes()));
+    }
+
+    #[test]
+    fn test_unescape_string() {
+        test_escape_unescape("", "");
+        test_escape_unescape("aa", "aa");
+        test_escape_unescape("\n", "\\n");
+        test_escape_unescape("\r", "\\r");
+        test_escape_unescape("\t", "\\t");
+        test_escape_unescape("你好", "\\344\\275\\240\\345\\245\\275");
+        // hex
+        assert_eq!(b"aaa\x01bbb", &unescape_string("aaa\\x01bbb")[..]);
+        assert_eq!(b"aaa\xcdbbb", &unescape_string("aaa\\xCDbbb")[..]);
+        assert_eq!(b"aaa\xcdbbb", &unescape_string("aaa\\xCDbbb")[..]);
+        // quotes
+        assert_eq!(b"aaa\"bbb", &unescape_string("aaa\\\"bbb")[..]);
+        assert_eq!(b"aaa\'bbb", &unescape_string("aaa\\\'bbb")[..]);
+    }
+}
diff --git a/src/toposort.rs b/src/toposort.rs
new file mode 100644
index 0000000..5e44590
--- /dev/null
+++ b/src/toposort.rs
@@ -0,0 +1,119 @@
+use std::collections::HashSet;
+use std::hash::Hash;
+
+#[derive(Debug, thiserror::Error)]
+#[error("Cycle detected")]
+pub struct TopoSortCycle;
+
+pub fn toposort<K, I>(
+    input: impl IntoIterator<Item = K>,
+    deps: impl Fn(&K) -> I,
+) -> Result<Vec<K>, TopoSortCycle>
+where
+    K: Eq + Hash + Clone,
+    I: Iterator<Item = K>,
+{
+    struct Ts<K, I, D>
+    where
+        K: Eq + Hash + Clone,
+        I: Iterator<Item = K>,
+        D: Fn(&K) -> I,
+    {
+        result_set: HashSet<K>,
+        result: Vec<K>,
+        deps: D,
+        stack: HashSet<K>,
+    }
+
+    impl<K, I, D> Ts<K, I, D>
+    where
+        K: Eq + Hash + Clone,
+        I: Iterator<Item = K>,
+        D: Fn(&K) -> I,
+    {
+        fn visit(&mut self, i: &K) -> Result<(), TopoSortCycle> {
+            if self.result_set.contains(i) {
+                return Ok(());
+            }
+
+            if !self.stack.insert(i.clone()) {
+                return Err(TopoSortCycle);
+            }
+            for dep in (self.deps)(i) {
+                self.visit(&dep)?;
+            }
+
+            let removed = self.stack.remove(i);
+            assert!(removed);
+
+            self.result.push(i.clone());
+            self.result_set.insert(i.clone());
+
+            Ok(())
+        }
+    }
+
+    let mut ts = Ts {
+        result: Vec::new(),
+        result_set: HashSet::new(),
+        deps,
+        stack: HashSet::new(),
+    };
+
+    for i in input {
+        ts.visit(&i)?;
+    }
+
+    Ok(ts.result)
+}
+
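A sketch of the ordering contract: each key's dependencies come out before the key itself, and a cycle is reported as `TopoSortCycle` (hypothetical caller, keys are plain `&str`s):

    use protobuf_support::toposort::toposort;

    fn main() {
        // "c" depends on "b", "b" depends on "a"; dependencies sort first.
        let sorted = toposort(vec!["c", "b", "a"], |k: &&str| match *k {
            "c" => vec!["b"].into_iter(),
            "b" => vec!["a"].into_iter(),
            _ => Vec::new().into_iter(),
        })
        .unwrap();
        assert_eq!(vec!["a", "b", "c"], sorted);
    }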
+#[cfg(test)]
+mod tests {
+    use std::collections::HashMap;
+
+    use crate::toposort::toposort;
+    use crate::toposort::TopoSortCycle;
+
+    fn test_toposort(input: &str) -> Result<Vec<&str>, TopoSortCycle> {
+        let mut keys: Vec<&str> = Vec::new();
+        let mut edges: HashMap<&str, Vec<&str>> = HashMap::new();
+        for part in input.split(" ") {
+            match part.split_once("->") {
+                Some((k, vs)) => {
+                    keys.push(k);
+                    edges.insert(k, vs.split(",").collect());
+                }
+                None => keys.push(part),
+            };
+        }
+
+        toposort(keys, |k| {
+            edges
+                .get(k)
+                .map(|v| v.as_slice())
+                .unwrap_or_default()
+                .into_iter()
+                .copied()
+        })
+    }
+
+    fn test_toposort_check(input: &str, expected: &str) {
+        let sorted = test_toposort(input).unwrap();
+        let expected = expected.split(" ").collect::<Vec<_>>();
+        assert_eq!(expected, sorted);
+    }
+
+    #[test]
+    fn test() {
+        test_toposort_check("1 2 3", "1 2 3");
+        test_toposort_check("1->2 2->3 3", "3 2 1");
+        test_toposort_check("1 2->1 3->2", "1 2 3");
+        test_toposort_check("1->2,3 2->3 3", "3 2 1");
+    }
+
+    #[test]
+    fn cycle() {
+        assert!(test_toposort("1->1").is_err());
+        assert!(test_toposort("1->2 2->1").is_err());
+    }
+}
-- 
cgit v1.2.3