author    Jeff Vander Stoep <jeffv@google.com>  2023-03-09 10:32:38 +0100
committer Jeff Vander Stoep <jeffv@google.com>  2023-03-22 15:13:04 +0100
commit    4152c49a9432c5e51e891f1c34b7e20446496aea (patch)
tree      69869592aed06a9de8dc3070df31134b1710c5b6
parent    ae3bfb6e232d7c8753c197e2bf4c24d73537dca1 (diff)
Import protobuf-support 3.2.0
Test: Treehugger
Bug: 270895633
Change-Id: I0e7e700aaa7eab95a91757ea147b08b205db2d0c
-rw-r--r--  .cargo_vcs_info.json           6
-rw-r--r--  Android.bp                    24
-rw-r--r--  Cargo.toml                    35
-rw-r--r--  Cargo.toml.orig               24
-rw-r--r--  LICENSE                       19
-rw-r--r--  LICENSE.txt                   19
-rw-r--r--  METADATA                      19
-rw-r--r--  MODULE_LICENSE_MIT             0
-rw-r--r--  OWNERS                         1
-rw-r--r--  README.md                      8
-rw-r--r--  cargo2android.json            10
-rw-r--r--  src/json_name.rs              19
-rw-r--r--  src/lexer/float.rs            56
-rw-r--r--  src/lexer/int.rs              12
-rw-r--r--  src/lexer/json_number_lit.rs  10
-rw-r--r--  src/lexer/lexer_impl.rs      712
-rw-r--r--  src/lexer/loc.rs              28
-rw-r--r--  src/lexer/mod.rs              12
-rw-r--r--  src/lexer/num_lit.rs           5
-rw-r--r--  src/lexer/parser_language.rs  10
-rw-r--r--  src/lexer/str_lit.rs          77
-rw-r--r--  src/lexer/token.rs            47
-rw-r--r--  src/lexer/tokenizer.rs       330
-rw-r--r--  src/lib.rs                     9
-rw-r--r--  src/text_format.rs            75
-rw-r--r--  src/toposort.rs              119
26 files changed, 1686 insertions, 0 deletions
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json
new file mode 100644
index 0000000..1981be8
--- /dev/null
+++ b/.cargo_vcs_info.json
@@ -0,0 +1,6 @@
+{
+ "git": {
+ "sha1": "7155092f3df112159d55132081937e1fe5c30490"
+ },
+ "path_in_vcs": "protobuf-support"
+}
\ No newline at end of file
diff --git a/Android.bp b/Android.bp
new file mode 100644
index 0000000..678d57a
--- /dev/null
+++ b/Android.bp
@@ -0,0 +1,24 @@
+// This file is generated by cargo2android.py --config cargo2android.json.
+// Do not modify this file as changes will be overridden on upgrade.
+
+
+
+rust_library {
+ name: "libprotobuf_support",
+ host_supported: true,
+ crate_name: "protobuf_support",
+ cargo_env_compat: true,
+ cargo_pkg_version: "3.2.0",
+ srcs: ["src/lib.rs"],
+ edition: "2021",
+ rustlibs: [
+ "libthiserror",
+ ],
+ apex_available: [
+ "//apex_available:platform",
+ "//apex_available:anyapex",
+ ],
+ product_available: true,
+ vendor_available: true,
+ min_sdk_version: "29",
+}
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..0f7368b
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,35 @@
+# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
+#
+# When uploading crates to the registry Cargo will automatically
+# "normalize" Cargo.toml files for maximal compatibility
+# with all versions of Cargo and also rewrite `path` dependencies
+# to registry (e.g., crates.io) dependencies.
+#
+# If you are reading this file be aware that the original Cargo.toml
+# will likely look very different (and much more reasonable).
+# See Cargo.toml.orig for the original contents.
+
+[package]
+edition = "2021"
+name = "protobuf-support"
+version = "3.2.0"
+authors = ["Stepan Koltsov <stepan.koltsov@gmail.com>"]
+description = """
+Code supporting protobuf implementation. None of the code in this crate is public API.
+"""
+homepage = "https://github.com/stepancheg/rust-protobuf/"
+documentation = "https://github.com/stepancheg/rust-protobuf/blob/master/README.md"
+readme = "README.md"
+license = "MIT"
+repository = "https://github.com/stepancheg/rust-protobuf/"
+
+[package.metadata.docs.rs]
+all-features = true
+
+[lib]
+bench = false
+
+[dependencies.thiserror]
+version = "1.0.30"
+
+[features]
diff --git a/Cargo.toml.orig b/Cargo.toml.orig
new file mode 100644
index 0000000..ee6fc99
--- /dev/null
+++ b/Cargo.toml.orig
@@ -0,0 +1,24 @@
+[package]
+
+name = "protobuf-support"
+version = "3.2.0"
+authors = ["Stepan Koltsov <stepan.koltsov@gmail.com>"]
+edition = "2021"
+license = "MIT"
+homepage = "https://github.com/stepancheg/rust-protobuf/"
+repository = "https://github.com/stepancheg/rust-protobuf/"
+documentation = "https://github.com/stepancheg/rust-protobuf/blob/master/README.md"
+description = """
+Code supporting protobuf implementation. None of the code in this crate is public API.
+"""
+
+[lib]
+bench = false
+
+[features]
+
+[dependencies]
+thiserror = "1.0.30"
+
+[package.metadata.docs.rs]
+all-features = true
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..acce639
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,19 @@
+Copyright (c) 2019 Stepan Koltsov
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
+OR OTHER DEALINGS IN THE SOFTWARE.
\ No newline at end of file
diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 0000000..acce639
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,19 @@
+Copyright (c) 2019 Stepan Koltsov
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
+OR OTHER DEALINGS IN THE SOFTWARE.
\ No newline at end of file
diff --git a/METADATA b/METADATA
new file mode 100644
index 0000000..517ef56
--- /dev/null
+++ b/METADATA
@@ -0,0 +1,19 @@
+name: "protobuf-support"
+description: "()"
+third_party {
+ url {
+ type: HOMEPAGE
+ value: "https://crates.io/crates/protobuf-support"
+ }
+ url {
+ type: ARCHIVE
+ value: "https://static.crates.io/crates/protobuf-support/protobuf-support-3.2.0.crate"
+ }
+ version: "3.2.0"
+ license_type: NOTICE
+ last_upgrade_date {
+ year: 2023
+ month: 2
+ day: 27
+ }
+}
diff --git a/MODULE_LICENSE_MIT b/MODULE_LICENSE_MIT
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/MODULE_LICENSE_MIT
diff --git a/OWNERS b/OWNERS
new file mode 100644
index 0000000..45dc4dd
--- /dev/null
+++ b/OWNERS
@@ -0,0 +1 @@
+include platform/prebuilts/rust:master:/OWNERS
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..88bdebb
--- /dev/null
+++ b/README.md
@@ -0,0 +1,8 @@
+<!-- cargo-sync-readme start -->
+
+# Supporting code for protobuf crates
+
+Code in this crate is used in protobuf crates like `protobuf` or `protobuf-parse`.
+None of the code in this crate has a public API.
+
+<!-- cargo-sync-readme end -->
diff --git a/cargo2android.json b/cargo2android.json
new file mode 100644
index 0000000..26ba70d
--- /dev/null
+++ b/cargo2android.json
@@ -0,0 +1,10 @@
+{
+ "apex-available": [
+ "//apex_available:platform",
+ "//apex_available:anyapex"
+ ],
+ "min-sdk-version": "29",
+ "dependencies": true,
+ "device": true,
+ "run": true
+}
diff --git a/src/json_name.rs b/src/json_name.rs
new file mode 100644
index 0000000..f5c9364
--- /dev/null
+++ b/src/json_name.rs
@@ -0,0 +1,19 @@
+/// Implementation must match exactly
+/// `ToJsonName()` function in C++ `descriptor.cc`.
+pub fn json_name(input: &str) -> String {
+ let mut capitalize_next = false;
+ let mut result = String::with_capacity(input.len());
+
+ for c in input.chars() {
+ if c == '_' {
+ capitalize_next = true;
+ } else if capitalize_next {
+ result.extend(c.to_uppercase());
+ capitalize_next = false;
+ } else {
+ result.push(c);
+ }
+ }
+
+ result
+}
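
For reference, a minimal sketch of what `json_name` computes, written as a hypothetical caller (not part of this import): each underscore is dropped and the character after it is upper-cased, matching `ToJsonName()` in C++ `descriptor.cc`.

    use protobuf_support::json_name::json_name;

    fn main() {
        // "_" is consumed and the next character is capitalized.
        assert_eq!("fooBarBaz", json_name("foo_bar_baz"));
        // Input without underscores passes through unchanged.
        assert_eq!("fooBar", json_name("fooBar"));
    }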
diff --git a/src/lexer/float.rs b/src/lexer/float.rs
new file mode 100644
index 0000000..f09c101
--- /dev/null
+++ b/src/lexer/float.rs
@@ -0,0 +1,56 @@
+#[derive(Debug)]
+pub enum ProtobufFloatParseError {
+ EmptyString,
+ CannotParseFloat,
+}
+
+pub type ProtobufFloatParseResult<T> = Result<T, ProtobufFloatParseError>;
+
+pub const PROTOBUF_NAN: &str = "nan";
+pub const PROTOBUF_INF: &str = "inf";
+
+/// Format float as in protobuf `.proto` files
+pub fn format_protobuf_float(f: f64) -> String {
+ if f.is_nan() {
+ PROTOBUF_NAN.to_owned()
+ } else if f.is_infinite() {
+ if f > 0.0 {
+ format!("{}", PROTOBUF_INF)
+ } else {
+ format!("-{}", PROTOBUF_INF)
+ }
+ } else {
+ // TODO: make sure this does not lose precision
+ format!("{}", f)
+ }
+}
+
+/// Parse float from `.proto` format
+pub fn parse_protobuf_float(s: &str) -> ProtobufFloatParseResult<f64> {
+ if s.is_empty() {
+ return Err(ProtobufFloatParseError::EmptyString);
+ }
+ if s == PROTOBUF_NAN {
+ return Ok(f64::NAN);
+ }
+ if s == PROTOBUF_INF || s == format!("+{}", PROTOBUF_INF) {
+ return Ok(f64::INFINITY);
+ }
+ if s == format!("-{}", PROTOBUF_INF) {
+ return Ok(f64::NEG_INFINITY);
+ }
+ match s.parse() {
+ Ok(f) => Ok(f),
+ Err(_) => Err(ProtobufFloatParseError::CannotParseFloat),
+ }
+}
+
+#[cfg(test)]
+mod test {
+ use super::*;
+
+ #[test]
+ fn test_format_protobuf_float() {
+ assert_eq!("10", format_protobuf_float(10.0));
+ }
+}
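
A usage sketch (hypothetical caller): `parse_protobuf_float` special-cases `inf`, `+inf`, `-inf` and `nan` before falling back to Rust's ordinary `f64` parser, and `format_protobuf_float` emits the same spellings.

    use protobuf_support::lexer::float::{format_protobuf_float, parse_protobuf_float};

    fn main() {
        assert_eq!(f64::INFINITY, parse_protobuf_float("inf").unwrap());
        assert_eq!(f64::NEG_INFINITY, parse_protobuf_float("-inf").unwrap());
        assert!(parse_protobuf_float("nan").unwrap().is_nan());
        // Finite values go through the ordinary float parser and formatter.
        assert_eq!("-inf", format_protobuf_float(f64::NEG_INFINITY));
        assert_eq!("1.5", format_protobuf_float(1.5));
    }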
diff --git a/src/lexer/int.rs b/src/lexer/int.rs
new file mode 100644
index 0000000..676c1ba
--- /dev/null
+++ b/src/lexer/int.rs
@@ -0,0 +1,12 @@
+pub struct Overflow;
+
+/// Negate `u64` checking for overflow.
+pub fn neg(value: u64) -> Result<i64, Overflow> {
+ if value <= 0x7fff_ffff_ffff_ffff {
+ Ok(-(value as i64))
+ } else if value == 0x8000_0000_0000_0000 {
+ Ok(-0x8000_0000_0000_0000)
+ } else {
+ Err(Overflow)
+ }
+}
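
The `0x8000_0000_0000_0000` branch exists because `i64::MIN` has no positive counterpart, so `-(value as i64)` would be wrong exactly there. A boundary sketch (hypothetical caller):

    use protobuf_support::lexer::int::neg;

    fn main() {
        assert_eq!(Some(-1), neg(1).ok());
        // 2^63 negates to i64::MIN, which the special case handles.
        assert_eq!(Some(i64::MIN), neg(0x8000_0000_0000_0000).ok());
        // Anything larger cannot be represented as a negative i64.
        assert!(neg(0x8000_0000_0000_0001).is_err());
    }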
diff --git a/src/lexer/json_number_lit.rs b/src/lexer/json_number_lit.rs
new file mode 100644
index 0000000..1323517
--- /dev/null
+++ b/src/lexer/json_number_lit.rs
@@ -0,0 +1,10 @@
+use std::fmt;
+
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct JsonNumberLit(pub String);
+
+impl fmt::Display for JsonNumberLit {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ fmt::Display::fmt(&self.0, f)
+ }
+}
diff --git a/src/lexer/lexer_impl.rs b/src/lexer/lexer_impl.rs
new file mode 100644
index 0000000..4990c8f
--- /dev/null
+++ b/src/lexer/lexer_impl.rs
@@ -0,0 +1,712 @@
+use std::char;
+use std::convert::TryFrom;
+use std::num::ParseFloatError;
+use std::num::ParseIntError;
+
+use crate::lexer::float;
+use crate::lexer::float::ProtobufFloatParseError;
+use crate::lexer::json_number_lit::JsonNumberLit;
+use crate::lexer::loc::Loc;
+use crate::lexer::loc::FIRST_COL;
+use crate::lexer::parser_language::ParserLanguage;
+use crate::lexer::str_lit::StrLit;
+use crate::lexer::str_lit::StrLitDecodeError;
+use crate::lexer::token::Token;
+use crate::lexer::token::TokenWithLocation;
+
+#[derive(Debug, thiserror::Error)]
+pub enum LexerError {
+ // TODO: something better than this
+ #[error("Incorrect input")]
+ IncorrectInput,
+ #[error("Unexpected EOF")]
+ UnexpectedEof,
+ #[error("Expecting char: {:?}", .0)]
+ ExpectChar(char),
+ #[error("Parse int error")]
+ ParseIntError,
+ #[error("Parse float error")]
+ ParseFloatError,
+ // TODO: how is this different from ParseFloatError?
+ #[error("Incorrect float literal")]
+ IncorrectFloatLit,
+ #[error("Incorrect JSON escape")]
+ IncorrectJsonEscape,
+ #[error("Incorrect JSON number")]
+ IncorrectJsonNumber,
+ #[error("Incorrect Unicode character")]
+ IncorrectUnicodeChar,
+ #[error("Expecting hex digit")]
+ ExpectHexDigit,
+ #[error("Expecting oct digit")]
+ ExpectOctDigit,
+ #[error("Expecting dec digit")]
+ ExpectDecDigit,
+ #[error(transparent)]
+ StrLitDecodeError(#[from] StrLitDecodeError),
+ #[error("Expecting identifier")]
+ ExpectedIdent,
+}
+
+pub type LexerResult<T> = Result<T, LexerError>;
+
+impl From<ParseIntError> for LexerError {
+ fn from(_: ParseIntError) -> Self {
+ LexerError::ParseIntError
+ }
+}
+
+impl From<ParseFloatError> for LexerError {
+ fn from(_: ParseFloatError) -> Self {
+ LexerError::ParseFloatError
+ }
+}
+
+impl From<ProtobufFloatParseError> for LexerError {
+ fn from(_: ProtobufFloatParseError) -> Self {
+ LexerError::IncorrectFloatLit
+ }
+}
+
+#[derive(Copy, Clone)]
+pub struct Lexer<'a> {
+ language: ParserLanguage,
+ input: &'a str,
+ pos: usize,
+ pub loc: Loc,
+}
+
+fn is_letter(c: char) -> bool {
+ c.is_alphabetic() || c == '_'
+}
+
+impl<'a> Lexer<'a> {
+ pub fn new(input: &'a str, language: ParserLanguage) -> Lexer<'a> {
+ Lexer {
+ language,
+ input,
+ pos: 0,
+ loc: Loc::start(),
+ }
+ }
+
+ /// No more chars
+ pub fn eof(&self) -> bool {
+ self.pos == self.input.len()
+ }
+
+ /// Remaining chars
+ fn rem_chars(&self) -> &'a str {
+ &self.input[self.pos..]
+ }
+
+ pub fn lookahead_char_is<P: FnOnce(char) -> bool>(&self, p: P) -> bool {
+ self.lookahead_char().map_or(false, p)
+ }
+
+ fn lookahead_char_is_in(&self, alphabet: &str) -> bool {
+ self.lookahead_char_is(|c| alphabet.contains(c))
+ }
+
+ fn next_char_opt(&mut self) -> Option<char> {
+ let rem = self.rem_chars();
+ if rem.is_empty() {
+ None
+ } else {
+ let mut char_indices = rem.char_indices();
+ let (_, c) = char_indices.next().unwrap();
+ let c_len = char_indices.next().map(|(len, _)| len).unwrap_or(rem.len());
+ self.pos += c_len;
+ if c == '\n' {
+ self.loc.line += 1;
+ self.loc.col = FIRST_COL;
+ } else {
+ self.loc.col += 1;
+ }
+ Some(c)
+ }
+ }
+
+ fn next_char(&mut self) -> LexerResult<char> {
+ self.next_char_opt().ok_or(LexerError::UnexpectedEof)
+ }
+
+ /// Skip whitespaces
+ fn skip_whitespaces(&mut self) {
+ self.take_while(|c| c.is_whitespace());
+ }
+
+ fn skip_c_comment(&mut self) -> LexerResult<()> {
+ if self.skip_if_lookahead_is_str("/*") {
+ let end = "*/";
+ match self.rem_chars().find(end) {
+ None => Err(LexerError::UnexpectedEof),
+ Some(len) => {
+ let new_pos = self.pos + len + end.len();
+ self.skip_to_pos(new_pos);
+ Ok(())
+ }
+ }
+ } else {
+ Ok(())
+ }
+ }
+
+ fn skip_cpp_comment(&mut self) {
+ if self.skip_if_lookahead_is_str("//") {
+ loop {
+ match self.next_char_opt() {
+ Some('\n') | None => break,
+ _ => {}
+ }
+ }
+ }
+ }
+
+ fn skip_sh_comment(&mut self) {
+ if self.skip_if_lookahead_is_str("#") {
+ loop {
+ match self.next_char_opt() {
+ Some('\n') | None => break,
+ _ => {}
+ }
+ }
+ }
+ }
+
+ fn skip_comment(&mut self) -> LexerResult<()> {
+ match self.language {
+ ParserLanguage::Proto => {
+ self.skip_c_comment()?;
+ self.skip_cpp_comment();
+ }
+ ParserLanguage::TextFormat => {
+ self.skip_sh_comment();
+ }
+ ParserLanguage::Json => {}
+ }
+ Ok(())
+ }
+
+ pub fn skip_ws(&mut self) -> LexerResult<()> {
+ loop {
+ let pos = self.pos;
+ self.skip_whitespaces();
+ self.skip_comment()?;
+ if pos == self.pos {
+ // Did not advance
+ return Ok(());
+ }
+ }
+ }
+
+ pub fn take_while<F>(&mut self, f: F) -> &'a str
+ where
+ F: Fn(char) -> bool,
+ {
+ let start = self.pos;
+ while self.lookahead_char().map(&f) == Some(true) {
+ self.next_char_opt().unwrap();
+ }
+ let end = self.pos;
+ &self.input[start..end]
+ }
+
+ fn lookahead_char(&self) -> Option<char> {
+ self.clone().next_char_opt()
+ }
+
+ fn lookahead_is_str(&self, s: &str) -> bool {
+ self.rem_chars().starts_with(s)
+ }
+
+ fn skip_if_lookahead_is_str(&mut self, s: &str) -> bool {
+ if self.lookahead_is_str(s) {
+ let new_pos = self.pos + s.len();
+ self.skip_to_pos(new_pos);
+ true
+ } else {
+ false
+ }
+ }
+
+ fn next_char_if<P>(&mut self, p: P) -> Option<char>
+ where
+ P: FnOnce(char) -> bool,
+ {
+ let mut clone = self.clone();
+ match clone.next_char_opt() {
+ Some(c) if p(c) => {
+ *self = clone;
+ Some(c)
+ }
+ _ => None,
+ }
+ }
+
+ pub fn next_char_if_eq(&mut self, expect: char) -> bool {
+ self.next_char_if(|c| c == expect) != None
+ }
+
+ fn next_char_if_in(&mut self, alphabet: &str) -> Option<char> {
+ for c in alphabet.chars() {
+ if self.next_char_if_eq(c) {
+ return Some(c);
+ }
+ }
+ None
+ }
+
+ fn next_char_expect_eq(&mut self, expect: char) -> LexerResult<()> {
+ if self.next_char_if_eq(expect) {
+ Ok(())
+ } else {
+ Err(LexerError::ExpectChar(expect))
+ }
+ }
+
+ fn next_char_expect<P>(&mut self, expect: P, err: LexerError) -> LexerResult<char>
+ where
+ P: FnOnce(char) -> bool,
+ {
+ self.next_char_if(expect).ok_or(err)
+ }
+
+ // str functions
+
+ /// properly update line and column
+ fn skip_to_pos(&mut self, new_pos: usize) -> &'a str {
+ assert!(new_pos >= self.pos);
+ assert!(new_pos <= self.input.len());
+ let pos = self.pos;
+ while self.pos != new_pos {
+ self.next_char_opt().unwrap();
+ }
+ &self.input[pos..new_pos]
+ }
+
+ // Protobuf grammar
+
+ // char functions
+
+ // letter = "A" … "Z" | "a" … "z"
+ // https://github.com/google/protobuf/issues/4565
+ fn next_letter_opt(&mut self) -> Option<char> {
+ self.next_char_if(is_letter)
+ }
+
+ // capitalLetter = "A" … "Z"
+ fn _next_capital_letter_opt(&mut self) -> Option<char> {
+ self.next_char_if(|c| c >= 'A' && c <= 'Z')
+ }
+
+ fn next_ident_part(&mut self) -> Option<char> {
+ self.next_char_if(|c| c.is_ascii_alphanumeric() || c == '_')
+ }
+
+ // Identifiers
+
+ // ident = letter { letter | decimalDigit | "_" }
+ fn next_ident_opt(&mut self) -> LexerResult<Option<String>> {
+ if let Some(c) = self.next_letter_opt() {
+ let mut ident = String::new();
+ ident.push(c);
+ while let Some(c) = self.next_ident_part() {
+ ident.push(c);
+ }
+ Ok(Some(ident))
+ } else {
+ Ok(None)
+ }
+ }
+
+ // Integer literals
+
+ // hexLit = "0" ( "x" | "X" ) hexDigit { hexDigit }
+ fn next_hex_lit_opt(&mut self) -> LexerResult<Option<u64>> {
+ Ok(
+ if self.skip_if_lookahead_is_str("0x") || self.skip_if_lookahead_is_str("0X") {
+ let s = self.take_while(|c| c.is_ascii_hexdigit());
+ Some(u64::from_str_radix(s, 16)? as u64)
+ } else {
+ None
+ },
+ )
+ }
+
+ // decimalLit = ( "1" … "9" ) { decimalDigit }
+ // octalLit = "0" { octalDigit }
+ fn next_decimal_octal_lit_opt(&mut self) -> LexerResult<Option<u64>> {
+ // do not advance on number parse error
+ let mut clone = self.clone();
+
+ let pos = clone.pos;
+
+ Ok(if clone.next_char_if(|c| c.is_ascii_digit()) != None {
+ clone.take_while(|c| c.is_ascii_digit());
+ let value = clone.input[pos..clone.pos].parse()?;
+ *self = clone;
+ Some(value)
+ } else {
+ None
+ })
+ }
+
+ // hexDigit = "0" … "9" | "A" … "F" | "a" … "f"
+ fn next_hex_digit(&mut self) -> LexerResult<u32> {
+ let mut clone = self.clone();
+ let r = match clone.next_char()? {
+ c if c >= '0' && c <= '9' => c as u32 - b'0' as u32,
+ c if c >= 'A' && c <= 'F' => c as u32 - b'A' as u32 + 10,
+ c if c >= 'a' && c <= 'f' => c as u32 - b'a' as u32 + 10,
+ _ => return Err(LexerError::ExpectHexDigit),
+ };
+ *self = clone;
+ Ok(r)
+ }
+
+ // octalDigit = "0" … "7"
+ fn next_octal_digit(&mut self) -> LexerResult<u32> {
+ self.next_char_expect(|c| c >= '0' && c <= '7', LexerError::ExpectOctDigit)
+ .map(|c| c as u32 - '0' as u32)
+ }
+
+ // decimalDigit = "0" … "9"
+ fn next_decimal_digit(&mut self) -> LexerResult<u32> {
+ self.next_char_expect(|c| c >= '0' && c <= '9', LexerError::ExpectDecDigit)
+ .map(|c| c as u32 - '0' as u32)
+ }
+
+ // decimals = decimalDigit { decimalDigit }
+ fn next_decimal_digits(&mut self) -> LexerResult<()> {
+ self.next_decimal_digit()?;
+ self.take_while(|c| c >= '0' && c <= '9');
+ Ok(())
+ }
+
+ // intLit = decimalLit | octalLit | hexLit
+ pub fn next_int_lit_opt(&mut self) -> LexerResult<Option<u64>> {
+ assert_ne!(ParserLanguage::Json, self.language);
+
+ self.skip_ws()?;
+ if let Some(i) = self.next_hex_lit_opt()? {
+ return Ok(Some(i));
+ }
+ if let Some(i) = self.next_decimal_octal_lit_opt()? {
+ return Ok(Some(i));
+ }
+ Ok(None)
+ }
+
+ // Floating-point literals
+
+ // exponent = ( "e" | "E" ) [ "+" | "-" ] decimals
+ fn next_exponent_opt(&mut self) -> LexerResult<Option<()>> {
+ if self.next_char_if_in("eE") != None {
+ self.next_char_if_in("+-");
+ self.next_decimal_digits()?;
+ Ok(Some(()))
+ } else {
+ Ok(None)
+ }
+ }
+
+ // floatLit = ( decimals "." [ decimals ] [ exponent ] | decimals exponent | "."decimals [ exponent ] ) | "inf" | "nan"
+ fn next_float_lit(&mut self) -> LexerResult<()> {
+ assert_ne!(ParserLanguage::Json, self.language);
+
+ // "inf" and "nan" are handled as part of ident
+ if self.next_char_if_eq('.') {
+ self.next_decimal_digits()?;
+ self.next_exponent_opt()?;
+ } else {
+ self.next_decimal_digits()?;
+ if self.next_char_if_eq('.') {
+ self.next_decimal_digits()?;
+ self.next_exponent_opt()?;
+ } else {
+ if self.next_exponent_opt()? == None {
+ return Err(LexerError::IncorrectFloatLit);
+ }
+ }
+ }
+ Ok(())
+ }
+
+ // String literals
+
+ // charValue = hexEscape | octEscape | charEscape | /[^\0\n\\]/
+ // hexEscape = '\' ( "x" | "X" ) hexDigit hexDigit
+ // https://github.com/google/protobuf/issues/4560
+ // octEscape = '\' octalDigit octalDigit octalDigit
+ // charEscape = '\' ( "a" | "b" | "f" | "n" | "r" | "t" | "v" | '\' | "'" | '"' )
+ // quote = "'" | '"'
+ pub fn next_byte_value(&mut self) -> LexerResult<u8> {
+ match self.next_char()? {
+ '\\' => {
+ match self.next_char()? {
+ '\'' => Ok(b'\''),
+ '"' => Ok(b'"'),
+ '\\' => Ok(b'\\'),
+ 'a' => Ok(b'\x07'),
+ 'b' => Ok(b'\x08'),
+ 'f' => Ok(b'\x0c'),
+ 'n' => Ok(b'\n'),
+ 'r' => Ok(b'\r'),
+ 't' => Ok(b'\t'),
+ 'v' => Ok(b'\x0b'),
+ 'x' => {
+ let d1 = self.next_hex_digit()? as u8;
+ let d2 = self.next_hex_digit()? as u8;
+ Ok(((d1 << 4) | d2) as u8)
+ }
+ d if d >= '0' && d <= '7' => {
+ let mut r = d as u8 - b'0';
+ for _ in 0..2 {
+ match self.next_octal_digit() {
+ Err(_) => break,
+ Ok(d) => r = (r << 3) + d as u8,
+ }
+ }
+ Ok(r)
+ }
+ // https://github.com/google/protobuf/issues/4562
+ // TODO: overflow
+ c => Ok(c as u8),
+ }
+ }
+ '\n' | '\0' => Err(LexerError::IncorrectInput),
+ // TODO: check overflow
+ c => Ok(c as u8),
+ }
+ }
+
+ fn char_try_from(i: u32) -> LexerResult<char> {
+ char::try_from(i).map_err(|_| LexerError::IncorrectUnicodeChar)
+ }
+
+ pub fn next_json_char_value(&mut self) -> LexerResult<char> {
+ match self.next_char()? {
+ '\\' => match self.next_char()? {
+ '"' => Ok('"'),
+ '\'' => Ok('\''),
+ '\\' => Ok('\\'),
+ '/' => Ok('/'),
+ 'b' => Ok('\x08'),
+ 'f' => Ok('\x0c'),
+ 'n' => Ok('\n'),
+ 'r' => Ok('\r'),
+ 't' => Ok('\t'),
+ 'u' => {
+ let mut v = 0;
+ for _ in 0..4 {
+ let digit = self.next_hex_digit()?;
+ v = v * 16 + digit;
+ }
+ Self::char_try_from(v)
+ }
+ _ => Err(LexerError::IncorrectJsonEscape),
+ },
+ c => Ok(c),
+ }
+ }
+
+ // https://github.com/google/protobuf/issues/4564
+ // strLit = ( "'" { charValue } "'" ) | ( '"' { charValue } '"' )
+ fn next_str_lit_raw(&mut self) -> LexerResult<String> {
+ let mut raw = String::new();
+
+ let mut first = true;
+ loop {
+ if !first {
+ self.skip_ws()?;
+ }
+
+ let start = self.pos;
+
+ let q = match self.next_char_if_in("'\"") {
+ Some(q) => q,
+ None if !first => break,
+ None => return Err(LexerError::IncorrectInput),
+ };
+ first = false;
+ while self.lookahead_char() != Some(q) {
+ self.next_byte_value()?;
+ }
+ self.next_char_expect_eq(q)?;
+
+ raw.push_str(&self.input[start + 1..self.pos - 1]);
+ }
+ Ok(raw)
+ }
+
+ fn next_str_lit_raw_opt(&mut self) -> LexerResult<Option<String>> {
+ if self.lookahead_char_is_in("'\"") {
+ Ok(Some(self.next_str_lit_raw()?))
+ } else {
+ Ok(None)
+ }
+ }
+
+ /// Parse next token as JSON number
+ fn next_json_number_opt(&mut self) -> LexerResult<Option<JsonNumberLit>> {
+ assert_eq!(ParserLanguage::Json, self.language);
+
+ fn is_digit(c: char) -> bool {
+ c >= '0' && c <= '9'
+ }
+
+ fn is_digit_1_9(c: char) -> bool {
+ c >= '1' && c <= '9'
+ }
+
+ if !self.lookahead_char_is_in("-0123456789") {
+ return Ok(None);
+ }
+
+ let mut s = String::new();
+ if self.next_char_if_eq('-') {
+ s.push('-');
+ }
+
+ if self.next_char_if_eq('0') {
+ s.push('0');
+ } else {
+ s.push(self.next_char_expect(is_digit_1_9, LexerError::IncorrectJsonNumber)?);
+ while let Some(c) = self.next_char_if(is_digit) {
+ s.push(c);
+ }
+ }
+
+ if self.next_char_if_eq('.') {
+ s.push('.');
+ s.push(self.next_char_expect(is_digit, LexerError::IncorrectJsonNumber)?);
+ while let Some(c) = self.next_char_if(is_digit) {
+ s.push(c);
+ }
+ }
+
+ if let Some(c) = self.next_char_if_in("eE") {
+ s.push(c);
+ if let Some(c) = self.next_char_if_in("+-") {
+ s.push(c);
+ }
+ s.push(self.next_char_expect(is_digit, LexerError::IncorrectJsonNumber)?);
+ while let Some(c) = self.next_char_if(is_digit) {
+ s.push(c);
+ }
+ }
+
+ Ok(Some(JsonNumberLit(s)))
+ }
+
+ fn next_token_inner(&mut self) -> LexerResult<Token> {
+ if self.language == ParserLanguage::Json {
+ if let Some(v) = self.next_json_number_opt()? {
+ return Ok(Token::JsonNumber(v));
+ }
+ }
+
+ if let Some(ident) = self.next_ident_opt()? {
+ let token = if self.language != ParserLanguage::Json && ident == float::PROTOBUF_NAN {
+ Token::FloatLit(f64::NAN)
+ } else if self.language != ParserLanguage::Json && ident == float::PROTOBUF_INF {
+ Token::FloatLit(f64::INFINITY)
+ } else {
+ Token::Ident(ident.to_owned())
+ };
+ return Ok(token);
+ }
+
+ if self.language != ParserLanguage::Json {
+ let mut clone = self.clone();
+ let pos = clone.pos;
+ if let Ok(_) = clone.next_float_lit() {
+ let f = float::parse_protobuf_float(&self.input[pos..clone.pos])?;
+ *self = clone;
+ return Ok(Token::FloatLit(f));
+ }
+
+ if let Some(lit) = self.next_int_lit_opt()? {
+ return Ok(Token::IntLit(lit));
+ }
+ }
+
+ if let Some(escaped) = self.next_str_lit_raw_opt()? {
+ return Ok(Token::StrLit(StrLit { escaped }));
+ }
+
+ // This branch must be after str lit
+ if let Some(c) = self.next_char_if(|c| c.is_ascii_punctuation()) {
+ return Ok(Token::Symbol(c));
+ }
+
+ if let Some(ident) = self.next_ident_opt()? {
+ return Ok(Token::Ident(ident));
+ }
+
+ Err(LexerError::IncorrectInput)
+ }
+
+ pub fn next_token(&mut self) -> LexerResult<Option<TokenWithLocation>> {
+ self.skip_ws()?;
+ let loc = self.loc;
+
+ Ok(if self.eof() {
+ None
+ } else {
+ let token = self.next_token_inner()?;
+ // Skip whitespace here to update location
+ // to the beginning of the next token
+ self.skip_ws()?;
+ Some(TokenWithLocation { token, loc })
+ })
+ }
+}
+
+#[cfg(test)]
+mod test {
+ use super::*;
+
+ fn lex<P, R>(input: &str, parse_what: P) -> R
+ where
+ P: FnOnce(&mut Lexer) -> LexerResult<R>,
+ {
+ let mut lexer = Lexer::new(input, ParserLanguage::Proto);
+ let r = parse_what(&mut lexer).expect(&format!("lexer failed at {}", lexer.loc));
+ assert!(lexer.eof(), "check eof failed at {}", lexer.loc);
+ r
+ }
+
+ fn lex_opt<P, R>(input: &str, parse_what: P) -> R
+ where
+ P: FnOnce(&mut Lexer) -> LexerResult<Option<R>>,
+ {
+ let mut lexer = Lexer::new(input, ParserLanguage::Proto);
+ let o = parse_what(&mut lexer).expect(&format!("lexer failed at {}", lexer.loc));
+ let r = o.expect(&format!("lexer returned none at {}", lexer.loc));
+ assert!(lexer.eof(), "check eof failed at {}", lexer.loc);
+ r
+ }
+
+ #[test]
+ fn test_lexer_int_lit() {
+ let msg = r#"10"#;
+ let mess = lex_opt(msg, |p| p.next_int_lit_opt());
+ assert_eq!(10, mess);
+ }
+
+ #[test]
+ fn test_lexer_float_lit() {
+ let msg = r#"12.3"#;
+ let mess = lex(msg, |p| p.next_token_inner());
+ assert_eq!(Token::FloatLit(12.3), mess);
+ }
+
+ #[test]
+ fn test_lexer_float_lit_leading_zeros_in_exp() {
+ let msg = r#"1e00009"#;
+ let mess = lex(msg, |p| p.next_token_inner());
+ assert_eq!(Token::FloatLit(1_000_000_000.0), mess);
+ }
+}
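
A sketch of driving the lexer by hand (hypothetical caller; the input fragment is made up):

    use protobuf_support::lexer::lexer_impl::Lexer;
    use protobuf_support::lexer::parser_language::ParserLanguage;
    use protobuf_support::lexer::token::Token;

    fn main() {
        let mut lexer = Lexer::new("optional int32 x = 1;", ParserLanguage::Proto);
        let mut tokens = Vec::new();
        // `next_token` returns `None` at EOF and skips whitespace/comments.
        while let Some(t) = lexer.next_token().unwrap() {
            tokens.push(t.token);
        }
        assert_eq!(Token::Ident("optional".to_owned()), tokens[0]);
        assert_eq!(Token::IntLit(1), tokens[4]);
        assert_eq!(Token::Symbol(';'), tokens[5]);
    }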
diff --git a/src/lexer/loc.rs b/src/lexer/loc.rs
new file mode 100644
index 0000000..ea3fc1a
--- /dev/null
+++ b/src/lexer/loc.rs
@@ -0,0 +1,28 @@
+use std::fmt;
+
+pub const FIRST_LINE: u32 = 1;
+pub const FIRST_COL: u32 = 1;
+
+/// Location in file
+#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd)]
+pub struct Loc {
+ /// 1-based
+ pub line: u32,
+ /// 1-based
+ pub col: u32,
+}
+
+impl fmt::Display for Loc {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ write!(f, "{}:{}", self.line, self.col)
+ }
+}
+
+impl Loc {
+ pub fn start() -> Loc {
+ Loc {
+ line: FIRST_LINE,
+ col: FIRST_COL,
+ }
+ }
+}
diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs
new file mode 100644
index 0000000..bde64f5
--- /dev/null
+++ b/src/lexer/mod.rs
@@ -0,0 +1,12 @@
+//! Implementation of lexer for both protobuf parser and for text format parser.
+
+pub mod float;
+pub mod int;
+pub mod json_number_lit;
+pub mod lexer_impl;
+pub mod loc;
+pub mod num_lit;
+pub mod parser_language;
+pub mod str_lit;
+pub mod token;
+pub mod tokenizer;
diff --git a/src/lexer/num_lit.rs b/src/lexer/num_lit.rs
new file mode 100644
index 0000000..cc64cc4
--- /dev/null
+++ b/src/lexer/num_lit.rs
@@ -0,0 +1,5 @@
+#[derive(Copy, Clone)]
+pub enum NumLit {
+ U64(u64),
+ F64(f64),
+}
diff --git a/src/lexer/parser_language.rs b/src/lexer/parser_language.rs
new file mode 100644
index 0000000..e356571
--- /dev/null
+++ b/src/lexer/parser_language.rs
@@ -0,0 +1,10 @@
+/// We use the same lexer/tokenizer for all parsers for simplicity
+#[derive(Copy, Clone, Debug, Eq, PartialEq)]
+pub enum ParserLanguage {
+ /// `.proto` files
+ Proto,
+ /// Protobuf text format
+ TextFormat,
+ /// JSON
+ Json,
+}
diff --git a/src/lexer/str_lit.rs b/src/lexer/str_lit.rs
new file mode 100644
index 0000000..0e51a16
--- /dev/null
+++ b/src/lexer/str_lit.rs
@@ -0,0 +1,77 @@
+use std::fmt;
+use std::string::FromUtf8Error;
+
+use crate::lexer::lexer_impl::Lexer;
+use crate::lexer::parser_language::ParserLanguage;
+
+#[derive(Debug, thiserror::Error)]
+pub enum StrLitDecodeError {
+ #[error(transparent)]
+ FromUtf8Error(#[from] FromUtf8Error),
+ #[error("String literal decode error")]
+ OtherError,
+}
+
+pub type StrLitDecodeResult<T> = Result<T, StrLitDecodeError>;
+
+/// String literal, both `string` and `bytes`.
+#[derive(Clone, Eq, PartialEq, Debug)]
+pub struct StrLit {
+ pub escaped: String,
+}
+
+impl fmt::Display for StrLit {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ write!(f, "\"{}\"", &self.escaped)
+ }
+}
+
+impl StrLit {
+ /// May fail if not valid UTF8
+ pub fn decode_utf8(&self) -> StrLitDecodeResult<String> {
+ let mut lexer = Lexer::new(&self.escaped, ParserLanguage::Json);
+ let mut r = Vec::new();
+ while !lexer.eof() {
+ r.push(
+ lexer
+ .next_byte_value()
+ .map_err(|_| StrLitDecodeError::OtherError)?,
+ );
+ }
+ Ok(String::from_utf8(r)?)
+ }
+
+ pub fn decode_bytes(&self) -> StrLitDecodeResult<Vec<u8>> {
+ let mut lexer = Lexer::new(&self.escaped, ParserLanguage::Json);
+ let mut r = Vec::new();
+ while !lexer.eof() {
+ r.push(
+ lexer
+ .next_byte_value()
+ .map_err(|_| StrLitDecodeError::OtherError)?,
+ );
+ }
+ Ok(r)
+ }
+
+ pub fn quoted(&self) -> String {
+ format!("\"{}\"", self.escaped)
+ }
+}
+
+#[cfg(test)]
+mod test {
+ use crate::lexer::str_lit::StrLit;
+
+ #[test]
+ fn decode_utf8() {
+ assert_eq!(
+ "\u{1234}".to_owned(),
+ StrLit {
+ escaped: "\\341\\210\\264".to_owned()
+ }
+ .decode_utf8()
+ .unwrap()
+ )
+ }
+}
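
A sketch of the two decoders (hypothetical caller): both unescape to bytes, and `decode_utf8` additionally requires the result to be valid UTF-8.

    use protobuf_support::lexer::str_lit::StrLit;

    fn main() {
        let lit = StrLit { escaped: "a\\x41\\n".to_owned() };
        // Bytes: 'a', 0x41 ('A'), newline.
        assert_eq!(b"aA\n".to_vec(), lit.decode_bytes().unwrap());
        assert_eq!("aA\n", lit.decode_utf8().unwrap());
        // Invalid UTF-8 still decodes as bytes, but not as a string.
        let bad = StrLit { escaped: "\\xff".to_owned() };
        assert!(bad.decode_bytes().is_ok());
        assert!(bad.decode_utf8().is_err());
    }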
diff --git a/src/lexer/token.rs b/src/lexer/token.rs
new file mode 100644
index 0000000..b20aba6
--- /dev/null
+++ b/src/lexer/token.rs
@@ -0,0 +1,47 @@
+use crate::lexer::json_number_lit::JsonNumberLit;
+use crate::lexer::lexer_impl::LexerError;
+use crate::lexer::lexer_impl::LexerResult;
+use crate::lexer::loc::Loc;
+use crate::lexer::num_lit::NumLit;
+use crate::lexer::str_lit::StrLit;
+
+#[derive(Clone, Debug, PartialEq)]
+pub enum Token {
+ Ident(String),
+ Symbol(char),
+ // Protobuf tokenizer has separate tokens for int and float.
+ // Tokens do not include sign.
+ IntLit(u64),
+ FloatLit(f64),
+ JsonNumber(JsonNumberLit),
+ // escaped string content, without the surrounding quotes
+ StrLit(StrLit),
+}
+
+impl Token {
+ /// Back to original
+ pub fn format(&self) -> String {
+ match self {
+ &Token::Ident(ref s) => s.clone(),
+ &Token::Symbol(c) => c.to_string(),
+ &Token::IntLit(ref i) => i.to_string(),
+ &Token::StrLit(ref s) => s.quoted(),
+ &Token::FloatLit(ref f) => f.to_string(),
+ &Token::JsonNumber(ref f) => f.to_string(),
+ }
+ }
+
+ pub fn to_num_lit(&self) -> LexerResult<NumLit> {
+ match self {
+ &Token::IntLit(i) => Ok(NumLit::U64(i)),
+ &Token::FloatLit(f) => Ok(NumLit::F64(f)),
+ _ => Err(LexerError::IncorrectInput),
+ }
+ }
+}
+
+#[derive(Clone)]
+pub struct TokenWithLocation {
+ pub token: Token,
+ pub loc: Loc,
+}
diff --git a/src/lexer/tokenizer.rs b/src/lexer/tokenizer.rs
new file mode 100644
index 0000000..c5e84a0
--- /dev/null
+++ b/src/lexer/tokenizer.rs
@@ -0,0 +1,330 @@
+use crate::lexer::lexer_impl::Lexer;
+use crate::lexer::lexer_impl::LexerError;
+use crate::lexer::loc::Loc;
+use crate::lexer::parser_language::ParserLanguage;
+use crate::lexer::str_lit::StrLit;
+use crate::lexer::str_lit::StrLitDecodeError;
+use crate::lexer::token::Token;
+use crate::lexer::token::TokenWithLocation;
+
+#[derive(Debug, thiserror::Error)]
+pub enum TokenizerError {
+ #[error(transparent)]
+ LexerError(#[from] LexerError),
+ #[error(transparent)]
+ StrLitDecodeError(#[from] StrLitDecodeError),
+ #[error("Internal tokenizer error")]
+ InternalError,
+ // TODO: too broad
+ #[error("Incorrect input")]
+ IncorrectInput,
+ #[error("Not allowed in this context: {0}")]
+ NotAllowedInThisContext(&'static str),
+ #[error("Unexpected end of input")]
+ UnexpectedEof,
+ #[error("Expecting string literal")]
+ ExpectStrLit,
+ #[error("Expecting int literal")]
+ ExpectIntLit,
+ #[error("Expecting float literal")]
+ ExpectFloatLit,
+ #[error("Expecting identifier")]
+ ExpectIdent,
+ #[error("Expecting identifier `{}`", .0)]
+ ExpectNamedIdent(String),
+ #[error("While parsing {}, expecting char `{}`", .1, .0)]
+ ExpectChar(char, &'static str),
+ #[error("Expecting any char of: {}", .0.iter().map(|c| format!("`{}`", c)).collect::<Vec<_>>().join(", "))]
+ ExpectAnyChar(Vec<char>),
+}
+
+pub type TokenizerResult<R> = Result<R, TokenizerError>;
+
+#[derive(Clone)]
+pub struct Tokenizer<'a> {
+ lexer: Lexer<'a>,
+ next_token: Option<TokenWithLocation>,
+ last_token_loc: Option<Loc>,
+}
+
+impl<'a> Tokenizer<'a> {
+ pub fn new(input: &'a str, comment_style: ParserLanguage) -> Tokenizer<'a> {
+ Tokenizer {
+ lexer: Lexer::new(input, comment_style),
+ next_token: None,
+ last_token_loc: None,
+ }
+ }
+
+ pub fn loc(&self) -> Loc {
+ // After lookahead return the location of the next token
+ self.next_token
+ .as_ref()
+ .map(|t| t.loc.clone())
+ // After token consumed return the location of that token
+ .or(self.last_token_loc.clone())
+ // Otherwise return the position of lexer
+ .unwrap_or(self.lexer.loc)
+ }
+
+ pub fn lookahead_loc(&mut self) -> Loc {
+ drop(self.lookahead());
+ // TODO: does not handle EOF properly
+ self.loc()
+ }
+
+ fn lookahead(&mut self) -> TokenizerResult<Option<&Token>> {
+ Ok(match self.next_token {
+ Some(ref token) => Some(&token.token),
+ None => {
+ self.next_token = self.lexer.next_token()?;
+ self.last_token_loc = self.next_token.as_ref().map(|t| t.loc.clone());
+ match self.next_token {
+ Some(ref token) => Some(&token.token),
+ None => None,
+ }
+ }
+ })
+ }
+
+ pub fn lookahead_some(&mut self) -> TokenizerResult<&Token> {
+ match self.lookahead()? {
+ Some(token) => Ok(token),
+ None => Err(TokenizerError::UnexpectedEof),
+ }
+ }
+
+ fn next(&mut self) -> TokenizerResult<Option<Token>> {
+ self.lookahead()?;
+ Ok(self
+ .next_token
+ .take()
+ .map(|TokenWithLocation { token, .. }| token))
+ }
+
+ pub fn next_some(&mut self) -> TokenizerResult<Token> {
+ match self.next()? {
+ Some(token) => Ok(token),
+ None => Err(TokenizerError::UnexpectedEof),
+ }
+ }
+
+ /// Can be called only after lookahead, otherwise it's error
+ pub fn advance(&mut self) -> TokenizerResult<Token> {
+ self.next_token
+ .take()
+ .map(|TokenWithLocation { token, .. }| token)
+ .ok_or(TokenizerError::InternalError)
+ }
+
+ /// No more tokens
+ pub fn syntax_eof(&mut self) -> TokenizerResult<bool> {
+ Ok(self.lookahead()?.is_none())
+ }
+
+ pub fn next_token_if_map<P, R>(&mut self, p: P) -> TokenizerResult<Option<R>>
+ where
+ P: FnOnce(&Token) -> Option<R>,
+ {
+ self.lookahead()?;
+ let v = match self.next_token {
+ Some(ref token) => match p(&token.token) {
+ Some(v) => v,
+ None => return Ok(None),
+ },
+ _ => return Ok(None),
+ };
+ self.next_token = None;
+ Ok(Some(v))
+ }
+
+ pub fn next_token_check_map<P, R, E>(&mut self, p: P) -> Result<R, E>
+ where
+ P: FnOnce(&Token) -> Result<R, E>,
+ E: From<TokenizerError>,
+ {
+ self.lookahead()?;
+ let r = match self.next_token {
+ Some(ref token) => p(&token.token)?,
+ None => return Err(TokenizerError::UnexpectedEof.into()),
+ };
+ self.next_token = None;
+ Ok(r)
+ }
+
+ fn next_token_if<P>(&mut self, p: P) -> TokenizerResult<Option<Token>>
+ where
+ P: FnOnce(&Token) -> bool,
+ {
+ self.next_token_if_map(|token| if p(token) { Some(token.clone()) } else { None })
+ }
+
+ pub fn next_ident_if_in(&mut self, idents: &[&str]) -> TokenizerResult<Option<String>> {
+ let v = match self.lookahead()? {
+ Some(&Token::Ident(ref next)) => {
+ if idents.into_iter().find(|&i| i == next).is_some() {
+ next.clone()
+ } else {
+ return Ok(None);
+ }
+ }
+ _ => return Ok(None),
+ };
+ self.advance()?;
+ Ok(Some(v))
+ }
+
+ pub fn next_ident_if_eq(&mut self, word: &str) -> TokenizerResult<bool> {
+ Ok(self.next_ident_if_in(&[word])? != None)
+ }
+
+ pub fn next_ident_expect_eq(&mut self, word: &str) -> TokenizerResult<()> {
+ if self.next_ident_if_eq(word)? {
+ Ok(())
+ } else {
+ Err(TokenizerError::ExpectNamedIdent(word.to_owned()))
+ }
+ }
+
+ pub fn next_ident_if_eq_error(&mut self, word: &'static str) -> TokenizerResult<()> {
+ if self.clone().next_ident_if_eq(word)? {
+ // TODO: which context?
+ return Err(TokenizerError::NotAllowedInThisContext(word));
+ }
+ Ok(())
+ }
+
+ pub fn next_symbol_if_eq(&mut self, symbol: char) -> TokenizerResult<bool> {
+ Ok(self.next_token_if(|token| match token {
+ &Token::Symbol(c) if c == symbol => true,
+ _ => false,
+ })? != None)
+ }
+
+ pub fn next_symbol_expect_eq(
+ &mut self,
+ symbol: char,
+ desc: &'static str,
+ ) -> TokenizerResult<()> {
+ if self.lookahead_is_symbol(symbol)? {
+ self.advance()?;
+ Ok(())
+ } else {
+ Err(TokenizerError::ExpectChar(symbol, desc))
+ }
+ }
+
+ pub fn next_symbol_expect_eq_oneof(&mut self, symbols: &[char]) -> TokenizerResult<char> {
+ for symbol in symbols {
+ if let Ok(()) = self.next_symbol_expect_eq(*symbol, "ignored") {
+ return Ok(*symbol);
+ }
+ }
+ Err(TokenizerError::ExpectAnyChar(symbols.to_owned()))
+ }
+
+ pub fn lookahead_is_str_lit(&mut self) -> TokenizerResult<bool> {
+ Ok(match self.lookahead()? {
+ Some(&Token::StrLit(..)) => true,
+ _ => false,
+ })
+ }
+
+ pub fn lookahead_is_int_lit(&mut self) -> TokenizerResult<bool> {
+ Ok(match self.lookahead()? {
+ Some(&Token::IntLit(..)) => true,
+ _ => false,
+ })
+ }
+
+ pub fn lookahead_is_json_number(&mut self) -> TokenizerResult<bool> {
+ Ok(match self.lookahead()? {
+ Some(&Token::JsonNumber(..)) => true,
+ _ => false,
+ })
+ }
+
+ pub fn lookahead_if_symbol(&mut self) -> TokenizerResult<Option<char>> {
+ Ok(match self.lookahead()? {
+ Some(&Token::Symbol(c)) => Some(c),
+ _ => None,
+ })
+ }
+
+ pub fn lookahead_is_symbol(&mut self, symbol: char) -> TokenizerResult<bool> {
+ Ok(self.lookahead_if_symbol()? == Some(symbol))
+ }
+
+ pub fn lookahead_is_ident(&mut self, ident: &str) -> TokenizerResult<bool> {
+ Ok(match self.lookahead()? {
+ Some(Token::Ident(i)) => i == ident,
+ _ => false,
+ })
+ }
+
+ pub fn next_ident(&mut self) -> TokenizerResult<String> {
+ self.next_token_check_map(|token| match token {
+ &Token::Ident(ref ident) => Ok(ident.clone()),
+ _ => Err(TokenizerError::ExpectIdent),
+ })
+ }
+
+ pub fn next_str_lit(&mut self) -> TokenizerResult<StrLit> {
+ self.next_token_check_map(|token| match token {
+ &Token::StrLit(ref str_lit) => Ok(str_lit.clone()),
+ _ => Err(TokenizerError::ExpectStrLit),
+ })
+ }
+
+ pub fn next_int_lit(&mut self) -> TokenizerResult<u64> {
+ self.next_token_check_map(|token| match token {
+ &Token::IntLit(v) => Ok(v),
+ _ => Err(TokenizerError::ExpectIntLit),
+ })
+ }
+
+ pub fn next_float_lit(&mut self) -> TokenizerResult<f64> {
+ self.next_token_check_map(|token| match token {
+ &Token::FloatLit(v) => Ok(v),
+ _ => Err(TokenizerError::ExpectFloatLit),
+ })
+ }
+}
+
+#[cfg(test)]
+mod test {
+
+ use super::*;
+
+ fn tokenize<P, R>(input: &str, what: P) -> R
+ where
+ P: FnOnce(&mut Tokenizer) -> TokenizerResult<R>,
+ {
+ let mut tokenizer = Tokenizer::new(input, ParserLanguage::Proto);
+ let r = what(&mut tokenizer).expect(&format!("parse failed at {}", tokenizer.loc()));
+ let eof = tokenizer
+ .syntax_eof()
+ .expect(&format!("check eof failed at {}", tokenizer.loc()));
+ assert!(eof, "{}", tokenizer.loc());
+ r
+ }
+
+ #[test]
+ fn test_ident() {
+ let msg = r#" aabb_c "#;
+ let mess = tokenize(msg, |p| p.next_ident().map(|s| s.to_owned()));
+ assert_eq!("aabb_c", mess);
+ }
+
+ #[test]
+ fn test_str_lit() {
+ let msg = r#" "a\nb" "#;
+ let mess = tokenize(msg, |p| p.next_str_lit());
+ assert_eq!(
+ StrLit {
+ escaped: r#"a\nb"#.to_owned()
+ },
+ mess
+ );
+ }
+}
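
A sketch of typical parser-side usage of the tokenizer (hypothetical grammar fragment):

    use protobuf_support::lexer::parser_language::ParserLanguage;
    use protobuf_support::lexer::tokenizer::Tokenizer;

    fn main() {
        let mut t = Tokenizer::new("x = 42;", ParserLanguage::Proto);
        assert_eq!("x", t.next_ident().unwrap());
        // The description is only used in the error message.
        t.next_symbol_expect_eq('=', "assignment").unwrap();
        assert_eq!(42, t.next_int_lit().unwrap());
        assert!(t.next_symbol_if_eq(';').unwrap());
        assert!(t.syntax_eof().unwrap());
    }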
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..63c2a8d
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,9 @@
+//! # Supporting code for protobuf crates
+//!
+//! Code in this crate is used in protobuf crates like `protobuf` or `protobuf-parse`.
+//! None of the code in this crate has a public API.
+
+pub mod json_name;
+pub mod lexer;
+pub mod text_format;
+pub mod toposort;
diff --git a/src/text_format.rs b/src/text_format.rs
new file mode 100644
index 0000000..f49f4c2
--- /dev/null
+++ b/src/text_format.rs
@@ -0,0 +1,75 @@
+pub fn escape_bytes_to(bytes: &[u8], buf: &mut String) {
+ for &c in bytes {
+ match c {
+ b'\n' => buf.push_str(r"\n"),
+ b'\r' => buf.push_str(r"\r"),
+ b'\t' => buf.push_str(r"\t"),
+ b'\'' => buf.push_str("\\\'"),
+ b'"' => buf.push_str("\\\""),
+ b'\\' => buf.push_str(r"\\"),
+ b'\x20'..=b'\x7e' => buf.push(c as char),
+ _ => {
+ buf.push('\\');
+ buf.push((b'0' + (c >> 6)) as char);
+ buf.push((b'0' + ((c >> 3) & 7)) as char);
+ buf.push((b'0' + (c & 7)) as char);
+ }
+ }
+ }
+}
+
+pub fn quote_bytes_to(bytes: &[u8], buf: &mut String) {
+ buf.push('"');
+ escape_bytes_to(bytes, buf);
+ buf.push('"');
+}
+
+#[cfg(test)]
+mod test {
+ use crate::lexer::str_lit::StrLit;
+ use crate::text_format::escape_bytes_to;
+
+ fn escape(data: &[u8]) -> String {
+ let mut s = String::with_capacity(data.len() * 4);
+ escape_bytes_to(data, &mut s);
+ s
+ }
+
+ fn unescape_string(escaped: &str) -> Vec<u8> {
+ StrLit {
+ escaped: escaped.to_owned(),
+ }
+ .decode_bytes()
+ .expect("decode_bytes")
+ }
+
+ fn test_escape_unescape(text: &str, escaped: &str) {
+ assert_eq!(text.as_bytes(), &unescape_string(escaped)[..]);
+ assert_eq!(escaped, &escape(text.as_bytes())[..]);
+ }
+
+ #[test]
+ fn test_print_to_bytes() {
+ assert_eq!("ab", escape(b"ab"));
+ assert_eq!("a\\\\023", escape(b"a\\023"));
+ assert_eq!("a\\r\\n\\t \\'\\\"\\\\", escape(b"a\r\n\t '\"\\"));
+ assert_eq!("\\344\\275\\240\\345\\245\\275", escape("你好".as_bytes()));
+ }
+
+ #[test]
+ fn test_unescape_string() {
+ test_escape_unescape("", "");
+ test_escape_unescape("aa", "aa");
+ test_escape_unescape("\n", "\\n");
+ test_escape_unescape("\r", "\\r");
+ test_escape_unescape("\t", "\\t");
+ test_escape_unescape("你好", "\\344\\275\\240\\345\\245\\275");
+ // hex
+ assert_eq!(b"aaa\x01bbb", &unescape_string("aaa\\x01bbb")[..]);
+ assert_eq!(b"aaa\xcdbbb", &unescape_string("aaa\\xCDbbb")[..]);
+ assert_eq!(b"aaa\xcdbbb", &unescape_string("aaa\\xCDbbb")[..]);
+ // quotes
+ assert_eq!(b"aaa\"bbb", &unescape_string("aaa\\\"bbb")[..]);
+ assert_eq!(b"aaa\'bbb", &unescape_string("aaa\\\'bbb")[..]);
+ }
+}
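
A sketch of the escaping behavior (hypothetical caller): printable ASCII outside the escape set passes through, named escapes are used where defined, and every other byte becomes a three-digit octal escape.

    use protobuf_support::text_format::quote_bytes_to;

    fn main() {
        let mut buf = String::new();
        quote_bytes_to(b"a\nb\x01", &mut buf);
        // Newline has a named escape; 0x01 falls back to octal.
        assert_eq!(r#""a\nb\001""#, buf);
    }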
diff --git a/src/toposort.rs b/src/toposort.rs
new file mode 100644
index 0000000..5e44590
--- /dev/null
+++ b/src/toposort.rs
@@ -0,0 +1,119 @@
+use std::collections::HashSet;
+use std::hash::Hash;
+
+#[derive(Debug, thiserror::Error)]
+#[error("Cycle detected")]
+pub struct TopoSortCycle;
+
+pub fn toposort<K, I>(
+ input: impl IntoIterator<Item = K>,
+ deps: impl Fn(&K) -> I,
+) -> Result<Vec<K>, TopoSortCycle>
+where
+ K: Eq + Hash + Clone,
+ I: Iterator<Item = K>,
+{
+ struct Ts<K, D, I>
+ where
+ K: Eq + Hash + Clone,
+ I: Iterator<Item = K>,
+ D: Fn(&K) -> I,
+ {
+ result_set: HashSet<K>,
+ result: Vec<K>,
+ deps: D,
+ stack: HashSet<K>,
+ }
+
+ impl<K, D, I> Ts<K, D, I>
+ where
+ K: Eq + Hash + Clone,
+ I: Iterator<Item = K>,
+ D: Fn(&K) -> I,
+ {
+ fn visit(&mut self, i: &K) -> Result<(), TopoSortCycle> {
+ if self.result_set.contains(i) {
+ return Ok(());
+ }
+
+ if !self.stack.insert(i.clone()) {
+ return Err(TopoSortCycle);
+ }
+ for dep in (self.deps)(i) {
+ self.visit(&dep)?;
+ }
+
+ let removed = self.stack.remove(i);
+ assert!(removed);
+
+ self.result.push(i.clone());
+ self.result_set.insert(i.clone());
+
+ Ok(())
+ }
+ }
+
+ let mut ts = Ts {
+ result: Vec::new(),
+ result_set: HashSet::new(),
+ deps,
+ stack: HashSet::new(),
+ };
+
+ for i in input {
+ ts.visit(&i)?;
+ }
+
+ Ok(ts.result)
+}
+
+#[cfg(test)]
+mod tests {
+ use std::collections::HashMap;
+
+ use crate::toposort::toposort;
+ use crate::toposort::TopoSortCycle;
+
+ fn test_toposort(input: &str) -> Result<Vec<&str>, TopoSortCycle> {
+ let mut keys: Vec<&str> = Vec::new();
+ let mut edges: HashMap<&str, Vec<&str>> = HashMap::new();
+ for part in input.split(" ") {
+ match part.split_once("->") {
+ Some((k, vs)) => {
+ keys.push(k);
+ edges.insert(k, vs.split(",").collect());
+ }
+ None => keys.push(part),
+ };
+ }
+
+ toposort(keys, |k| {
+ edges
+ .get(k)
+ .map(|v| v.as_slice())
+ .unwrap_or_default()
+ .into_iter()
+ .copied()
+ })
+ }
+
+ fn test_toposort_check(input: &str, expected: &str) {
+ let sorted = test_toposort(input).unwrap();
+ let expected = expected.split(" ").collect::<Vec<_>>();
+ assert_eq!(expected, sorted);
+ }
+
+ #[test]
+ fn test() {
+ test_toposort_check("1 2 3", "1 2 3");
+ test_toposort_check("1->2 2->3 3", "3 2 1");
+ test_toposort_check("1 2->1 3->2", "1 2 3");
+ test_toposort_check("1->2,3 2->3 3", "3 2 1");
+ }
+
+ #[test]
+ fn cycle() {
+ assert!(test_toposort("1->1").is_err());
+ assert!(test_toposort("1->2 2->1").is_err());
+ }
+}
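
A sketch of calling `toposort` directly (hypothetical dependency graph): dependencies come out before their dependents, and a cycle is reported as `TopoSortCycle`.

    use std::collections::HashMap;

    use protobuf_support::toposort::toposort;

    fn main() {
        let mut deps: HashMap<&str, Vec<&str>> = HashMap::new();
        deps.insert("app", vec!["lib"]);
        deps.insert("lib", vec!["core"]);
        let sorted = toposort(["app", "lib", "core"], |k| {
            // Nodes with no entry have no dependencies.
            deps.get(k).map(|v| v.as_slice()).unwrap_or_default().iter().copied()
        })
        .unwrap();
        assert_eq!(vec!["core", "lib", "app"], sorted);
    }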