From e8cf4959fc6ebe5c1b601fa932727aaaf0705a8f Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Thu, 16 Feb 2023 15:59:44 +0100 Subject: Upgrade pest to 2.5.5 This project was upgraded with external_updater. Usage: tools/external_updater/updater.sh update rust/crates/pest For more info, check https://cs.android.com/android/platform/superproject/+/master:tools/external_updater/README.md Test: TreeHugger Change-Id: I6fb46fe0ef2f30a14b6c8e7d2a21c164d5b485a3 --- Android.bp | 2 +- Cargo.lock | 17 ++----- Cargo.toml | 10 +--- Cargo.toml.orig | 6 +-- METADATA | 6 +-- src/iterators/flat_pairs.rs | 22 ++++++++- src/iterators/line_index.rs | 91 ++++++++++++++++++++++++++++++++++++ src/iterators/mod.rs | 1 + src/iterators/pair.rs | 28 +++++++---- src/iterators/pairs.rs | 90 +++++++++++------------------------- src/parser_state.rs | 2 +- src/position.rs | 110 +++++++++++++++----------------------------- 12 files changed, 207 insertions(+), 178 deletions(-) create mode 100644 src/iterators/line_index.rs diff --git a/Android.bp b/Android.bp index 58283e0..d49f319 100644 --- a/Android.bp +++ b/Android.bp @@ -41,7 +41,7 @@ rust_library_host { name: "libpest", crate_name: "pest", cargo_env_compat: true, - cargo_pkg_version: "2.5.4", + cargo_pkg_version: "2.5.5", srcs: ["src/lib.rs"], edition: "2021", features: [ diff --git a/Cargo.lock b/Cargo.lock index 5741be3..053e07f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,12 +2,6 @@ # It is not intended for manual editing. version = 3 -[[package]] -name = "bytecount" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c" - [[package]] name = "itoa" version = "1.0.5" @@ -22,9 +16,8 @@ checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" [[package]] name = "pest" -version = "2.5.4" +version = "2.5.5" dependencies = [ - "bytecount", "memchr", "serde", "serde_json", @@ -34,9 +27,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.50" +version = "1.0.51" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ef7d57beacfaf2d8aee5937dab7b7f28de3cb8b1828479bb5de2a7106f2bae2" +checksum = "5d727cae5b39d21da60fa540906919ad737832fe0b1c165da3a34d6548c849d6" dependencies = [ "unicode-ident", ] @@ -64,9 +57,9 @@ checksum = "bb7d1f0d3021d347a83e556fc4683dea2ea09d87bccdf88ff5c12545d89d5efb" [[package]] name = "serde_json" -version = "1.0.91" +version = "1.0.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "877c235533714907a8c2464236f5c4b2a17262ef1bd71f38f35ea592c8da6883" +checksum = "7434af0dc1cbd59268aa98b4c22c131c0584d2232f6fb166efb993e2832e896a" dependencies = [ "itoa", "ryu", diff --git a/Cargo.toml b/Cargo.toml index bf8e4d5..3c1f26d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,7 @@ edition = "2021" rust-version = "1.56" name = "pest" -version = "2.5.4" +version = "2.5.5" authors = ["Dragoș Tiselice "] description = "The Elegant Parser" homepage = "https://pest.rs/" @@ -29,10 +29,6 @@ categories = ["parsing"] license = "MIT/Apache-2.0" repository = "https://github.com/pest-parser/pest" -[dependencies.bytecount] -version = "0.6" -optional = true - [dependencies.memchr] version = "2" optional = true @@ -56,10 +52,6 @@ default-features = false [features] const_prec_climber = [] default = ["std"] -fast-line-col = [ - "memchr", - "bytecount", -] pretty-print = [ "serde", "serde_json", diff --git a/Cargo.toml.orig b/Cargo.toml.orig index 7ea229f..a268321 100644 --- a/Cargo.toml.orig +++ b/Cargo.toml.orig @@ -1,7 +1,7 @@ [package] name = "pest" description = "The Elegant Parser" -version = "2.5.4" +version = "2.5.5" edition = "2021" authors = ["Dragoș Tiselice "] homepage = "https://pest.rs/" @@ -21,9 +21,6 @@ std = ["ucd-trie/std", "thiserror"] pretty-print = ["serde", "serde_json"] # Enable const fn constructor for `PrecClimber` const_prec_climber = [] -# Enable faster `Position::line_col` calculation using SIMD -# (note that this may have extra overhead for small inputs) -fast-line-col = ["memchr", "bytecount"] [dependencies] ucd-trie = { version = "0.1.5", default-features = false } @@ -31,4 +28,3 @@ serde = { version = "1.0.145", optional = true } serde_json = { version = "1.0.85", optional = true} thiserror = { version = "1.0.37", optional = true } memchr = { version = "2", optional = true } -bytecount = { version = "0.6", optional = true } diff --git a/METADATA b/METADATA index aa66385..ce195cf 100644 --- a/METADATA +++ b/METADATA @@ -11,13 +11,13 @@ third_party { } url { type: ARCHIVE - value: "https://static.crates.io/crates/pest/pest-2.5.4.crate" + value: "https://static.crates.io/crates/pest/pest-2.5.5.crate" } - version: "2.5.4" + version: "2.5.5" license_type: NOTICE last_upgrade_date { year: 2023 month: 2 - day: 3 + day: 16 } } diff --git a/src/iterators/flat_pairs.rs b/src/iterators/flat_pairs.rs index 411d88b..52a2074 100644 --- a/src/iterators/flat_pairs.rs +++ b/src/iterators/flat_pairs.rs @@ -11,6 +11,7 @@ use alloc::rc::Rc; use alloc::vec::Vec; use core::fmt; +use super::line_index::LineIndex; use super::pair::{self, Pair}; use super::queueable_token::QueueableToken; use super::tokens::{self, Tokens}; @@ -28,6 +29,7 @@ pub struct FlatPairs<'i, R> { input: &'i str, start: usize, end: usize, + line_index: Rc, } /// # Safety @@ -42,6 +44,7 @@ pub unsafe fn new( FlatPairs { queue, input, + line_index: Rc::new(LineIndex::new(input)), start, end, } @@ -107,7 +110,14 @@ impl<'i, R: RuleType> Iterator for FlatPairs<'i, R> { return None; } - let pair = unsafe { pair::new(Rc::clone(&self.queue), self.input, self.start) }; + let pair = unsafe { + pair::new( + Rc::clone(&self.queue), + self.input, + Rc::clone(&self.line_index), + self.start, + ) + }; self.next_start(); Some(pair) @@ -122,7 +132,14 @@ impl<'i, R: RuleType> DoubleEndedIterator for FlatPairs<'i, R> { self.next_start_from_end(); - let pair = unsafe { pair::new(Rc::clone(&self.queue), self.input, self.end) }; + let pair = unsafe { + pair::new( + Rc::clone(&self.queue), + self.input, + Rc::clone(&self.line_index), + self.end, + ) + }; Some(pair) } @@ -141,6 +158,7 @@ impl<'i, R: Clone> Clone for FlatPairs<'i, R> { FlatPairs { queue: Rc::clone(&self.queue), input: self.input, + line_index: Rc::clone(&self.line_index), start: self.start, end: self.end, } diff --git a/src/iterators/line_index.rs b/src/iterators/line_index.rs new file mode 100644 index 0000000..54871e1 --- /dev/null +++ b/src/iterators/line_index.rs @@ -0,0 +1,91 @@ +//! `LineIndex` to make a line_offsets, each item is an byte offset (start from 0) of the beginning of the line. +//! +//! For example, the text: `"hello 你好\nworld"`, the line_offsets will store `[0, 13]`. +//! +//! Then `line_col` with a offset just need to find the line index by binary search. +//! +//! Inspired by rust-analyzer's `LineIndex`: +//! +use alloc::vec::Vec; + +#[derive(Clone)] +pub struct LineIndex { + /// Offset (bytes) the the beginning of each line, zero-based + line_offsets: Vec, +} + +impl LineIndex { + pub fn new(text: &str) -> LineIndex { + let mut line_offsets: Vec = alloc::vec![0]; + + let mut offset = 0; + + for c in text.chars() { + offset += c.len_utf8(); + if c == '\n' { + line_offsets.push(offset); + } + } + + LineIndex { line_offsets } + } + + /// Returns (line, col) of pos. + /// + /// The pos is a byte offset, start from 0, e.g. "ab" is 2, "你好" is 6 + pub fn line_col(&self, input: &str, pos: usize) -> (usize, usize) { + let line = self.line_offsets.partition_point(|&it| it <= pos) - 1; + let first_offset = self.line_offsets[line]; + + // Get line str from original input, then we can get column offset + let line_str = &input[first_offset..pos]; + let col = line_str.chars().count(); + + (line + 1, col + 1) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[allow(clippy::zero_prefixed_literal)] + #[test] + fn test_line_index() { + let text = "hello 你好 A🎈C\nworld"; + let table = [ + (00, 1, 1, 'h'), + (01, 1, 2, 'e'), + (02, 1, 3, 'l'), + (03, 1, 4, 'l'), + (04, 1, 5, 'o'), + (05, 1, 6, ' '), + (06, 1, 7, '你'), + (09, 1, 8, '好'), + (12, 1, 9, ' '), + (13, 1, 10, 'A'), + (14, 1, 11, '🎈'), + (18, 1, 12, 'C'), + (19, 1, 13, '\n'), + (20, 2, 1, 'w'), + (21, 2, 2, 'o'), + (22, 2, 3, 'r'), + (23, 2, 4, 'l'), + (24, 2, 5, 'd'), + ]; + + let index = LineIndex::new(text); + for &(offset, line, col, c) in table.iter() { + let res = index.line_col(text, offset); + assert_eq!( + (res.0, res.1), + (line, col), + "Expected: ({}, {}, {}, {:?})", + offset, + line, + col, + c + ); + } + } +} diff --git a/src/iterators/mod.rs b/src/iterators/mod.rs index 1a78963..7f81019 100644 --- a/src/iterators/mod.rs +++ b/src/iterators/mod.rs @@ -10,6 +10,7 @@ //! Types and iterators for parser output. mod flat_pairs; +mod line_index; mod pair; pub(crate) mod pairs; mod queueable_token; diff --git a/src/iterators/pair.rs b/src/iterators/pair.rs index 2c81347..891b905 100644 --- a/src/iterators/pair.rs +++ b/src/iterators/pair.rs @@ -20,6 +20,7 @@ use core::str; #[cfg(feature = "pretty-print")] use serde::ser::SerializeStruct; +use super::line_index::LineIndex; use super::pairs::{self, Pairs}; use super::queueable_token::QueueableToken; use super::tokens::{self, Tokens}; @@ -43,7 +44,7 @@ pub struct Pair<'i, R> { input: &'i str, /// Token index into `queue`. start: usize, - pub(crate) line_col: Option<(usize, usize)>, + line_index: Rc, } /// # Safety @@ -52,13 +53,14 @@ pub struct Pair<'i, R> { pub unsafe fn new( queue: Rc>>, input: &str, + line_index: Rc, start: usize, ) -> Pair<'_, R> { Pair { queue, input, start, - line_col: None, + line_index, } } @@ -204,7 +206,13 @@ impl<'i, R: RuleType> Pair<'i, R> { pub fn into_inner(self) -> Pairs<'i, R> { let pair = self.pair(); - pairs::new(self.queue, self.input, self.start + 1, pair) + pairs::new( + self.queue, + self.input, + Some(self.line_index), + self.start + 1, + pair, + ) } /// Returns the `Tokens` for the `Pair`. @@ -245,10 +253,8 @@ impl<'i, R: RuleType> Pair<'i, R> { /// Returns the `line`, `col` of this pair start. pub fn line_col(&self) -> (usize, usize) { - match &self.line_col { - Some(line_col) => (line_col.0, line_col.1), - None => self.as_span().start_pos().line_col(), - } + let pos = self.pos(self.start); + self.line_index.line_col(self.input, pos) } fn pair(&self) -> usize { @@ -273,7 +279,13 @@ impl<'i, R: RuleType> Pairs<'i, R> { /// Create a new `Pairs` iterator containing just the single `Pair`. pub fn single(pair: Pair<'i, R>) -> Self { let end = pair.pair(); - pairs::new(pair.queue, pair.input, pair.start, end) + pairs::new( + pair.queue, + pair.input, + Some(pair.line_index), + pair.start, + end, + ) } } diff --git a/src/iterators/pairs.rs b/src/iterators/pairs.rs index d4596b0..c21a7fa 100644 --- a/src/iterators/pairs.rs +++ b/src/iterators/pairs.rs @@ -20,27 +20,12 @@ use core::str; use serde::ser::SerializeStruct; use super::flat_pairs::{self, FlatPairs}; +use super::line_index::LineIndex; use super::pair::{self, Pair}; use super::queueable_token::QueueableToken; use super::tokens::{self, Tokens}; -use crate::{position, RuleType}; +use crate::RuleType; -#[derive(Clone)] -pub struct Cursor { - pub line: usize, - pub col: usize, - pub end: usize, -} - -impl Default for Cursor { - fn default() -> Cursor { - Cursor { - line: 1, - col: 1, - end: 0, - } - } -} /// An iterator over [`Pair`]s. It is created by [`pest::state`] and [`Pair::into_inner`]. /// /// [`Pair`]: struct.Pair.html @@ -52,21 +37,27 @@ pub struct Pairs<'i, R> { input: &'i str, start: usize, end: usize, - cursor: Cursor, + line_index: Rc, } pub fn new( queue: Rc>>, input: &str, + line_index: Option>, start: usize, end: usize, ) -> Pairs<'_, R> { + let line_index = match line_index { + Some(line_index) => line_index, + None => Rc::new(LineIndex::new(input)), + }; + Pairs { queue, input, start, end, - cursor: Cursor::default(), + line_index, } } @@ -199,7 +190,14 @@ impl<'i, R: RuleType> Pairs<'i, R> { #[inline] pub fn peek(&self) -> Option> { if self.start < self.end { - Some(unsafe { pair::new(Rc::clone(&self.queue), self.input, self.start) }) + Some(unsafe { + pair::new( + Rc::clone(&self.queue), + self.input, + Rc::clone(&self.line_index), + self.start, + ) + }) } else { None } @@ -237,42 +235,13 @@ impl<'i, R: RuleType> Pairs<'i, R> { } } } - - /// Move the cursor (line, col) by a part of the input. - fn move_cursor(&mut self, input: &str, start: usize, end: usize) -> (usize, usize) { - // Move cursor for some skiped characters (by skip(n)) - let prev_end = self.cursor.end; - if prev_end != start { - self.move_cursor(input, prev_end, start); - } - - let (prev_line, prev_col) = (self.cursor.line, self.cursor.col); - - let part = &input[self.cursor.end..end]; - let (l, c) = position::line_col(part, part.len(), (0, 0)); - - self.cursor.line += l; - // Has new line - if l > 0 { - self.cursor.col = c; - } else { - self.cursor.col += c; - } - self.cursor.end = end; - - (prev_line, prev_col) - } } impl<'i, R: RuleType> Iterator for Pairs<'i, R> { type Item = Pair<'i, R>; fn next(&mut self) -> Option { - let mut pair = self.peek()?; - let span = pair.as_span(); - - let (l, c) = self.move_cursor(self.input, span.start(), span.end()); - pair.line_col = Some((l, c)); + let pair = self.peek()?; self.start = self.pair() + 1; Some(pair) @@ -287,7 +256,14 @@ impl<'i, R: RuleType> DoubleEndedIterator for Pairs<'i, R> { self.end = self.pair_from_end(); - let pair = unsafe { pair::new(Rc::clone(&self.queue), self.input, self.end) }; + let pair = unsafe { + pair::new( + Rc::clone(&self.queue), + self.input, + Rc::clone(&self.line_index), + self.end, + ) + }; Some(pair) } @@ -478,26 +454,14 @@ mod tests { let pair = pairs.next().unwrap(); assert_eq!(pair.as_str(), "abc"); assert_eq!(pair.line_col(), (1, 1)); - assert_eq!( - (pairs.cursor.line, pairs.cursor.col, pairs.cursor.end), - (1, 4, 3) - ); let pair = pairs.next().unwrap(); assert_eq!(pair.as_str(), "e"); assert_eq!(pair.line_col(), (2, 1)); - assert_eq!( - (pairs.cursor.line, pairs.cursor.col, pairs.cursor.end), - (2, 2, 5) - ); let pair = pairs.next().unwrap(); assert_eq!(pair.as_str(), "fgh"); assert_eq!(pair.line_col(), (2, 2)); - assert_eq!( - (pairs.cursor.line, pairs.cursor.col, pairs.cursor.end), - (2, 5, 8) - ); } #[test] diff --git a/src/parser_state.rs b/src/parser_state.rs index 609de55..f58de00 100644 --- a/src/parser_state.rs +++ b/src/parser_state.rs @@ -157,7 +157,7 @@ where match f(state) { Ok(state) => { let len = state.queue.len(); - Ok(pairs::new(Rc::new(state.queue), input, 0, len)) + Ok(pairs::new(Rc::new(state.queue), input, None, 0, len)) } Err(mut state) => { let variant = if state.reached_call_limit() { diff --git a/src/position.rs b/src/position.rs index b7b3c10..465ff97 100644 --- a/src/position.rs +++ b/src/position.rs @@ -138,8 +138,43 @@ impl<'i> Position<'i> { if self.pos > self.input.len() { panic!("position out of bounds"); } + let mut pos = self.pos; + let slice = &self.input[..pos]; + let mut chars = slice.chars().peekable(); + + let mut line_col = (1, 1); + + while pos != 0 { + match chars.next() { + Some('\r') => { + if let Some(&'\n') = chars.peek() { + chars.next(); + + if pos == 1 { + pos -= 1; + } else { + pos -= 2; + } + + line_col = (line_col.0 + 1, 1); + } else { + pos -= 1; + line_col = (line_col.0, line_col.1 + 1); + } + } + Some('\n') => { + pos -= 1; + line_col = (line_col.0 + 1, 1); + } + Some(c) => { + pos -= c.len_utf8(); + line_col = (line_col.0, line_col.1 + 1); + } + None => unreachable!(), + } + } - line_col(self.input, self.pos, (1, 1)) + line_col } /// Returns the entire line of the input that contains this `Position`. @@ -452,79 +487,6 @@ impl<'i> Hash for Position<'i> { } } -/// Returns the line and column of the given `pos` in `input`. -pub(crate) fn line_col(input: &str, pos: usize, start: (usize, usize)) -> (usize, usize) { - #[cfg(feature = "fast-line-col")] - { - fast_line_col(input, pos, start) - } - #[cfg(not(feature = "fast-line-col"))] - { - original_line_col(input, pos, start) - } -} - -#[inline] -#[cfg(not(feature = "fast-line-col"))] -pub(crate) fn original_line_col( - input: &str, - mut pos: usize, - start: (usize, usize), -) -> (usize, usize) { - // Position's pos is always a UTF-8 border. - let slice = &input[..pos]; - let mut chars = slice.chars().peekable(); - - let mut line_col = start; - - while pos != 0 { - match chars.next() { - Some('\r') => { - if let Some(&'\n') = chars.peek() { - chars.next(); - - if pos == 1 { - pos -= 1; - } else { - pos -= 2; - } - - line_col = (line_col.0 + 1, 1); - } else { - pos -= 1; - line_col = (line_col.0, line_col.1 + 1); - } - } - Some('\n') => { - pos -= 1; - line_col = (line_col.0 + 1, 1); - } - Some(c) => { - pos -= c.len_utf8(); - line_col = (line_col.0, line_col.1 + 1); - } - None => unreachable!(), - } - } - - line_col -} - -#[inline] -#[cfg(feature = "fast-line-col")] -fn fast_line_col(input: &str, pos: usize, start: (usize, usize)) -> (usize, usize) { - // Position's pos is always a UTF-8 border. - let slice = &input[..pos]; - - let prec_ln = memchr::memrchr(b'\n', slice.as_bytes()); - if let Some(prec_nl_pos) = prec_ln { - let lines = bytecount::count(slice[..=prec_nl_pos].as_bytes(), b'\n') + start.0; - (lines, slice[prec_nl_pos..].chars().count()) - } else { - (start.0, slice.chars().count() + start.1) - } -} - #[cfg(test)] mod tests { use super::*; -- cgit v1.2.3