diff options
author | Jeff Vander Stoep <jeffv@google.com> | 2022-12-12 09:08:11 +0100 |
---|---|---|
committer | Jeff Vander Stoep <jeffv@google.com> | 2022-12-12 09:08:51 +0100 |
commit | e3d458e4045070c172111aef0214af1819ee7403 (patch) | |
tree | 065f4b90fc4351f93487c5554f9cacd93fe29cbe /src/unicode | |
parent | 1faff9be927c85d1dfb151bc7975d02f697854df (diff) | |
download | bstr-main-16k-with-phones.tar.gz |
Upgrade bstr to 1.0.1 [main-16k-with-phones]
This project was upgraded with external_updater.
Usage: tools/external_updater/updater.sh update rust/crates/bstr
For more info, check https://cs.android.com/android/platform/superproject/+/master:tools/external_updater/README.md
Test: TreeHugger
Change-Id: I97636c5410bfed620afaf1e05bd02b94670f17b0
Diffstat (limited to 'src/unicode')
26 files changed, 171 insertions, 156 deletions
diff --git a/src/unicode/data/GraphemeBreakTest.txt b/src/unicode/data/GraphemeBreakTest.txt index fb4fec9..eff2fd3 100644 --- a/src/unicode/data/GraphemeBreakTest.txt +++ b/src/unicode/data/GraphemeBreakTest.txt @@ -1,6 +1,6 @@ -# GraphemeBreakTest-12.1.0.txt -# Date: 2019-03-10, 10:53:12 GMT -# © 2019 Unicode®, Inc. +# GraphemeBreakTest-14.0.0.txt +# Date: 2021-03-08, 06:22:32 GMT +# © 2021 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see http://www.unicode.org/terms_of_use.html # diff --git a/src/unicode/data/SentenceBreakTest.txt b/src/unicode/data/SentenceBreakTest.txt index 7c1c34a..61ea42c 100644 --- a/src/unicode/data/SentenceBreakTest.txt +++ b/src/unicode/data/SentenceBreakTest.txt @@ -1,6 +1,6 @@ -# SentenceBreakTest-12.1.0.txt -# Date: 2019-03-10, 10:53:28 GMT -# © 2019 Unicode®, Inc. +# SentenceBreakTest-14.0.0.txt +# Date: 2021-03-08, 06:22:40 GMT +# © 2021 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see http://www.unicode.org/terms_of_use.html # diff --git a/src/unicode/data/WordBreakTest.txt b/src/unicode/data/WordBreakTest.txt index facd892..1d1435b 100644 --- a/src/unicode/data/WordBreakTest.txt +++ b/src/unicode/data/WordBreakTest.txt @@ -1,6 +1,6 @@ -# WordBreakTest-12.1.0.txt -# Date: 2019-03-10, 10:53:29 GMT -# © 2019 Unicode®, Inc. +# WordBreakTest-14.0.0.txt +# Date: 2021-03-08, 06:22:40 GMT +# © 2021 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. 
# For terms of use, see http://www.unicode.org/terms_of_use.html # diff --git a/src/unicode/fsm/grapheme_break_fwd.bigendian.dfa b/src/unicode/fsm/grapheme_break_fwd.bigendian.dfa Binary files differindex 0efaaf2..31f99c1 100644 --- a/src/unicode/fsm/grapheme_break_fwd.bigendian.dfa +++ b/src/unicode/fsm/grapheme_break_fwd.bigendian.dfa diff --git a/src/unicode/fsm/grapheme_break_fwd.littleendian.dfa b/src/unicode/fsm/grapheme_break_fwd.littleendian.dfa Binary files differindex eb24025..3a51728 100644 --- a/src/unicode/fsm/grapheme_break_fwd.littleendian.dfa +++ b/src/unicode/fsm/grapheme_break_fwd.littleendian.dfa diff --git a/src/unicode/fsm/grapheme_break_fwd.rs b/src/unicode/fsm/grapheme_break_fwd.rs index b53b1d7..dea4a7e 100644 --- a/src/unicode/fsm/grapheme_break_fwd.rs +++ b/src/unicode/fsm/grapheme_break_fwd.rs @@ -2,11 +2,12 @@ // // ucd-generate dfa --name GRAPHEME_BREAK_FWD --sparse --minimize --anchored --state-size 2 src/unicode/fsm/ [snip (arg too long)] // -// ucd-generate 0.2.9 is available on crates.io. +// ucd-generate 0.2.12 is available on crates.io. #[cfg(target_endian = "big")] -lazy_static::lazy_static! { - pub static ref GRAPHEME_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = { +pub static GRAPHEME_BREAK_FWD: ::once_cell::sync::Lazy< + ::regex_automata::SparseDFA<&'static [u8], u16>, +> = ::once_cell::sync::Lazy::new(|| { #[repr(C)] struct Aligned<B: ?Sized> { _align: [u8; 0], @@ -18,15 +19,13 @@ lazy_static::lazy_static! { bytes: *include_bytes!("grapheme_break_fwd.bigendian.dfa"), }; - unsafe { - ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) - } - }; -} + unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) } +}); #[cfg(target_endian = "little")] -lazy_static::lazy_static! 
{ - pub static ref GRAPHEME_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = { +pub static GRAPHEME_BREAK_FWD: ::once_cell::sync::Lazy< + ::regex_automata::SparseDFA<&'static [u8], u16>, +> = ::once_cell::sync::Lazy::new(|| { #[repr(C)] struct Aligned<B: ?Sized> { _align: [u8; 0], @@ -38,8 +37,5 @@ lazy_static::lazy_static! { bytes: *include_bytes!("grapheme_break_fwd.littleendian.dfa"), }; - unsafe { - ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) - } - }; -} + unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) } +}); diff --git a/src/unicode/fsm/grapheme_break_rev.bigendian.dfa b/src/unicode/fsm/grapheme_break_rev.bigendian.dfa Binary files differindex d42cd36..742d2a6 100644 --- a/src/unicode/fsm/grapheme_break_rev.bigendian.dfa +++ b/src/unicode/fsm/grapheme_break_rev.bigendian.dfa diff --git a/src/unicode/fsm/grapheme_break_rev.littleendian.dfa b/src/unicode/fsm/grapheme_break_rev.littleendian.dfa Binary files differindex c75ea5f..d1937f2 100644 --- a/src/unicode/fsm/grapheme_break_rev.littleendian.dfa +++ b/src/unicode/fsm/grapheme_break_rev.littleendian.dfa diff --git a/src/unicode/fsm/grapheme_break_rev.rs b/src/unicode/fsm/grapheme_break_rev.rs index 93e888c..2d2cd54 100644 --- a/src/unicode/fsm/grapheme_break_rev.rs +++ b/src/unicode/fsm/grapheme_break_rev.rs @@ -2,11 +2,12 @@ // // ucd-generate dfa --name GRAPHEME_BREAK_REV --reverse --longest --sparse --minimize --anchored --state-size 2 src/unicode/fsm/ [snip (arg too long)] // -// ucd-generate 0.2.9 is available on crates.io. +// ucd-generate 0.2.12 is available on crates.io. #[cfg(target_endian = "big")] -lazy_static::lazy_static! { - pub static ref GRAPHEME_BREAK_REV: ::regex_automata::SparseDFA<&'static [u8], u16> = { +pub static GRAPHEME_BREAK_REV: ::once_cell::sync::Lazy< + ::regex_automata::SparseDFA<&'static [u8], u16>, +> = ::once_cell::sync::Lazy::new(|| { #[repr(C)] struct Aligned<B: ?Sized> { _align: [u8; 0], @@ -18,15 +19,13 @@ lazy_static::lazy_static! 
{ bytes: *include_bytes!("grapheme_break_rev.bigendian.dfa"), }; - unsafe { - ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) - } - }; -} + unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) } +}); #[cfg(target_endian = "little")] -lazy_static::lazy_static! { - pub static ref GRAPHEME_BREAK_REV: ::regex_automata::SparseDFA<&'static [u8], u16> = { +pub static GRAPHEME_BREAK_REV: ::once_cell::sync::Lazy< + ::regex_automata::SparseDFA<&'static [u8], u16>, +> = ::once_cell::sync::Lazy::new(|| { #[repr(C)] struct Aligned<B: ?Sized> { _align: [u8; 0], @@ -38,8 +37,5 @@ lazy_static::lazy_static! { bytes: *include_bytes!("grapheme_break_rev.littleendian.dfa"), }; - unsafe { - ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) - } - }; -} + unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) } +}); diff --git a/src/unicode/fsm/regional_indicator_rev.rs b/src/unicode/fsm/regional_indicator_rev.rs index 2bf7e4c..db7a40f 100644 --- a/src/unicode/fsm/regional_indicator_rev.rs +++ b/src/unicode/fsm/regional_indicator_rev.rs @@ -2,11 +2,12 @@ // // ucd-generate dfa --name REGIONAL_INDICATOR_REV --reverse --classes --minimize --anchored --premultiply --state-size 1 src/unicode/fsm/ \p{gcb=Regional_Indicator} // -// ucd-generate 0.2.9 is available on crates.io. +// ucd-generate 0.2.12 is available on crates.io. #[cfg(target_endian = "big")] -lazy_static::lazy_static! { - pub static ref REGIONAL_INDICATOR_REV: ::regex_automata::DenseDFA<&'static [u8], u8> = { +pub static REGIONAL_INDICATOR_REV: ::once_cell::sync::Lazy< + ::regex_automata::DenseDFA<&'static [u8], u8>, +> = ::once_cell::sync::Lazy::new(|| { #[repr(C)] struct Aligned<B: ?Sized> { _align: [u8; 0], @@ -18,15 +19,13 @@ lazy_static::lazy_static! 
{ bytes: *include_bytes!("regional_indicator_rev.bigendian.dfa"), }; - unsafe { - ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) - } - }; -} + unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) } +}); #[cfg(target_endian = "little")] -lazy_static::lazy_static! { - pub static ref REGIONAL_INDICATOR_REV: ::regex_automata::DenseDFA<&'static [u8], u8> = { +pub static REGIONAL_INDICATOR_REV: ::once_cell::sync::Lazy< + ::regex_automata::DenseDFA<&'static [u8], u8>, +> = ::once_cell::sync::Lazy::new(|| { #[repr(C)] struct Aligned<B: ?Sized> { _align: [u8; 0], @@ -38,8 +37,5 @@ lazy_static::lazy_static! { bytes: *include_bytes!("regional_indicator_rev.littleendian.dfa"), }; - unsafe { - ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) - } - }; -} + unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) } +}); diff --git a/src/unicode/fsm/sentence_break_fwd.bigendian.dfa b/src/unicode/fsm/sentence_break_fwd.bigendian.dfa Binary files differindex a1813d7..1abdae8 100644 --- a/src/unicode/fsm/sentence_break_fwd.bigendian.dfa +++ b/src/unicode/fsm/sentence_break_fwd.bigendian.dfa diff --git a/src/unicode/fsm/sentence_break_fwd.littleendian.dfa b/src/unicode/fsm/sentence_break_fwd.littleendian.dfa Binary files differindex 2763583..2f8aadd 100644 --- a/src/unicode/fsm/sentence_break_fwd.littleendian.dfa +++ b/src/unicode/fsm/sentence_break_fwd.littleendian.dfa diff --git a/src/unicode/fsm/sentence_break_fwd.rs b/src/unicode/fsm/sentence_break_fwd.rs index cc937a4..97dd658 100644 --- a/src/unicode/fsm/sentence_break_fwd.rs +++ b/src/unicode/fsm/sentence_break_fwd.rs @@ -2,11 +2,12 @@ // // ucd-generate dfa --name SENTENCE_BREAK_FWD --minimize --sparse --anchored --state-size 4 src/unicode/fsm/ [snip (arg too long)] // -// ucd-generate 0.2.9 is available on crates.io. +// ucd-generate 0.2.12 is available on crates.io. #[cfg(target_endian = "big")] -lazy_static::lazy_static! 
{ - pub static ref SENTENCE_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = { +pub static SENTENCE_BREAK_FWD: ::once_cell::sync::Lazy< + ::regex_automata::SparseDFA<&'static [u8], u32>, +> = ::once_cell::sync::Lazy::new(|| { #[repr(C)] struct Aligned<B: ?Sized> { _align: [u8; 0], @@ -18,15 +19,13 @@ lazy_static::lazy_static! { bytes: *include_bytes!("sentence_break_fwd.bigendian.dfa"), }; - unsafe { - ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) - } - }; -} + unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) } +}); #[cfg(target_endian = "little")] -lazy_static::lazy_static! { - pub static ref SENTENCE_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = { +pub static SENTENCE_BREAK_FWD: ::once_cell::sync::Lazy< + ::regex_automata::SparseDFA<&'static [u8], u32>, +> = ::once_cell::sync::Lazy::new(|| { #[repr(C)] struct Aligned<B: ?Sized> { _align: [u8; 0], @@ -38,8 +37,5 @@ lazy_static::lazy_static! { bytes: *include_bytes!("sentence_break_fwd.littleendian.dfa"), }; - unsafe { - ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) - } - }; -} + unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) } +}); diff --git a/src/unicode/fsm/simple_word_fwd.bigendian.dfa b/src/unicode/fsm/simple_word_fwd.bigendian.dfa Binary files differindex adc64c1..888e465 100644 --- a/src/unicode/fsm/simple_word_fwd.bigendian.dfa +++ b/src/unicode/fsm/simple_word_fwd.bigendian.dfa diff --git a/src/unicode/fsm/simple_word_fwd.littleendian.dfa b/src/unicode/fsm/simple_word_fwd.littleendian.dfa Binary files differindex dd48386..a1d527c 100644 --- a/src/unicode/fsm/simple_word_fwd.littleendian.dfa +++ b/src/unicode/fsm/simple_word_fwd.littleendian.dfa diff --git a/src/unicode/fsm/simple_word_fwd.rs b/src/unicode/fsm/simple_word_fwd.rs index f1f3da5..32b69b6 100644 --- a/src/unicode/fsm/simple_word_fwd.rs +++ b/src/unicode/fsm/simple_word_fwd.rs @@ -2,11 +2,12 @@ // // ucd-generate dfa --name SIMPLE_WORD_FWD --sparse --minimize 
--state-size 2 src/unicode/fsm/ \w // -// ucd-generate 0.2.9 is available on crates.io. +// ucd-generate 0.2.12 is available on crates.io. #[cfg(target_endian = "big")] -lazy_static::lazy_static! { - pub static ref SIMPLE_WORD_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = { +pub static SIMPLE_WORD_FWD: ::once_cell::sync::Lazy< + ::regex_automata::SparseDFA<&'static [u8], u16>, +> = ::once_cell::sync::Lazy::new(|| { #[repr(C)] struct Aligned<B: ?Sized> { _align: [u8; 0], @@ -18,15 +19,13 @@ lazy_static::lazy_static! { bytes: *include_bytes!("simple_word_fwd.bigendian.dfa"), }; - unsafe { - ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) - } - }; -} + unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) } +}); #[cfg(target_endian = "little")] -lazy_static::lazy_static! { - pub static ref SIMPLE_WORD_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = { +pub static SIMPLE_WORD_FWD: ::once_cell::sync::Lazy< + ::regex_automata::SparseDFA<&'static [u8], u16>, +> = ::once_cell::sync::Lazy::new(|| { #[repr(C)] struct Aligned<B: ?Sized> { _align: [u8; 0], @@ -38,8 +37,5 @@ lazy_static::lazy_static! { bytes: *include_bytes!("simple_word_fwd.littleendian.dfa"), }; - unsafe { - ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) - } - }; -} + unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) } +}); diff --git a/src/unicode/fsm/whitespace_anchored_fwd.rs b/src/unicode/fsm/whitespace_anchored_fwd.rs index 419b5d4..0780412 100644 --- a/src/unicode/fsm/whitespace_anchored_fwd.rs +++ b/src/unicode/fsm/whitespace_anchored_fwd.rs @@ -2,11 +2,12 @@ // // ucd-generate dfa --name WHITESPACE_ANCHORED_FWD --anchored --classes --premultiply --minimize --state-size 1 src/unicode/fsm/ \s+ // -// ucd-generate 0.2.9 is available on crates.io. +// ucd-generate 0.2.12 is available on crates.io. #[cfg(target_endian = "big")] -lazy_static::lazy_static! 
{ - pub static ref WHITESPACE_ANCHORED_FWD: ::regex_automata::DenseDFA<&'static [u8], u8> = { +pub static WHITESPACE_ANCHORED_FWD: ::once_cell::sync::Lazy< + ::regex_automata::DenseDFA<&'static [u8], u8>, +> = ::once_cell::sync::Lazy::new(|| { #[repr(C)] struct Aligned<B: ?Sized> { _align: [u8; 0], @@ -18,15 +19,13 @@ lazy_static::lazy_static! { bytes: *include_bytes!("whitespace_anchored_fwd.bigendian.dfa"), }; - unsafe { - ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) - } - }; -} + unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) } +}); #[cfg(target_endian = "little")] -lazy_static::lazy_static! { - pub static ref WHITESPACE_ANCHORED_FWD: ::regex_automata::DenseDFA<&'static [u8], u8> = { +pub static WHITESPACE_ANCHORED_FWD: ::once_cell::sync::Lazy< + ::regex_automata::DenseDFA<&'static [u8], u8>, +> = ::once_cell::sync::Lazy::new(|| { #[repr(C)] struct Aligned<B: ?Sized> { _align: [u8; 0], @@ -38,8 +37,5 @@ lazy_static::lazy_static! { bytes: *include_bytes!("whitespace_anchored_fwd.littleendian.dfa"), }; - unsafe { - ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) - } - }; -} + unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) } +}); diff --git a/src/unicode/fsm/whitespace_anchored_rev.rs b/src/unicode/fsm/whitespace_anchored_rev.rs index 301b03c..3d0d7a6 100644 --- a/src/unicode/fsm/whitespace_anchored_rev.rs +++ b/src/unicode/fsm/whitespace_anchored_rev.rs @@ -2,11 +2,12 @@ // // ucd-generate dfa --name WHITESPACE_ANCHORED_REV --reverse --anchored --classes --premultiply --minimize --state-size 2 src/unicode/fsm/ \s+ // -// ucd-generate 0.2.9 is available on crates.io. +// ucd-generate 0.2.12 is available on crates.io. #[cfg(target_endian = "big")] -lazy_static::lazy_static! 
{ - pub static ref WHITESPACE_ANCHORED_REV: ::regex_automata::DenseDFA<&'static [u16], u16> = { +pub static WHITESPACE_ANCHORED_REV: ::once_cell::sync::Lazy< + ::regex_automata::DenseDFA<&'static [u16], u16>, +> = ::once_cell::sync::Lazy::new(|| { #[repr(C)] struct Aligned<B: ?Sized> { _align: [u16; 0], @@ -18,15 +19,13 @@ lazy_static::lazy_static! { bytes: *include_bytes!("whitespace_anchored_rev.bigendian.dfa"), }; - unsafe { - ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) - } - }; -} + unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) } +}); #[cfg(target_endian = "little")] -lazy_static::lazy_static! { - pub static ref WHITESPACE_ANCHORED_REV: ::regex_automata::DenseDFA<&'static [u16], u16> = { +pub static WHITESPACE_ANCHORED_REV: ::once_cell::sync::Lazy< + ::regex_automata::DenseDFA<&'static [u16], u16>, +> = ::once_cell::sync::Lazy::new(|| { #[repr(C)] struct Aligned<B: ?Sized> { _align: [u16; 0], @@ -38,8 +37,5 @@ lazy_static::lazy_static! { bytes: *include_bytes!("whitespace_anchored_rev.littleendian.dfa"), }; - unsafe { - ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) - } - }; -} + unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) } +}); diff --git a/src/unicode/fsm/word_break_fwd.bigendian.dfa b/src/unicode/fsm/word_break_fwd.bigendian.dfa Binary files differindex 1e75db6..efb9c81 100644 --- a/src/unicode/fsm/word_break_fwd.bigendian.dfa +++ b/src/unicode/fsm/word_break_fwd.bigendian.dfa diff --git a/src/unicode/fsm/word_break_fwd.littleendian.dfa b/src/unicode/fsm/word_break_fwd.littleendian.dfa Binary files differindex e3093a3..9a716d0 100644 --- a/src/unicode/fsm/word_break_fwd.littleendian.dfa +++ b/src/unicode/fsm/word_break_fwd.littleendian.dfa diff --git a/src/unicode/fsm/word_break_fwd.rs b/src/unicode/fsm/word_break_fwd.rs index fb041b7..dcb5f6b 100644 --- a/src/unicode/fsm/word_break_fwd.rs +++ b/src/unicode/fsm/word_break_fwd.rs @@ -2,11 +2,12 @@ // // ucd-generate dfa --name WORD_BREAK_FWD --sparse 
--minimize --anchored --state-size 4 src/unicode/fsm/ [snip (arg too long)] // -// ucd-generate 0.2.9 is available on crates.io. +// ucd-generate 0.2.12 is available on crates.io. #[cfg(target_endian = "big")] -lazy_static::lazy_static! { - pub static ref WORD_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = { +pub static WORD_BREAK_FWD: ::once_cell::sync::Lazy< + ::regex_automata::SparseDFA<&'static [u8], u32>, +> = ::once_cell::sync::Lazy::new(|| { #[repr(C)] struct Aligned<B: ?Sized> { _align: [u8; 0], @@ -18,15 +19,13 @@ lazy_static::lazy_static! { bytes: *include_bytes!("word_break_fwd.bigendian.dfa"), }; - unsafe { - ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) - } - }; -} + unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) } +}); #[cfg(target_endian = "little")] -lazy_static::lazy_static! { - pub static ref WORD_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = { +pub static WORD_BREAK_FWD: ::once_cell::sync::Lazy< + ::regex_automata::SparseDFA<&'static [u8], u32>, +> = ::once_cell::sync::Lazy::new(|| { #[repr(C)] struct Aligned<B: ?Sized> { _align: [u8; 0], @@ -38,8 +37,5 @@ lazy_static::lazy_static! 
{ bytes: *include_bytes!("word_break_fwd.littleendian.dfa"), }; - unsafe { - ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) - } - }; -} + unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) } +}); diff --git a/src/unicode/grapheme.rs b/src/unicode/grapheme.rs index ad31cf1..13b730c 100644 --- a/src/unicode/grapheme.rs +++ b/src/unicode/grapheme.rs @@ -1,10 +1,14 @@ use regex_automata::DFA; -use crate::ext_slice::ByteSlice; -use crate::unicode::fsm::grapheme_break_fwd::GRAPHEME_BREAK_FWD; -use crate::unicode::fsm::grapheme_break_rev::GRAPHEME_BREAK_REV; -use crate::unicode::fsm::regional_indicator_rev::REGIONAL_INDICATOR_REV; -use crate::utf8; +use crate::{ + ext_slice::ByteSlice, + unicode::fsm::{ + grapheme_break_fwd::GRAPHEME_BREAK_FWD, + grapheme_break_rev::GRAPHEME_BREAK_REV, + regional_indicator_rev::REGIONAL_INDICATOR_REV, + }, + utf8, +}; /// An iterator over grapheme clusters in a byte string. /// @@ -125,7 +129,7 @@ pub struct GraphemeIndices<'a> { impl<'a> GraphemeIndices<'a> { pub(crate) fn new(bs: &'a [u8]) -> GraphemeIndices<'a> { - GraphemeIndices { bs: bs, forward_index: 0, reverse_index: bs.len() } + GraphemeIndices { bs, forward_index: 0, reverse_index: bs.len() } } /// View the underlying data as a subslice of the original data. @@ -191,6 +195,22 @@ impl<'a> DoubleEndedIterator for GraphemeIndices<'a> { pub fn decode_grapheme(bs: &[u8]) -> (&str, usize) { if bs.is_empty() { ("", 0) + } else if bs.len() >= 2 + && bs[0].is_ascii() + && bs[1].is_ascii() + && !bs[0].is_ascii_whitespace() + { + // FIXME: It is somewhat sad that we have to special case this, but it + // leads to a significant speed up in predominantly ASCII text. The + // issue here is that the DFA has a bit of overhead, and running it for + // every byte in mostly ASCII text results in a bit slowdown. We should + // re-litigate this once regex-automata 0.3 is out, but it might be + // hard to avoid the special case. 
A DFA is always going to at least + // require some memory access. + + // Safe because all ASCII bytes are valid UTF-8. + let grapheme = unsafe { bs[..1].to_str_unchecked() }; + (grapheme, 1) } else if let Some(end) = GRAPHEME_BREAK_FWD.find(bs) { // Safe because a match can only occur for valid UTF-8. let grapheme = unsafe { bs[..end].to_str_unchecked() }; @@ -257,15 +277,17 @@ fn adjust_rev_for_regional_indicator(mut bs: &[u8], i: usize) -> usize { } } -#[cfg(test)] +#[cfg(all(test, feature = "std"))] mod tests { + #[cfg(not(miri))] use ucd_parse::GraphemeClusterBreakTest; + use crate::{ext_slice::ByteSlice, tests::LOSSY_TESTS}; + use super::*; - use crate::ext_slice::ByteSlice; - use crate::tests::LOSSY_TESTS; #[test] + #[cfg(not(miri))] fn forward_ucd() { for (i, test) in ucdtests().into_iter().enumerate() { let given = test.grapheme_clusters.concat(); @@ -288,6 +310,7 @@ mod tests { } #[test] + #[cfg(not(miri))] fn reverse_ucd() { for (i, test) in ucdtests().into_iter().enumerate() { let given = test.grapheme_clusters.concat(); @@ -329,15 +352,18 @@ mod tests { } } + #[cfg(not(miri))] fn uniescape(s: &str) -> String { s.chars().flat_map(|c| c.escape_unicode()).collect::<String>() } + #[cfg(not(miri))] fn uniescape_vec(strs: &[String]) -> Vec<String> { strs.iter().map(|s| uniescape(s)).collect() } /// Return all of the UCD for grapheme breaks. 
+ #[cfg(not(miri))] fn ucdtests() -> Vec<GraphemeClusterBreakTest> { const TESTDATA: &'static str = include_str!("data/GraphemeBreakTest.txt"); diff --git a/src/unicode/mod.rs b/src/unicode/mod.rs index 60318f4..80638e8 100644 --- a/src/unicode/mod.rs +++ b/src/unicode/mod.rs @@ -1,8 +1,8 @@ -pub use self::grapheme::{decode_grapheme, GraphemeIndices, Graphemes}; -pub use self::sentence::{SentenceIndices, Sentences}; -pub use self::whitespace::{whitespace_len_fwd, whitespace_len_rev}; -pub use self::word::{ - WordIndices, Words, WordsWithBreakIndices, WordsWithBreaks, +pub use self::{ + grapheme::{decode_grapheme, GraphemeIndices, Graphemes}, + sentence::{SentenceIndices, Sentences}, + whitespace::{whitespace_len_fwd, whitespace_len_rev}, + word::{WordIndices, Words, WordsWithBreakIndices, WordsWithBreaks}, }; mod fsm; diff --git a/src/unicode/sentence.rs b/src/unicode/sentence.rs index 063f342..ff29c7e 100644 --- a/src/unicode/sentence.rs +++ b/src/unicode/sentence.rs @@ -1,8 +1,9 @@ use regex_automata::DFA; -use crate::ext_slice::ByteSlice; -use crate::unicode::fsm::sentence_break_fwd::SENTENCE_BREAK_FWD; -use crate::utf8; +use crate::{ + ext_slice::ByteSlice, + unicode::fsm::sentence_break_fwd::SENTENCE_BREAK_FWD, utf8, +}; /// An iterator over sentences in a byte string. /// @@ -97,7 +98,7 @@ pub struct SentenceIndices<'a> { impl<'a> SentenceIndices<'a> { pub(crate) fn new(bs: &'a [u8]) -> SentenceIndices<'a> { - SentenceIndices { bs: bs, forward_index: 0 } + SentenceIndices { bs, forward_index: 0 } } /// View the underlying data as a subslice of the original data. 
@@ -156,13 +157,15 @@ fn decode_sentence(bs: &[u8]) -> (&str, usize) { } } -#[cfg(test)] +#[cfg(all(test, feature = "std"))] mod tests { + #[cfg(not(miri))] use ucd_parse::SentenceBreakTest; use crate::ext_slice::ByteSlice; #[test] + #[cfg(not(miri))] fn forward_ucd() { for (i, test) in ucdtests().into_iter().enumerate() { let given = test.sentences.concat(); @@ -198,11 +201,13 @@ mod tests { bytes.sentences().collect() } + #[cfg(not(miri))] fn strs_to_bstrs<S: AsRef<str>>(strs: &[S]) -> Vec<&[u8]> { strs.iter().map(|s| s.as_ref().as_bytes()).collect() } /// Return all of the UCD for sentence breaks. + #[cfg(not(miri))] fn ucdtests() -> Vec<SentenceBreakTest> { const TESTDATA: &'static str = include_str!("data/SentenceBreakTest.txt"); diff --git a/src/unicode/whitespace.rs b/src/unicode/whitespace.rs index 949a83f..b5eff30 100644 --- a/src/unicode/whitespace.rs +++ b/src/unicode/whitespace.rs @@ -1,7 +1,9 @@ use regex_automata::DFA; -use crate::unicode::fsm::whitespace_anchored_fwd::WHITESPACE_ANCHORED_FWD; -use crate::unicode::fsm::whitespace_anchored_rev::WHITESPACE_ANCHORED_REV; +use crate::unicode::fsm::{ + whitespace_anchored_fwd::WHITESPACE_ANCHORED_FWD, + whitespace_anchored_rev::WHITESPACE_ANCHORED_REV, +}; /// Return the first position of a non-whitespace character. pub fn whitespace_len_fwd(slice: &[u8]) -> usize { diff --git a/src/unicode/word.rs b/src/unicode/word.rs index e0a5701..849f0c8 100644 --- a/src/unicode/word.rs +++ b/src/unicode/word.rs @@ -1,9 +1,12 @@ use regex_automata::DFA; -use crate::ext_slice::ByteSlice; -use crate::unicode::fsm::simple_word_fwd::SIMPLE_WORD_FWD; -use crate::unicode::fsm::word_break_fwd::WORD_BREAK_FWD; -use crate::utf8; +use crate::{ + ext_slice::ByteSlice, + unicode::fsm::{ + simple_word_fwd::SIMPLE_WORD_FWD, word_break_fwd::WORD_BREAK_FWD, + }, + utf8, +}; /// An iterator over words in a byte string. 
/// @@ -254,7 +257,7 @@ pub struct WordsWithBreakIndices<'a> { impl<'a> WordsWithBreakIndices<'a> { pub(crate) fn new(bs: &'a [u8]) -> WordsWithBreakIndices<'a> { - WordsWithBreakIndices { bs: bs, forward_index: 0 } + WordsWithBreakIndices { bs, forward_index: 0 } } /// View the underlying data as a subslice of the original data. @@ -316,13 +319,15 @@ fn decode_word(bs: &[u8]) -> (&str, usize) { } } -#[cfg(test)] +#[cfg(all(test, feature = "std"))] mod tests { + #[cfg(not(miri))] use ucd_parse::WordBreakTest; use crate::ext_slice::ByteSlice; #[test] + #[cfg(not(miri))] fn forward_ucd() { for (i, test) in ucdtests().into_iter().enumerate() { let given = test.words.concat(); @@ -379,17 +384,26 @@ mod tests { assert_eq!(vec!["1XY"], words(b"1XY")); assert_eq!(vec!["\u{FEFF}", "Ты"], words("\u{FEFF}Ты".as_bytes())); + + // Tests that Vithkuqi works, which was introduced in Unicode 14. + // This test fails prior to Unicode 14. + assert_eq!( + vec!["\u{10570}\u{10597}"], + words("\u{10570}\u{10597}".as_bytes()) + ); } fn words(bytes: &[u8]) -> Vec<&str> { bytes.words_with_breaks().collect() } + #[cfg(not(miri))] fn strs_to_bstrs<S: AsRef<str>>(strs: &[S]) -> Vec<&[u8]> { strs.iter().map(|s| s.as_ref().as_bytes()).collect() } /// Return all of the UCD for word breaks. + #[cfg(not(miri))] fn ucdtests() -> Vec<WordBreakTest> { const TESTDATA: &'static str = include_str!("data/WordBreakTest.txt"); |