aboutsummaryrefslogtreecommitdiff
path: root/src/unicode
diff options
context:
space:
mode:
authorJeff Vander Stoep <jeffv@google.com>2022-12-12 09:08:11 +0100
committerJeff Vander Stoep <jeffv@google.com>2022-12-12 09:08:51 +0100
commite3d458e4045070c172111aef0214af1819ee7403 (patch)
tree065f4b90fc4351f93487c5554f9cacd93fe29cbe /src/unicode
parent1faff9be927c85d1dfb151bc7975d02f697854df (diff)
downloadbstr-main-16k-with-phones.tar.gz
Upgrade bstr to 1.0.1main-16k-with-phones
This project was upgraded with external_updater. Usage: tools/external_updater/updater.sh update rust/crates/bstr For more info, check https://cs.android.com/android/platform/superproject/+/master:tools/external_updater/README.md Test: TreeHugger Change-Id: I97636c5410bfed620afaf1e05bd02b94670f17b0
Diffstat (limited to 'src/unicode')
-rw-r--r--src/unicode/data/GraphemeBreakTest.txt6
-rw-r--r--src/unicode/data/SentenceBreakTest.txt6
-rw-r--r--src/unicode/data/WordBreakTest.txt6
-rw-r--r--src/unicode/fsm/grapheme_break_fwd.bigendian.dfabin10589 -> 10781 bytes
-rw-r--r--src/unicode/fsm/grapheme_break_fwd.littleendian.dfabin10589 -> 10781 bytes
-rw-r--r--src/unicode/fsm/grapheme_break_fwd.rs26
-rw-r--r--src/unicode/fsm/grapheme_break_rev.bigendian.dfabin53905 -> 55271 bytes
-rw-r--r--src/unicode/fsm/grapheme_break_rev.littleendian.dfabin53905 -> 55271 bytes
-rw-r--r--src/unicode/fsm/grapheme_break_rev.rs26
-rw-r--r--src/unicode/fsm/regional_indicator_rev.rs26
-rw-r--r--src/unicode/fsm/sentence_break_fwd.bigendian.dfabin149903 -> 153619 bytes
-rw-r--r--src/unicode/fsm/sentence_break_fwd.littleendian.dfabin149903 -> 153619 bytes
-rw-r--r--src/unicode/fsm/sentence_break_fwd.rs26
-rw-r--r--src/unicode/fsm/simple_word_fwd.bigendian.dfabin8975 -> 9237 bytes
-rw-r--r--src/unicode/fsm/simple_word_fwd.littleendian.dfabin8975 -> 9237 bytes
-rw-r--r--src/unicode/fsm/simple_word_fwd.rs26
-rw-r--r--src/unicode/fsm/whitespace_anchored_fwd.rs26
-rw-r--r--src/unicode/fsm/whitespace_anchored_rev.rs26
-rw-r--r--src/unicode/fsm/word_break_fwd.bigendian.dfabin229739 -> 236309 bytes
-rw-r--r--src/unicode/fsm/word_break_fwd.littleendian.dfabin229739 -> 236309 bytes
-rw-r--r--src/unicode/fsm/word_break_fwd.rs26
-rw-r--r--src/unicode/grapheme.rs44
-rw-r--r--src/unicode/mod.rs10
-rw-r--r--src/unicode/sentence.rs15
-rw-r--r--src/unicode/whitespace.rs6
-rw-r--r--src/unicode/word.rs26
26 files changed, 171 insertions, 156 deletions
diff --git a/src/unicode/data/GraphemeBreakTest.txt b/src/unicode/data/GraphemeBreakTest.txt
index fb4fec9..eff2fd3 100644
--- a/src/unicode/data/GraphemeBreakTest.txt
+++ b/src/unicode/data/GraphemeBreakTest.txt
@@ -1,6 +1,6 @@
-# GraphemeBreakTest-12.1.0.txt
-# Date: 2019-03-10, 10:53:12 GMT
-# © 2019 Unicode®, Inc.
+# GraphemeBreakTest-14.0.0.txt
+# Date: 2021-03-08, 06:22:32 GMT
+# © 2021 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
diff --git a/src/unicode/data/SentenceBreakTest.txt b/src/unicode/data/SentenceBreakTest.txt
index 7c1c34a..61ea42c 100644
--- a/src/unicode/data/SentenceBreakTest.txt
+++ b/src/unicode/data/SentenceBreakTest.txt
@@ -1,6 +1,6 @@
-# SentenceBreakTest-12.1.0.txt
-# Date: 2019-03-10, 10:53:28 GMT
-# © 2019 Unicode®, Inc.
+# SentenceBreakTest-14.0.0.txt
+# Date: 2021-03-08, 06:22:40 GMT
+# © 2021 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
diff --git a/src/unicode/data/WordBreakTest.txt b/src/unicode/data/WordBreakTest.txt
index facd892..1d1435b 100644
--- a/src/unicode/data/WordBreakTest.txt
+++ b/src/unicode/data/WordBreakTest.txt
@@ -1,6 +1,6 @@
-# WordBreakTest-12.1.0.txt
-# Date: 2019-03-10, 10:53:29 GMT
-# © 2019 Unicode®, Inc.
+# WordBreakTest-14.0.0.txt
+# Date: 2021-03-08, 06:22:40 GMT
+# © 2021 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
diff --git a/src/unicode/fsm/grapheme_break_fwd.bigendian.dfa b/src/unicode/fsm/grapheme_break_fwd.bigendian.dfa
index 0efaaf2..31f99c1 100644
--- a/src/unicode/fsm/grapheme_break_fwd.bigendian.dfa
+++ b/src/unicode/fsm/grapheme_break_fwd.bigendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/grapheme_break_fwd.littleendian.dfa b/src/unicode/fsm/grapheme_break_fwd.littleendian.dfa
index eb24025..3a51728 100644
--- a/src/unicode/fsm/grapheme_break_fwd.littleendian.dfa
+++ b/src/unicode/fsm/grapheme_break_fwd.littleendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/grapheme_break_fwd.rs b/src/unicode/fsm/grapheme_break_fwd.rs
index b53b1d7..dea4a7e 100644
--- a/src/unicode/fsm/grapheme_break_fwd.rs
+++ b/src/unicode/fsm/grapheme_break_fwd.rs
@@ -2,11 +2,12 @@
//
// ucd-generate dfa --name GRAPHEME_BREAK_FWD --sparse --minimize --anchored --state-size 2 src/unicode/fsm/ [snip (arg too long)]
//
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
- pub static ref GRAPHEME_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = {
+pub static GRAPHEME_BREAK_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("grapheme_break_fwd.bigendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
#[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
- pub static ref GRAPHEME_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = {
+pub static GRAPHEME_BREAK_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("grapheme_break_fwd.littleendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/fsm/grapheme_break_rev.bigendian.dfa b/src/unicode/fsm/grapheme_break_rev.bigendian.dfa
index d42cd36..742d2a6 100644
--- a/src/unicode/fsm/grapheme_break_rev.bigendian.dfa
+++ b/src/unicode/fsm/grapheme_break_rev.bigendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/grapheme_break_rev.littleendian.dfa b/src/unicode/fsm/grapheme_break_rev.littleendian.dfa
index c75ea5f..d1937f2 100644
--- a/src/unicode/fsm/grapheme_break_rev.littleendian.dfa
+++ b/src/unicode/fsm/grapheme_break_rev.littleendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/grapheme_break_rev.rs b/src/unicode/fsm/grapheme_break_rev.rs
index 93e888c..2d2cd54 100644
--- a/src/unicode/fsm/grapheme_break_rev.rs
+++ b/src/unicode/fsm/grapheme_break_rev.rs
@@ -2,11 +2,12 @@
//
// ucd-generate dfa --name GRAPHEME_BREAK_REV --reverse --longest --sparse --minimize --anchored --state-size 2 src/unicode/fsm/ [snip (arg too long)]
//
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
- pub static ref GRAPHEME_BREAK_REV: ::regex_automata::SparseDFA<&'static [u8], u16> = {
+pub static GRAPHEME_BREAK_REV: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("grapheme_break_rev.bigendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
#[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
- pub static ref GRAPHEME_BREAK_REV: ::regex_automata::SparseDFA<&'static [u8], u16> = {
+pub static GRAPHEME_BREAK_REV: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("grapheme_break_rev.littleendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/fsm/regional_indicator_rev.rs b/src/unicode/fsm/regional_indicator_rev.rs
index 2bf7e4c..db7a40f 100644
--- a/src/unicode/fsm/regional_indicator_rev.rs
+++ b/src/unicode/fsm/regional_indicator_rev.rs
@@ -2,11 +2,12 @@
//
// ucd-generate dfa --name REGIONAL_INDICATOR_REV --reverse --classes --minimize --anchored --premultiply --state-size 1 src/unicode/fsm/ \p{gcb=Regional_Indicator}
//
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
- pub static ref REGIONAL_INDICATOR_REV: ::regex_automata::DenseDFA<&'static [u8], u8> = {
+pub static REGIONAL_INDICATOR_REV: ::once_cell::sync::Lazy<
+ ::regex_automata::DenseDFA<&'static [u8], u8>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("regional_indicator_rev.bigendian.dfa"),
};
- unsafe {
- ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) }
+});
#[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
- pub static ref REGIONAL_INDICATOR_REV: ::regex_automata::DenseDFA<&'static [u8], u8> = {
+pub static REGIONAL_INDICATOR_REV: ::once_cell::sync::Lazy<
+ ::regex_automata::DenseDFA<&'static [u8], u8>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("regional_indicator_rev.littleendian.dfa"),
};
- unsafe {
- ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/fsm/sentence_break_fwd.bigendian.dfa b/src/unicode/fsm/sentence_break_fwd.bigendian.dfa
index a1813d7..1abdae8 100644
--- a/src/unicode/fsm/sentence_break_fwd.bigendian.dfa
+++ b/src/unicode/fsm/sentence_break_fwd.bigendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/sentence_break_fwd.littleendian.dfa b/src/unicode/fsm/sentence_break_fwd.littleendian.dfa
index 2763583..2f8aadd 100644
--- a/src/unicode/fsm/sentence_break_fwd.littleendian.dfa
+++ b/src/unicode/fsm/sentence_break_fwd.littleendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/sentence_break_fwd.rs b/src/unicode/fsm/sentence_break_fwd.rs
index cc937a4..97dd658 100644
--- a/src/unicode/fsm/sentence_break_fwd.rs
+++ b/src/unicode/fsm/sentence_break_fwd.rs
@@ -2,11 +2,12 @@
//
// ucd-generate dfa --name SENTENCE_BREAK_FWD --minimize --sparse --anchored --state-size 4 src/unicode/fsm/ [snip (arg too long)]
//
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
- pub static ref SENTENCE_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = {
+pub static SENTENCE_BREAK_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u32>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("sentence_break_fwd.bigendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
#[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
- pub static ref SENTENCE_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = {
+pub static SENTENCE_BREAK_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u32>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("sentence_break_fwd.littleendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/fsm/simple_word_fwd.bigendian.dfa b/src/unicode/fsm/simple_word_fwd.bigendian.dfa
index adc64c1..888e465 100644
--- a/src/unicode/fsm/simple_word_fwd.bigendian.dfa
+++ b/src/unicode/fsm/simple_word_fwd.bigendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/simple_word_fwd.littleendian.dfa b/src/unicode/fsm/simple_word_fwd.littleendian.dfa
index dd48386..a1d527c 100644
--- a/src/unicode/fsm/simple_word_fwd.littleendian.dfa
+++ b/src/unicode/fsm/simple_word_fwd.littleendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/simple_word_fwd.rs b/src/unicode/fsm/simple_word_fwd.rs
index f1f3da5..32b69b6 100644
--- a/src/unicode/fsm/simple_word_fwd.rs
+++ b/src/unicode/fsm/simple_word_fwd.rs
@@ -2,11 +2,12 @@
//
// ucd-generate dfa --name SIMPLE_WORD_FWD --sparse --minimize --state-size 2 src/unicode/fsm/ \w
//
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
- pub static ref SIMPLE_WORD_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = {
+pub static SIMPLE_WORD_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("simple_word_fwd.bigendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
#[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
- pub static ref SIMPLE_WORD_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = {
+pub static SIMPLE_WORD_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("simple_word_fwd.littleendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/fsm/whitespace_anchored_fwd.rs b/src/unicode/fsm/whitespace_anchored_fwd.rs
index 419b5d4..0780412 100644
--- a/src/unicode/fsm/whitespace_anchored_fwd.rs
+++ b/src/unicode/fsm/whitespace_anchored_fwd.rs
@@ -2,11 +2,12 @@
//
// ucd-generate dfa --name WHITESPACE_ANCHORED_FWD --anchored --classes --premultiply --minimize --state-size 1 src/unicode/fsm/ \s+
//
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
- pub static ref WHITESPACE_ANCHORED_FWD: ::regex_automata::DenseDFA<&'static [u8], u8> = {
+pub static WHITESPACE_ANCHORED_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::DenseDFA<&'static [u8], u8>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("whitespace_anchored_fwd.bigendian.dfa"),
};
- unsafe {
- ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) }
+});
#[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
- pub static ref WHITESPACE_ANCHORED_FWD: ::regex_automata::DenseDFA<&'static [u8], u8> = {
+pub static WHITESPACE_ANCHORED_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::DenseDFA<&'static [u8], u8>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("whitespace_anchored_fwd.littleendian.dfa"),
};
- unsafe {
- ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/fsm/whitespace_anchored_rev.rs b/src/unicode/fsm/whitespace_anchored_rev.rs
index 301b03c..3d0d7a6 100644
--- a/src/unicode/fsm/whitespace_anchored_rev.rs
+++ b/src/unicode/fsm/whitespace_anchored_rev.rs
@@ -2,11 +2,12 @@
//
// ucd-generate dfa --name WHITESPACE_ANCHORED_REV --reverse --anchored --classes --premultiply --minimize --state-size 2 src/unicode/fsm/ \s+
//
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
- pub static ref WHITESPACE_ANCHORED_REV: ::regex_automata::DenseDFA<&'static [u16], u16> = {
+pub static WHITESPACE_ANCHORED_REV: ::once_cell::sync::Lazy<
+ ::regex_automata::DenseDFA<&'static [u16], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u16; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("whitespace_anchored_rev.bigendian.dfa"),
};
- unsafe {
- ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) }
+});
#[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
- pub static ref WHITESPACE_ANCHORED_REV: ::regex_automata::DenseDFA<&'static [u16], u16> = {
+pub static WHITESPACE_ANCHORED_REV: ::once_cell::sync::Lazy<
+ ::regex_automata::DenseDFA<&'static [u16], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u16; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("whitespace_anchored_rev.littleendian.dfa"),
};
- unsafe {
- ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/fsm/word_break_fwd.bigendian.dfa b/src/unicode/fsm/word_break_fwd.bigendian.dfa
index 1e75db6..efb9c81 100644
--- a/src/unicode/fsm/word_break_fwd.bigendian.dfa
+++ b/src/unicode/fsm/word_break_fwd.bigendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/word_break_fwd.littleendian.dfa b/src/unicode/fsm/word_break_fwd.littleendian.dfa
index e3093a3..9a716d0 100644
--- a/src/unicode/fsm/word_break_fwd.littleendian.dfa
+++ b/src/unicode/fsm/word_break_fwd.littleendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/word_break_fwd.rs b/src/unicode/fsm/word_break_fwd.rs
index fb041b7..dcb5f6b 100644
--- a/src/unicode/fsm/word_break_fwd.rs
+++ b/src/unicode/fsm/word_break_fwd.rs
@@ -2,11 +2,12 @@
//
// ucd-generate dfa --name WORD_BREAK_FWD --sparse --minimize --anchored --state-size 4 src/unicode/fsm/ [snip (arg too long)]
//
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
- pub static ref WORD_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = {
+pub static WORD_BREAK_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u32>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("word_break_fwd.bigendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
#[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
- pub static ref WORD_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = {
+pub static WORD_BREAK_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u32>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("word_break_fwd.littleendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/grapheme.rs b/src/unicode/grapheme.rs
index ad31cf1..13b730c 100644
--- a/src/unicode/grapheme.rs
+++ b/src/unicode/grapheme.rs
@@ -1,10 +1,14 @@
use regex_automata::DFA;
-use crate::ext_slice::ByteSlice;
-use crate::unicode::fsm::grapheme_break_fwd::GRAPHEME_BREAK_FWD;
-use crate::unicode::fsm::grapheme_break_rev::GRAPHEME_BREAK_REV;
-use crate::unicode::fsm::regional_indicator_rev::REGIONAL_INDICATOR_REV;
-use crate::utf8;
+use crate::{
+ ext_slice::ByteSlice,
+ unicode::fsm::{
+ grapheme_break_fwd::GRAPHEME_BREAK_FWD,
+ grapheme_break_rev::GRAPHEME_BREAK_REV,
+ regional_indicator_rev::REGIONAL_INDICATOR_REV,
+ },
+ utf8,
+};
/// An iterator over grapheme clusters in a byte string.
///
@@ -125,7 +129,7 @@ pub struct GraphemeIndices<'a> {
impl<'a> GraphemeIndices<'a> {
pub(crate) fn new(bs: &'a [u8]) -> GraphemeIndices<'a> {
- GraphemeIndices { bs: bs, forward_index: 0, reverse_index: bs.len() }
+ GraphemeIndices { bs, forward_index: 0, reverse_index: bs.len() }
}
/// View the underlying data as a subslice of the original data.
@@ -191,6 +195,22 @@ impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
pub fn decode_grapheme(bs: &[u8]) -> (&str, usize) {
if bs.is_empty() {
("", 0)
+ } else if bs.len() >= 2
+ && bs[0].is_ascii()
+ && bs[1].is_ascii()
+ && !bs[0].is_ascii_whitespace()
+ {
+ // FIXME: It is somewhat sad that we have to special case this, but it
+ // leads to a significant speed up in predominantly ASCII text. The
+ // issue here is that the DFA has a bit of overhead, and running it for
+ // every byte in mostly ASCII text results in a bit slowdown. We should
+ // re-litigate this once regex-automata 0.3 is out, but it might be
+ // hard to avoid the special case. A DFA is always going to at least
+ // require some memory access.
+
+ // Safe because all ASCII bytes are valid UTF-8.
+ let grapheme = unsafe { bs[..1].to_str_unchecked() };
+ (grapheme, 1)
} else if let Some(end) = GRAPHEME_BREAK_FWD.find(bs) {
// Safe because a match can only occur for valid UTF-8.
let grapheme = unsafe { bs[..end].to_str_unchecked() };
@@ -257,15 +277,17 @@ fn adjust_rev_for_regional_indicator(mut bs: &[u8], i: usize) -> usize {
}
}
-#[cfg(test)]
+#[cfg(all(test, feature = "std"))]
mod tests {
+ #[cfg(not(miri))]
use ucd_parse::GraphemeClusterBreakTest;
+ use crate::{ext_slice::ByteSlice, tests::LOSSY_TESTS};
+
use super::*;
- use crate::ext_slice::ByteSlice;
- use crate::tests::LOSSY_TESTS;
#[test]
+ #[cfg(not(miri))]
fn forward_ucd() {
for (i, test) in ucdtests().into_iter().enumerate() {
let given = test.grapheme_clusters.concat();
@@ -288,6 +310,7 @@ mod tests {
}
#[test]
+ #[cfg(not(miri))]
fn reverse_ucd() {
for (i, test) in ucdtests().into_iter().enumerate() {
let given = test.grapheme_clusters.concat();
@@ -329,15 +352,18 @@ mod tests {
}
}
+ #[cfg(not(miri))]
fn uniescape(s: &str) -> String {
s.chars().flat_map(|c| c.escape_unicode()).collect::<String>()
}
+ #[cfg(not(miri))]
fn uniescape_vec(strs: &[String]) -> Vec<String> {
strs.iter().map(|s| uniescape(s)).collect()
}
/// Return all of the UCD for grapheme breaks.
+ #[cfg(not(miri))]
fn ucdtests() -> Vec<GraphemeClusterBreakTest> {
const TESTDATA: &'static str =
include_str!("data/GraphemeBreakTest.txt");
diff --git a/src/unicode/mod.rs b/src/unicode/mod.rs
index 60318f4..80638e8 100644
--- a/src/unicode/mod.rs
+++ b/src/unicode/mod.rs
@@ -1,8 +1,8 @@
-pub use self::grapheme::{decode_grapheme, GraphemeIndices, Graphemes};
-pub use self::sentence::{SentenceIndices, Sentences};
-pub use self::whitespace::{whitespace_len_fwd, whitespace_len_rev};
-pub use self::word::{
- WordIndices, Words, WordsWithBreakIndices, WordsWithBreaks,
+pub use self::{
+ grapheme::{decode_grapheme, GraphemeIndices, Graphemes},
+ sentence::{SentenceIndices, Sentences},
+ whitespace::{whitespace_len_fwd, whitespace_len_rev},
+ word::{WordIndices, Words, WordsWithBreakIndices, WordsWithBreaks},
};
mod fsm;
diff --git a/src/unicode/sentence.rs b/src/unicode/sentence.rs
index 063f342..ff29c7e 100644
--- a/src/unicode/sentence.rs
+++ b/src/unicode/sentence.rs
@@ -1,8 +1,9 @@
use regex_automata::DFA;
-use crate::ext_slice::ByteSlice;
-use crate::unicode::fsm::sentence_break_fwd::SENTENCE_BREAK_FWD;
-use crate::utf8;
+use crate::{
+ ext_slice::ByteSlice,
+ unicode::fsm::sentence_break_fwd::SENTENCE_BREAK_FWD, utf8,
+};
/// An iterator over sentences in a byte string.
///
@@ -97,7 +98,7 @@ pub struct SentenceIndices<'a> {
impl<'a> SentenceIndices<'a> {
pub(crate) fn new(bs: &'a [u8]) -> SentenceIndices<'a> {
- SentenceIndices { bs: bs, forward_index: 0 }
+ SentenceIndices { bs, forward_index: 0 }
}
/// View the underlying data as a subslice of the original data.
@@ -156,13 +157,15 @@ fn decode_sentence(bs: &[u8]) -> (&str, usize) {
}
}
-#[cfg(test)]
+#[cfg(all(test, feature = "std"))]
mod tests {
+ #[cfg(not(miri))]
use ucd_parse::SentenceBreakTest;
use crate::ext_slice::ByteSlice;
#[test]
+ #[cfg(not(miri))]
fn forward_ucd() {
for (i, test) in ucdtests().into_iter().enumerate() {
let given = test.sentences.concat();
@@ -198,11 +201,13 @@ mod tests {
bytes.sentences().collect()
}
+ #[cfg(not(miri))]
fn strs_to_bstrs<S: AsRef<str>>(strs: &[S]) -> Vec<&[u8]> {
strs.iter().map(|s| s.as_ref().as_bytes()).collect()
}
/// Return all of the UCD for sentence breaks.
+ #[cfg(not(miri))]
fn ucdtests() -> Vec<SentenceBreakTest> {
const TESTDATA: &'static str =
include_str!("data/SentenceBreakTest.txt");
diff --git a/src/unicode/whitespace.rs b/src/unicode/whitespace.rs
index 949a83f..b5eff30 100644
--- a/src/unicode/whitespace.rs
+++ b/src/unicode/whitespace.rs
@@ -1,7 +1,9 @@
use regex_automata::DFA;
-use crate::unicode::fsm::whitespace_anchored_fwd::WHITESPACE_ANCHORED_FWD;
-use crate::unicode::fsm::whitespace_anchored_rev::WHITESPACE_ANCHORED_REV;
+use crate::unicode::fsm::{
+ whitespace_anchored_fwd::WHITESPACE_ANCHORED_FWD,
+ whitespace_anchored_rev::WHITESPACE_ANCHORED_REV,
+};
/// Return the first position of a non-whitespace character.
pub fn whitespace_len_fwd(slice: &[u8]) -> usize {
diff --git a/src/unicode/word.rs b/src/unicode/word.rs
index e0a5701..849f0c8 100644
--- a/src/unicode/word.rs
+++ b/src/unicode/word.rs
@@ -1,9 +1,12 @@
use regex_automata::DFA;
-use crate::ext_slice::ByteSlice;
-use crate::unicode::fsm::simple_word_fwd::SIMPLE_WORD_FWD;
-use crate::unicode::fsm::word_break_fwd::WORD_BREAK_FWD;
-use crate::utf8;
+use crate::{
+ ext_slice::ByteSlice,
+ unicode::fsm::{
+ simple_word_fwd::SIMPLE_WORD_FWD, word_break_fwd::WORD_BREAK_FWD,
+ },
+ utf8,
+};
/// An iterator over words in a byte string.
///
@@ -254,7 +257,7 @@ pub struct WordsWithBreakIndices<'a> {
impl<'a> WordsWithBreakIndices<'a> {
pub(crate) fn new(bs: &'a [u8]) -> WordsWithBreakIndices<'a> {
- WordsWithBreakIndices { bs: bs, forward_index: 0 }
+ WordsWithBreakIndices { bs, forward_index: 0 }
}
/// View the underlying data as a subslice of the original data.
@@ -316,13 +319,15 @@ fn decode_word(bs: &[u8]) -> (&str, usize) {
}
}
-#[cfg(test)]
+#[cfg(all(test, feature = "std"))]
mod tests {
+ #[cfg(not(miri))]
use ucd_parse::WordBreakTest;
use crate::ext_slice::ByteSlice;
#[test]
+ #[cfg(not(miri))]
fn forward_ucd() {
for (i, test) in ucdtests().into_iter().enumerate() {
let given = test.words.concat();
@@ -379,17 +384,26 @@ mod tests {
assert_eq!(vec!["1XY"], words(b"1XY"));
assert_eq!(vec!["\u{FEFF}", "Ты"], words("\u{FEFF}Ты".as_bytes()));
+
+ // Tests that Vithkuqi works, which was introduced in Unicode 14.
+ // This test fails prior to Unicode 14.
+ assert_eq!(
+ vec!["\u{10570}\u{10597}"],
+ words("\u{10570}\u{10597}".as_bytes())
+ );
}
fn words(bytes: &[u8]) -> Vec<&str> {
bytes.words_with_breaks().collect()
}
+ #[cfg(not(miri))]
fn strs_to_bstrs<S: AsRef<str>>(strs: &[S]) -> Vec<&[u8]> {
strs.iter().map(|s| s.as_ref().as_bytes()).collect()
}
/// Return all of the UCD for word breaks.
+ #[cfg(not(miri))]
fn ucdtests() -> Vec<WordBreakTest> {
const TESTDATA: &'static str = include_str!("data/WordBreakTest.txt");