Upgrade bstr to 1.0.1 (main-16k-with-phones)

This project was upgraded with external_updater.
Usage: tools/external_updater/updater.sh update rust/crates/bstr
For more info, check https://cs.android.com/android/platform/superproject/+/master:tools/external_updater/README.md

Test: TreeHugger
Change-Id: I97636c5410bfed620afaf1e05bd02b94670f17b0
author: Jeff Vander Stoep <jeffv@google.com> 2022-12-12 09:08:11 +0100
committer: Jeff Vander Stoep <jeffv@google.com> 2022-12-12 09:08:51 +0100
commit: e3d458e4045070c172111aef0214af1819ee7403 (patch)
tree: 065f4b90fc4351f93487c5554f9cacd93fe29cbe /src/unicode
parent: 1faff9be927c85d1dfb151bc7975d02f697854df (diff)
download: bstr-main-16k-with-phones.tar.gz
26 files changed, 171 insertions, 156 deletions
diff --git a/src/unicode/data/GraphemeBreakTest.txt b/src/unicode/data/GraphemeBreakTest.txt
index fb4fec9..eff2fd3 100644
--- a/src/unicode/data/GraphemeBreakTest.txt
+++ b/src/unicode/data/GraphemeBreakTest.txt
@@ -1,6 +1,6 @@
-# GraphemeBreakTest-12.1.0.txt
-# Date: 2019-03-10, 10:53:12 GMT
-# © 2019 Unicode®, Inc.
+# GraphemeBreakTest-14.0.0.txt
+# Date: 2021-03-08, 06:22:32 GMT
+# © 2021 Unicode®, Inc.
 # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
 # For terms of use, see http://www.unicode.org/terms_of_use.html
 #
diff --git a/src/unicode/data/SentenceBreakTest.txt b/src/unicode/data/SentenceBreakTest.txt
index 7c1c34a..61ea42c 100644
--- a/src/unicode/data/SentenceBreakTest.txt
+++ b/src/unicode/data/SentenceBreakTest.txt
@@ -1,6 +1,6 @@
-# SentenceBreakTest-12.1.0.txt
-# Date: 2019-03-10, 10:53:28 GMT
-# © 2019 Unicode®, Inc.
+# SentenceBreakTest-14.0.0.txt
+# Date: 2021-03-08, 06:22:40 GMT
+# © 2021 Unicode®, Inc.
 # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
 # For terms of use, see http://www.unicode.org/terms_of_use.html
 #
diff --git a/src/unicode/data/WordBreakTest.txt b/src/unicode/data/WordBreakTest.txt
index facd892..1d1435b 100644
--- a/src/unicode/data/WordBreakTest.txt
+++ b/src/unicode/data/WordBreakTest.txt
@@ -1,6 +1,6 @@
-# WordBreakTest-12.1.0.txt
-# Date: 2019-03-10, 10:53:29 GMT
-# © 2019 Unicode®, Inc.
+# WordBreakTest-14.0.0.txt
+# Date: 2021-03-08, 06:22:40 GMT
+# © 2021 Unicode®, Inc.
 # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
 # For terms of use, see http://www.unicode.org/terms_of_use.html
 #
diff --git a/src/unicode/fsm/grapheme_break_fwd.bigendian.dfa b/src/unicode/fsm/grapheme_break_fwd.bigendian.dfa
index 0efaaf2..31f99c1 100644
--- a/src/unicode/fsm/grapheme_break_fwd.bigendian.dfa
+++ b/src/unicode/fsm/grapheme_break_fwd.bigendian.dfa
diff --git a/src/unicode/fsm/grapheme_break_fwd.littleendian.dfa b/src/unicode/fsm/grapheme_break_fwd.littleendian.dfa
index eb24025..3a51728 100644
--- a/src/unicode/fsm/grapheme_break_fwd.littleendian.dfa
+++ b/src/unicode/fsm/grapheme_break_fwd.littleendian.dfa
diff --git a/src/unicode/fsm/grapheme_break_fwd.rs b/src/unicode/fsm/grapheme_break_fwd.rs
index b53b1d7..dea4a7e 100644
--- a/src/unicode/fsm/grapheme_break_fwd.rs
+++ b/src/unicode/fsm/grapheme_break_fwd.rs
@@ -2,11 +2,12 @@
 //
 //   ucd-generate dfa --name GRAPHEME_BREAK_FWD --sparse --minimize --anchored --state-size 2 src/unicode/fsm/ [snip (arg too long)]
 //
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
 
 #[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
-  pub static ref GRAPHEME_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = {
+pub static GRAPHEME_BREAK_FWD: ::once_cell::sync::Lazy<
+    ::regex_automata::SparseDFA<&'static [u8], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
     #[repr(C)]
     struct Aligned<B: ?Sized> {
         _align: [u8; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
         bytes: *include_bytes!("grapheme_break_fwd.bigendian.dfa"),
     };
 
-    unsafe {
-      ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
-    }
-  };
-}
+    unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
 
 #[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
-  pub static ref GRAPHEME_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = {
+pub static GRAPHEME_BREAK_FWD: ::once_cell::sync::Lazy<
+    ::regex_automata::SparseDFA<&'static [u8], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
     #[repr(C)]
     struct Aligned<B: ?Sized> {
         _align: [u8; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
         bytes: *include_bytes!("grapheme_break_fwd.littleendian.dfa"),
     };
 
-    unsafe {
-      ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
-    }
-  };
-}
+    unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/fsm/grapheme_break_rev.bigendian.dfa b/src/unicode/fsm/grapheme_break_rev.bigendian.dfa
index d42cd36..742d2a6 100644
--- a/src/unicode/fsm/grapheme_break_rev.bigendian.dfa
+++ b/src/unicode/fsm/grapheme_break_rev.bigendian.dfa
diff --git a/src/unicode/fsm/grapheme_break_rev.littleendian.dfa b/src/unicode/fsm/grapheme_break_rev.littleendian.dfa
index c75ea5f..d1937f2 100644
--- a/src/unicode/fsm/grapheme_break_rev.littleendian.dfa
+++ b/src/unicode/fsm/grapheme_break_rev.littleendian.dfa
diff --git a/src/unicode/fsm/grapheme_break_rev.rs b/src/unicode/fsm/grapheme_break_rev.rs
index 93e888c..2d2cd54 100644
--- a/src/unicode/fsm/grapheme_break_rev.rs
+++ b/src/unicode/fsm/grapheme_break_rev.rs
@@ -2,11 +2,12 @@
 //
 //   ucd-generate dfa --name GRAPHEME_BREAK_REV --reverse --longest --sparse --minimize --anchored --state-size 2 src/unicode/fsm/ [snip (arg too long)]
 //
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
 
 #[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
-  pub static ref GRAPHEME_BREAK_REV: ::regex_automata::SparseDFA<&'static [u8], u16> = {
+pub static GRAPHEME_BREAK_REV: ::once_cell::sync::Lazy<
+    ::regex_automata::SparseDFA<&'static [u8], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
     #[repr(C)]
     struct Aligned<B: ?Sized> {
         _align: [u8; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
         bytes: *include_bytes!("grapheme_break_rev.bigendian.dfa"),
     };
 
-    unsafe {
-      ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
-    }
-  };
-}
+    unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
 
 #[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
-  pub static ref GRAPHEME_BREAK_REV: ::regex_automata::SparseDFA<&'static [u8], u16> = {
+pub static GRAPHEME_BREAK_REV: ::once_cell::sync::Lazy<
+    ::regex_automata::SparseDFA<&'static [u8], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
     #[repr(C)]
     struct Aligned<B: ?Sized> {
         _align: [u8; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
         bytes: *include_bytes!("grapheme_break_rev.littleendian.dfa"),
     };
 
-    unsafe {
-      ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
-    }
-  };
-}
+    unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/fsm/regional_indicator_rev.rs b/src/unicode/fsm/regional_indicator_rev.rs
index 2bf7e4c..db7a40f 100644
--- a/src/unicode/fsm/regional_indicator_rev.rs
+++ b/src/unicode/fsm/regional_indicator_rev.rs
@@ -2,11 +2,12 @@
 //
 //   ucd-generate dfa --name REGIONAL_INDICATOR_REV --reverse --classes --minimize --anchored --premultiply --state-size 1 src/unicode/fsm/ \p{gcb=Regional_Indicator}
 //
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
 
 #[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
-  pub static ref REGIONAL_INDICATOR_REV: ::regex_automata::DenseDFA<&'static [u8], u8> = {
+pub static REGIONAL_INDICATOR_REV: ::once_cell::sync::Lazy<
+    ::regex_automata::DenseDFA<&'static [u8], u8>,
+> = ::once_cell::sync::Lazy::new(|| {
     #[repr(C)]
     struct Aligned<B: ?Sized> {
         _align: [u8; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
         bytes: *include_bytes!("regional_indicator_rev.bigendian.dfa"),
     };
 
-    unsafe {
-      ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes)
-    }
-  };
-}
+    unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) }
+});
 
 #[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
-  pub static ref REGIONAL_INDICATOR_REV: ::regex_automata::DenseDFA<&'static [u8], u8> = {
+pub static REGIONAL_INDICATOR_REV: ::once_cell::sync::Lazy<
+    ::regex_automata::DenseDFA<&'static [u8], u8>,
+> = ::once_cell::sync::Lazy::new(|| {
     #[repr(C)]
     struct Aligned<B: ?Sized> {
         _align: [u8; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
         bytes: *include_bytes!("regional_indicator_rev.littleendian.dfa"),
     };
 
-    unsafe {
-      ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes)
-    }
-  };
-}
+    unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/fsm/sentence_break_fwd.bigendian.dfa b/src/unicode/fsm/sentence_break_fwd.bigendian.dfa
index a1813d7..1abdae8 100644
--- a/src/unicode/fsm/sentence_break_fwd.bigendian.dfa
+++ b/src/unicode/fsm/sentence_break_fwd.bigendian.dfa
diff --git a/src/unicode/fsm/sentence_break_fwd.littleendian.dfa b/src/unicode/fsm/sentence_break_fwd.littleendian.dfa
index 2763583..2f8aadd 100644
--- a/src/unicode/fsm/sentence_break_fwd.littleendian.dfa
+++ b/src/unicode/fsm/sentence_break_fwd.littleendian.dfa
diff --git a/src/unicode/fsm/sentence_break_fwd.rs b/src/unicode/fsm/sentence_break_fwd.rs
index cc937a4..97dd658 100644
--- a/src/unicode/fsm/sentence_break_fwd.rs
+++ b/src/unicode/fsm/sentence_break_fwd.rs
@@ -2,11 +2,12 @@
 //
 //   ucd-generate dfa --name SENTENCE_BREAK_FWD --minimize --sparse --anchored --state-size 4 src/unicode/fsm/ [snip (arg too long)]
 //
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
 
 #[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
-  pub static ref SENTENCE_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = {
+pub static SENTENCE_BREAK_FWD: ::once_cell::sync::Lazy<
+    ::regex_automata::SparseDFA<&'static [u8], u32>,
+> = ::once_cell::sync::Lazy::new(|| {
     #[repr(C)]
     struct Aligned<B: ?Sized> {
         _align: [u8; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
         bytes: *include_bytes!("sentence_break_fwd.bigendian.dfa"),
     };
 
-    unsafe {
-      ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
-    }
-  };
-}
+    unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
 
 #[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
-  pub static ref SENTENCE_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = {
+pub static SENTENCE_BREAK_FWD: ::once_cell::sync::Lazy<
+    ::regex_automata::SparseDFA<&'static [u8], u32>,
+> = ::once_cell::sync::Lazy::new(|| {
     #[repr(C)]
     struct Aligned<B: ?Sized> {
         _align: [u8; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
         bytes: *include_bytes!("sentence_break_fwd.littleendian.dfa"),
     };
 
-    unsafe {
-      ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
-    }
-  };
-}
+    unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/fsm/simple_word_fwd.bigendian.dfa b/src/unicode/fsm/simple_word_fwd.bigendian.dfa
index adc64c1..888e465 100644
--- a/src/unicode/fsm/simple_word_fwd.bigendian.dfa
+++ b/src/unicode/fsm/simple_word_fwd.bigendian.dfa
diff --git a/src/unicode/fsm/simple_word_fwd.littleendian.dfa b/src/unicode/fsm/simple_word_fwd.littleendian.dfa
index dd48386..a1d527c 100644
--- a/src/unicode/fsm/simple_word_fwd.littleendian.dfa
+++ b/src/unicode/fsm/simple_word_fwd.littleendian.dfa
diff --git a/src/unicode/fsm/simple_word_fwd.rs b/src/unicode/fsm/simple_word_fwd.rs
index f1f3da5..32b69b6 100644
--- a/src/unicode/fsm/simple_word_fwd.rs
+++ b/src/unicode/fsm/simple_word_fwd.rs
@@ -2,11 +2,12 @@
 //
 //   ucd-generate dfa --name SIMPLE_WORD_FWD --sparse --minimize --state-size 2 src/unicode/fsm/ \w
 //
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
 
 #[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
-  pub static ref SIMPLE_WORD_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = {
+pub static SIMPLE_WORD_FWD: ::once_cell::sync::Lazy<
+    ::regex_automata::SparseDFA<&'static [u8], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
     #[repr(C)]
     struct Aligned<B: ?Sized> {
         _align: [u8; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
         bytes: *include_bytes!("simple_word_fwd.bigendian.dfa"),
     };
 
-    unsafe {
-      ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
-    }
-  };
-}
+    unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
 
 #[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
-  pub static ref SIMPLE_WORD_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = {
+pub static SIMPLE_WORD_FWD: ::once_cell::sync::Lazy<
+    ::regex_automata::SparseDFA<&'static [u8], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
     #[repr(C)]
     struct Aligned<B: ?Sized> {
         _align: [u8; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
         bytes: *include_bytes!("simple_word_fwd.littleendian.dfa"),
     };
 
-    unsafe {
-      ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
-    }
-  };
-}
+    unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/fsm/whitespace_anchored_fwd.rs b/src/unicode/fsm/whitespace_anchored_fwd.rs
index 419b5d4..0780412 100644
--- a/src/unicode/fsm/whitespace_anchored_fwd.rs
+++ b/src/unicode/fsm/whitespace_anchored_fwd.rs
@@ -2,11 +2,12 @@
 //
 //   ucd-generate dfa --name WHITESPACE_ANCHORED_FWD --anchored --classes --premultiply --minimize --state-size 1 src/unicode/fsm/ \s+
 //
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
 
 #[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
-  pub static ref WHITESPACE_ANCHORED_FWD: ::regex_automata::DenseDFA<&'static [u8], u8> = {
+pub static WHITESPACE_ANCHORED_FWD: ::once_cell::sync::Lazy<
+    ::regex_automata::DenseDFA<&'static [u8], u8>,
+> = ::once_cell::sync::Lazy::new(|| {
     #[repr(C)]
     struct Aligned<B: ?Sized> {
         _align: [u8; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
         bytes: *include_bytes!("whitespace_anchored_fwd.bigendian.dfa"),
     };
 
-    unsafe {
-      ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes)
-    }
-  };
-}
+    unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) }
+});
 
 #[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
-  pub static ref WHITESPACE_ANCHORED_FWD: ::regex_automata::DenseDFA<&'static [u8], u8> = {
+pub static WHITESPACE_ANCHORED_FWD: ::once_cell::sync::Lazy<
+    ::regex_automata::DenseDFA<&'static [u8], u8>,
+> = ::once_cell::sync::Lazy::new(|| {
     #[repr(C)]
     struct Aligned<B: ?Sized> {
         _align: [u8; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
         bytes: *include_bytes!("whitespace_anchored_fwd.littleendian.dfa"),
     };
 
-    unsafe {
-      ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes)
-    }
-  };
-}
+    unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/fsm/whitespace_anchored_rev.rs b/src/unicode/fsm/whitespace_anchored_rev.rs
index 301b03c..3d0d7a6 100644
--- a/src/unicode/fsm/whitespace_anchored_rev.rs
+++ b/src/unicode/fsm/whitespace_anchored_rev.rs
@@ -2,11 +2,12 @@
 //
 //   ucd-generate dfa --name WHITESPACE_ANCHORED_REV --reverse --anchored --classes --premultiply --minimize --state-size 2 src/unicode/fsm/ \s+
 //
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
 
 #[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
-  pub static ref WHITESPACE_ANCHORED_REV: ::regex_automata::DenseDFA<&'static [u16], u16> = {
+pub static WHITESPACE_ANCHORED_REV: ::once_cell::sync::Lazy<
+    ::regex_automata::DenseDFA<&'static [u16], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
     #[repr(C)]
     struct Aligned<B: ?Sized> {
         _align: [u16; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
         bytes: *include_bytes!("whitespace_anchored_rev.bigendian.dfa"),
     };
 
-    unsafe {
-      ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes)
-    }
-  };
-}
+    unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) }
+});
 
 #[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
-  pub static ref WHITESPACE_ANCHORED_REV: ::regex_automata::DenseDFA<&'static [u16], u16> = {
+pub static WHITESPACE_ANCHORED_REV: ::once_cell::sync::Lazy<
+    ::regex_automata::DenseDFA<&'static [u16], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
     #[repr(C)]
     struct Aligned<B: ?Sized> {
         _align: [u16; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
         bytes: *include_bytes!("whitespace_anchored_rev.littleendian.dfa"),
     };
 
-    unsafe {
-      ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes)
-    }
-  };
-}
+    unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/fsm/word_break_fwd.bigendian.dfa b/src/unicode/fsm/word_break_fwd.bigendian.dfa
index 1e75db6..efb9c81 100644
--- a/src/unicode/fsm/word_break_fwd.bigendian.dfa
+++ b/src/unicode/fsm/word_break_fwd.bigendian.dfa
diff --git a/src/unicode/fsm/word_break_fwd.littleendian.dfa b/src/unicode/fsm/word_break_fwd.littleendian.dfa
index e3093a3..9a716d0 100644
--- a/src/unicode/fsm/word_break_fwd.littleendian.dfa
+++ b/src/unicode/fsm/word_break_fwd.littleendian.dfa
diff --git a/src/unicode/fsm/word_break_fwd.rs b/src/unicode/fsm/word_break_fwd.rs
index fb041b7..dcb5f6b 100644
--- a/src/unicode/fsm/word_break_fwd.rs
+++ b/src/unicode/fsm/word_break_fwd.rs
@@ -2,11 +2,12 @@
 //
 //   ucd-generate dfa --name WORD_BREAK_FWD --sparse --minimize --anchored --state-size 4 src/unicode/fsm/ [snip (arg too long)]
 //
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
 
 #[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
-  pub static ref WORD_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = {
+pub static WORD_BREAK_FWD: ::once_cell::sync::Lazy<
+    ::regex_automata::SparseDFA<&'static [u8], u32>,
+> = ::once_cell::sync::Lazy::new(|| {
     #[repr(C)]
     struct Aligned<B: ?Sized> {
         _align: [u8; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
         bytes: *include_bytes!("word_break_fwd.bigendian.dfa"),
     };
 
-    unsafe {
-      ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
-    }
-  };
-}
+    unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
 
 #[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
-  pub static ref WORD_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = {
+pub static WORD_BREAK_FWD: ::once_cell::sync::Lazy<
+    ::regex_automata::SparseDFA<&'static [u8], u32>,
+> = ::once_cell::sync::Lazy::new(|| {
     #[repr(C)]
     struct Aligned<B: ?Sized> {
         _align: [u8; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
         bytes: *include_bytes!("word_break_fwd.littleendian.dfa"),
     };
 
-    unsafe {
-      ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
-    }
-  };
-}
+    unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/grapheme.rs b/src/unicode/grapheme.rs
index ad31cf1..13b730c 100644
--- a/src/unicode/grapheme.rs
+++ b/src/unicode/grapheme.rs
@@ -1,10 +1,14 @@
 use regex_automata::DFA;
 
-use crate::ext_slice::ByteSlice;
-use crate::unicode::fsm::grapheme_break_fwd::GRAPHEME_BREAK_FWD;
-use crate::unicode::fsm::grapheme_break_rev::GRAPHEME_BREAK_REV;
-use crate::unicode::fsm::regional_indicator_rev::REGIONAL_INDICATOR_REV;
-use crate::utf8;
+use crate::{
+    ext_slice::ByteSlice,
+    unicode::fsm::{
+        grapheme_break_fwd::GRAPHEME_BREAK_FWD,
+        grapheme_break_rev::GRAPHEME_BREAK_REV,
+        regional_indicator_rev::REGIONAL_INDICATOR_REV,
+    },
+    utf8,
+};
 
 /// An iterator over grapheme clusters in a byte string.
 ///
@@ -125,7 +129,7 @@ pub struct GraphemeIndices<'a> {
 
 impl<'a> GraphemeIndices<'a> {
     pub(crate) fn new(bs: &'a [u8]) -> GraphemeIndices<'a> {
-        GraphemeIndices { bs: bs, forward_index: 0, reverse_index: bs.len() }
+        GraphemeIndices { bs, forward_index: 0, reverse_index: bs.len() }
     }
 
     /// View the underlying data as a subslice of the original data.
@@ -191,6 +195,22 @@ impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
 pub fn decode_grapheme(bs: &[u8]) -> (&str, usize) {
     if bs.is_empty() {
         ("", 0)
+    } else if bs.len() >= 2
+        && bs[0].is_ascii()
+        && bs[1].is_ascii()
+        && !bs[0].is_ascii_whitespace()
+    {
+        // FIXME: It is somewhat sad that we have to special case this, but it
+        // leads to a significant speed up in predominantly ASCII text. The
+        // issue here is that the DFA has a bit of overhead, and running it for
+        // every byte in mostly ASCII text results in a bit of a slowdown. We should
+        // re-litigate this once regex-automata 0.3 is out, but it might be
+        // hard to avoid the special case. A DFA is always going to at least
+        // require some memory access.
+
+        // Safe because all ASCII bytes are valid UTF-8.
+        let grapheme = unsafe { bs[..1].to_str_unchecked() };
+        (grapheme, 1)
     } else if let Some(end) = GRAPHEME_BREAK_FWD.find(bs) {
         // Safe because a match can only occur for valid UTF-8.
         let grapheme = unsafe { bs[..end].to_str_unchecked() };
@@ -257,15 +277,17 @@ fn adjust_rev_for_regional_indicator(mut bs: &[u8], i: usize) -> usize {
     }
 }
 
-#[cfg(test)]
+#[cfg(all(test, feature = "std"))]
 mod tests {
+    #[cfg(not(miri))]
     use ucd_parse::GraphemeClusterBreakTest;
 
+    use crate::{ext_slice::ByteSlice, tests::LOSSY_TESTS};
+
     use super::*;
-    use crate::ext_slice::ByteSlice;
-    use crate::tests::LOSSY_TESTS;
 
     #[test]
+    #[cfg(not(miri))]
     fn forward_ucd() {
         for (i, test) in ucdtests().into_iter().enumerate() {
             let given = test.grapheme_clusters.concat();
@@ -288,6 +310,7 @@ mod tests {
     }
 
     #[test]
+    #[cfg(not(miri))]
     fn reverse_ucd() {
         for (i, test) in ucdtests().into_iter().enumerate() {
             let given = test.grapheme_clusters.concat();
@@ -329,15 +352,18 @@ mod tests {
         }
     }
 
+    #[cfg(not(miri))]
     fn uniescape(s: &str) -> String {
         s.chars().flat_map(|c| c.escape_unicode()).collect::<String>()
     }
 
+    #[cfg(not(miri))]
     fn uniescape_vec(strs: &[String]) -> Vec<String> {
         strs.iter().map(|s| uniescape(s)).collect()
     }
 
     /// Return all of the UCD for grapheme breaks.
+    #[cfg(not(miri))]
     fn ucdtests() -> Vec<GraphemeClusterBreakTest> {
         const TESTDATA: &'static str =
             include_str!("data/GraphemeBreakTest.txt");
diff --git a/src/unicode/mod.rs b/src/unicode/mod.rs
index 60318f4..80638e8 100644
--- a/src/unicode/mod.rs
+++ b/src/unicode/mod.rs
@@ -1,8 +1,8 @@
-pub use self::grapheme::{decode_grapheme, GraphemeIndices, Graphemes};
-pub use self::sentence::{SentenceIndices, Sentences};
-pub use self::whitespace::{whitespace_len_fwd, whitespace_len_rev};
-pub use self::word::{
-    WordIndices, Words, WordsWithBreakIndices, WordsWithBreaks,
+pub use self::{
+    grapheme::{decode_grapheme, GraphemeIndices, Graphemes},
+    sentence::{SentenceIndices, Sentences},
+    whitespace::{whitespace_len_fwd, whitespace_len_rev},
+    word::{WordIndices, Words, WordsWithBreakIndices, WordsWithBreaks},
 };
 
 mod fsm;
diff --git a/src/unicode/sentence.rs b/src/unicode/sentence.rs
index 063f342..ff29c7e 100644
--- a/src/unicode/sentence.rs
+++ b/src/unicode/sentence.rs
@@ -1,8 +1,9 @@
 use regex_automata::DFA;
 
-use crate::ext_slice::ByteSlice;
-use crate::unicode::fsm::sentence_break_fwd::SENTENCE_BREAK_FWD;
-use crate::utf8;
+use crate::{
+    ext_slice::ByteSlice,
+    unicode::fsm::sentence_break_fwd::SENTENCE_BREAK_FWD, utf8,
+};
 
 /// An iterator over sentences in a byte string.
 ///
@@ -97,7 +98,7 @@ pub struct SentenceIndices<'a> {
 
 impl<'a> SentenceIndices<'a> {
     pub(crate) fn new(bs: &'a [u8]) -> SentenceIndices<'a> {
-        SentenceIndices { bs: bs, forward_index: 0 }
+        SentenceIndices { bs, forward_index: 0 }
     }
 
     /// View the underlying data as a subslice of the original data.
@@ -156,13 +157,15 @@ fn decode_sentence(bs: &[u8]) -> (&str, usize) {
     }
 }
 
-#[cfg(test)]
+#[cfg(all(test, feature = "std"))]
 mod tests {
+    #[cfg(not(miri))]
     use ucd_parse::SentenceBreakTest;
 
     use crate::ext_slice::ByteSlice;
 
     #[test]
+    #[cfg(not(miri))]
     fn forward_ucd() {
         for (i, test) in ucdtests().into_iter().enumerate() {
             let given = test.sentences.concat();
@@ -198,11 +201,13 @@ mod tests {
         bytes.sentences().collect()
     }
 
+    #[cfg(not(miri))]
     fn strs_to_bstrs<S: AsRef<str>>(strs: &[S]) -> Vec<&[u8]> {
         strs.iter().map(|s| s.as_ref().as_bytes()).collect()
     }
 
     /// Return all of the UCD for sentence breaks.
+    #[cfg(not(miri))]
     fn ucdtests() -> Vec<SentenceBreakTest> {
         const TESTDATA: &'static str =
             include_str!("data/SentenceBreakTest.txt");
diff --git a/src/unicode/whitespace.rs b/src/unicode/whitespace.rs
index 949a83f..b5eff30 100644
--- a/src/unicode/whitespace.rs
+++ b/src/unicode/whitespace.rs
@@ -1,7 +1,9 @@
 use regex_automata::DFA;
 
-use crate::unicode::fsm::whitespace_anchored_fwd::WHITESPACE_ANCHORED_FWD;
-use crate::unicode::fsm::whitespace_anchored_rev::WHITESPACE_ANCHORED_REV;
+use crate::unicode::fsm::{
+    whitespace_anchored_fwd::WHITESPACE_ANCHORED_FWD,
+    whitespace_anchored_rev::WHITESPACE_ANCHORED_REV,
+};
 
 /// Return the first position of a non-whitespace character.
 pub fn whitespace_len_fwd(slice: &[u8]) -> usize {
diff --git a/src/unicode/word.rs b/src/unicode/word.rs
index e0a5701..849f0c8 100644
--- a/src/unicode/word.rs
+++ b/src/unicode/word.rs
@@ -1,9 +1,12 @@
 use regex_automata::DFA;
 
-use crate::ext_slice::ByteSlice;
-use crate::unicode::fsm::simple_word_fwd::SIMPLE_WORD_FWD;
-use crate::unicode::fsm::word_break_fwd::WORD_BREAK_FWD;
-use crate::utf8;
+use crate::{
+    ext_slice::ByteSlice,
+    unicode::fsm::{
+        simple_word_fwd::SIMPLE_WORD_FWD, word_break_fwd::WORD_BREAK_FWD,
+    },
+    utf8,
+};
 
 /// An iterator over words in a byte string.
 ///
@@ -254,7 +257,7 @@ pub struct WordsWithBreakIndices<'a> {
 
 impl<'a> WordsWithBreakIndices<'a> {
     pub(crate) fn new(bs: &'a [u8]) -> WordsWithBreakIndices<'a> {
-        WordsWithBreakIndices { bs: bs, forward_index: 0 }
+        WordsWithBreakIndices { bs, forward_index: 0 }
     }
 
     /// View the underlying data as a subslice of the original data.
@@ -316,13 +319,15 @@ fn decode_word(bs: &[u8]) -> (&str, usize) {
     }
 }
 
-#[cfg(test)]
+#[cfg(all(test, feature = "std"))]
 mod tests {
+    #[cfg(not(miri))]
     use ucd_parse::WordBreakTest;
 
     use crate::ext_slice::ByteSlice;
 
     #[test]
+    #[cfg(not(miri))]
     fn forward_ucd() {
         for (i, test) in ucdtests().into_iter().enumerate() {
             let given = test.words.concat();
@@ -379,17 +384,26 @@ mod tests {
         assert_eq!(vec!["1XY"], words(b"1XY"));
 
         assert_eq!(vec!["\u{FEFF}", "Ты"], words("\u{FEFF}Ты".as_bytes()));
+
+        // Tests that Vithkuqi works, which was introduced in Unicode 14.
+        // This test fails prior to Unicode 14.
+        assert_eq!(
+            vec!["\u{10570}\u{10597}"],
+            words("\u{10570}\u{10597}".as_bytes())
+        );
     }
 
     fn words(bytes: &[u8]) -> Vec<&str> {
         bytes.words_with_breaks().collect()
     }
 
+    #[cfg(not(miri))]
     fn strs_to_bstrs<S: AsRef<str>>(strs: &[S]) -> Vec<&[u8]> {
         strs.iter().map(|s| s.as_ref().as_bytes()).collect()
     }
 
     /// Return all of the UCD for word breaks.
+    #[cfg(not(miri))]
     fn ucdtests() -> Vec<WordBreakTest> {
         const TESTDATA: &'static str = include_str!("data/WordBreakTest.txt");
author	Jeff Vander Stoep <jeffv@google.com>	2022-12-12 09:08:11 +0100
committer	Jeff Vander Stoep <jeffv@google.com>	2022-12-12 09:08:51 +0100
commit	e3d458e4045070c172111aef0214af1819ee7403 (patch)
tree	065f4b90fc4351f93487c5554f9cacd93fe29cbe /src/unicode
parent	1faff9be927c85d1dfb151bc7975d02f697854df (diff)
download	bstr-main-16k-with-phones.tar.gz