about | summary | refs | log | tree | commit | diff
path: root/src/unicode/word.rs
diff options
context:
space:
mode:
author: Jeff Vander Stoep <jeffv@google.com> 2022-12-12 09:08:11 +0100
committer: Jeff Vander Stoep <jeffv@google.com> 2022-12-12 09:08:51 +0100
commit: e3d458e4045070c172111aef0214af1819ee7403 (patch)
tree: 065f4b90fc4351f93487c5554f9cacd93fe29cbe /src/unicode/word.rs
parent: 1faff9be927c85d1dfb151bc7975d02f697854df (diff)
download: bstr-e3d458e4045070c172111aef0214af1819ee7403.tar.gz
Upgrade bstr to 1.0.1 [main-16k-with-phones]
This project was upgraded with external_updater.
Usage: tools/external_updater/updater.sh update rust/crates/bstr
For more info, check https://cs.android.com/android/platform/superproject/+/master:tools/external_updater/README.md

Test: TreeHugger
Change-Id: I97636c5410bfed620afaf1e05bd02b94670f17b0
Diffstat (limited to 'src/unicode/word.rs')
-rw-r--r-- src/unicode/word.rs 26
1 files changed, 20 insertions, 6 deletions
diff --git a/src/unicode/word.rs b/src/unicode/word.rs
index e0a5701..849f0c8 100644
--- a/src/unicode/word.rs
+++ b/src/unicode/word.rs
@@ -1,9 +1,12 @@
use regex_automata::DFA;
-use crate::ext_slice::ByteSlice;
-use crate::unicode::fsm::simple_word_fwd::SIMPLE_WORD_FWD;
-use crate::unicode::fsm::word_break_fwd::WORD_BREAK_FWD;
-use crate::utf8;
+use crate::{
+ ext_slice::ByteSlice,
+ unicode::fsm::{
+ simple_word_fwd::SIMPLE_WORD_FWD, word_break_fwd::WORD_BREAK_FWD,
+ },
+ utf8,
+};
/// An iterator over words in a byte string.
///
@@ -254,7 +257,7 @@ pub struct WordsWithBreakIndices<'a> {
impl<'a> WordsWithBreakIndices<'a> {
pub(crate) fn new(bs: &'a [u8]) -> WordsWithBreakIndices<'a> {
- WordsWithBreakIndices { bs: bs, forward_index: 0 }
+ WordsWithBreakIndices { bs, forward_index: 0 }
}
/// View the underlying data as a subslice of the original data.
@@ -316,13 +319,15 @@ fn decode_word(bs: &[u8]) -> (&str, usize) {
}
}
-#[cfg(test)]
+#[cfg(all(test, feature = "std"))]
mod tests {
+ #[cfg(not(miri))]
use ucd_parse::WordBreakTest;
use crate::ext_slice::ByteSlice;
#[test]
+ #[cfg(not(miri))]
fn forward_ucd() {
for (i, test) in ucdtests().into_iter().enumerate() {
let given = test.words.concat();
@@ -379,17 +384,26 @@ mod tests {
assert_eq!(vec!["1XY"], words(b"1XY"));
assert_eq!(vec!["\u{FEFF}", "Ты"], words("\u{FEFF}Ты".as_bytes()));
+
+ // Tests that Vithkuqi works, which was introduced in Unicode 14.
+ // This test fails prior to Unicode 14.
+ assert_eq!(
+ vec!["\u{10570}\u{10597}"],
+ words("\u{10570}\u{10597}".as_bytes())
+ );
}
fn words(bytes: &[u8]) -> Vec<&str> {
bytes.words_with_breaks().collect()
}
+ #[cfg(not(miri))]
fn strs_to_bstrs<S: AsRef<str>>(strs: &[S]) -> Vec<&[u8]> {
strs.iter().map(|s| s.as_ref().as_bytes()).collect()
}
/// Return all of the UCD for word breaks.
+ #[cfg(not(miri))]
fn ucdtests() -> Vec<WordBreakTest> {
const TESTDATA: &'static str = include_str!("data/WordBreakTest.txt");