diff options
author | Android Build Coastguard Worker <android-build-coastguard-worker@google.com> | 2022-02-16 01:13:04 +0000 |
---|---|---|
committer | Android Build Coastguard Worker <android-build-coastguard-worker@google.com> | 2022-02-16 01:13:04 +0000 |
commit | ec9c482f6dca777e0955c304457a80ef915e564f (patch) | |
tree | 47be3ea825e1446e093c3b760db7d40a4e4e55d6 | |
parent | 0ceec9b4cae6be5020bb678f0172ee13a83c1509 (diff) | |
parent | 6453fc5ed39eb1c8fc72e9d291eb8c7084a05ad7 (diff) | |
download | unicode-segmentation-ec9c482f6dca777e0955c304457a80ef915e564f.tar.gz |
Snap for 8188146 from 6453fc5ed39eb1c8fc72e9d291eb8c7084a05ad7 to tm-frc-ipsec-release
Change-Id: Ia03e3604c435998c774cedd79bdf00c9aef761b4
-rw-r--r-- | .cargo_vcs_info.json | 2 | ||||
-rw-r--r-- | .github/workflows/rust.yml | 22 | ||||
-rw-r--r-- | Android.bp | 8 | ||||
-rw-r--r-- | Cargo.toml | 15 | ||||
-rw-r--r-- | Cargo.toml.orig | 13 | ||||
-rw-r--r-- | METADATA | 8 | ||||
-rw-r--r-- | README.md | 2 | ||||
-rw-r--r-- | TEST_MAPPING | 17 | ||||
-rw-r--r-- | benches/graphemes.rs | 59 | ||||
-rw-r--r-- | benches/unicode_words.rs | 64 | ||||
-rw-r--r-- | benches/word_bounds.rs | 64 | ||||
-rw-r--r-- | cargo2android.json | 4 | ||||
-rw-r--r-- | scripts/unicode.py | 5 | ||||
-rw-r--r-- | src/grapheme.rs | 30 | ||||
-rw-r--r-- | src/lib.rs | 31 | ||||
-rw-r--r-- | src/sentence.rs | 8 | ||||
-rw-r--r-- | src/tables.rs | 8 | ||||
-rw-r--r-- | src/test.rs | 6 | ||||
-rw-r--r-- | src/word.rs | 65 |
19 files changed, 88 insertions, 343 deletions
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json index e74a153..22f2cec 100644 --- a/.cargo_vcs_info.json +++ b/.cargo_vcs_info.json @@ -1,5 +1,5 @@ { "git": { - "sha1": "907d4d0b7e5c6f5e0f815c90a51d28b793d0c7a4" + "sha1": "3b75ee19b3c0ddacaeec03be688a7b8766833728" } } diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml deleted file mode 100644 index 3c13d1b..0000000 --- a/.github/workflows/rust.yml +++ /dev/null @@ -1,22 +0,0 @@ -name: Rust - -on: - push: - branches: [ master ] - pull_request: - branches: [ master ] - -env: - CARGO_TERM_COLOR: always - -jobs: - build: - - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v2 - - name: Build - run: cargo build --verbose - - name: Run tests - run: cargo test --verbose @@ -1,5 +1,4 @@ -// This file is generated by cargo2android.py --config cargo2android.json. -// Do not modify this file as changes will be overridden on upgrade. +// This file is generated by cargo2android.py --run --device --dependencies. package { default_applicable_licenses: [ @@ -42,10 +41,9 @@ license { rust_library { name: "libunicode_segmentation", + // has rustc warnings host_supported: true, crate_name: "unicode_segmentation", - cargo_env_compat: true, - cargo_pkg_version: "1.8.0", srcs: ["src/lib.rs"], - edition: "2018", + edition: "2015", } @@ -11,9 +11,8 @@ # will likely look very different (and much more reasonable) [package] -edition = "2018" name = "unicode-segmentation" -version = "1.8.0" +version = "1.7.1" authors = ["kwantam <kwantam@gmail.com>", "Manish Goregaokar <manishsmail@gmail.com>"] exclude = ["target/*", "Cargo.lock", "scripts/tmp", "benches/texts/*", "*.txt"] description = "This crate provides Grapheme Cluster, Word and Sentence boundaries\naccording to Unicode Standard Annex #29 rules.\n" @@ -27,16 +26,8 @@ repository = "https://github.com/unicode-rs/unicode-segmentation" [[bench]] name = "graphemes" harness = false - -[[bench]] -name = "unicode_words" -harness = false - -[[bench]] -name = "word_bounds" -harness = false -[dev-dependencies.criterion] -version = "0.3" +[dev-dependencies.bencher] +version = "0.1" [dev-dependencies.quickcheck] version = "0.7" diff --git a/Cargo.toml.orig b/Cargo.toml.orig index c1c16e6..3f55167 100644 --- a/Cargo.toml.orig +++ b/Cargo.toml.orig @@ -1,10 +1,9 @@ [package] name = "unicode-segmentation" -version = "1.8.0" +version = "1.7.1" authors = ["kwantam <kwantam@gmail.com>", "Manish Goregaokar <manishsmail@gmail.com>"] -edition = "2018" homepage = "https://github.com/unicode-rs/unicode-segmentation" repository = "https://github.com/unicode-rs/unicode-segmentation" documentation = "https://unicode-rs.github.io/unicode-segmentation" @@ -24,16 +23,8 @@ no_std = [] # This is a no-op, preserved for backward compatibility only. [dev-dependencies] quickcheck = "0.7" -criterion = "0.3" +bencher = "0.1" [[bench]] name = "graphemes" -harness = false - -[[bench]] -name = "unicode_words" -harness = false - -[[bench]] -name = "word_bounds" harness = false
\ No newline at end of file @@ -7,13 +7,13 @@ third_party { } url { type: ARCHIVE - value: "https://static.crates.io/crates/unicode-segmentation/unicode-segmentation-1.8.0.crate" + value: "https://static.crates.io/crates/unicode-segmentation/unicode-segmentation-1.7.1.crate" } - version: "1.8.0" + version: "1.7.1" license_type: NOTICE last_upgrade_date { year: 2021 - month: 8 - day: 9 + month: 1 + day: 12 } } @@ -38,7 +38,7 @@ to your `Cargo.toml`: ```toml [dependencies] -unicode-segmentation = "1.8.0" +unicode-segmentation = "1.7.1" ``` # Change Log diff --git a/TEST_MAPPING b/TEST_MAPPING index c934591..60b40b7 100644 --- a/TEST_MAPPING +++ b/TEST_MAPPING @@ -1,21 +1,8 @@ -// Generated by update_crate_tests.py for tests that depend on this crate. +// Generated by cargo2android.py for tests in Android.bp { - "imports": [ - { - "path": "external/rust/crates/base64" - }, - { - "path": "external/rust/crates/heck" - } - ], "presubmit": [ { - "name": "authfs_device_test_src_lib" - } - ], - "presubmit-rust": [ - { - "name": "authfs_device_test_src_lib" + "name": "heck_device_test_src_lib" } ] } diff --git a/benches/graphemes.rs b/benches/graphemes.rs index 8a7a379..5f14352 100644 --- a/benches/graphemes.rs +++ b/benches/graphemes.rs @@ -1,54 +1,55 @@ -use criterion::{black_box, criterion_group, criterion_main, Criterion}; -use unicode_segmentation; +#[macro_use] +extern crate bencher; +extern crate unicode_segmentation; -use std::fs; +use bencher::Bencher; use unicode_segmentation::UnicodeSegmentation; +use std::fs; -fn graphemes(c: &mut Criterion, lang: &str, path: &str) { +fn graphemes(bench: &mut Bencher, path: &str) { let text = fs::read_to_string(path).unwrap(); - - c.bench_function(&format!("graphemes_{}",lang), |bench| { - bench.iter(|| { - for g in UnicodeSegmentation::graphemes(black_box(&*text), true) { - black_box(g); - } - }) + bench.iter(|| { + for g in UnicodeSegmentation::graphemes(&*text, true) { + bencher::black_box(g); + } }); + + bench.bytes = text.len() as u64; } -fn graphemes_arabic(c: &mut Criterion) { - graphemes(c, "arabic" ,"benches/texts/arabic.txt"); +fn graphemes_arabic(bench: &mut Bencher) { + graphemes(bench, "benches/texts/arabic.txt"); } -fn graphemes_english(c: &mut Criterion) { - graphemes(c, "english" ,"benches/texts/english.txt"); +fn graphemes_english(bench: &mut Bencher) { + graphemes(bench, "benches/texts/english.txt"); } -fn graphemes_hindi(c: &mut Criterion) { - graphemes(c, "hindi" ,"benches/texts/hindi.txt"); +fn graphemes_hindi(bench: &mut Bencher) { + graphemes(bench, "benches/texts/hindi.txt"); } -fn graphemes_japanese(c: &mut Criterion) { - graphemes(c, "japanese" ,"benches/texts/japanese.txt"); +fn graphemes_japanese(bench: &mut Bencher) { + graphemes(bench, "benches/texts/japanese.txt"); } -fn graphemes_korean(c: &mut Criterion) { - graphemes(c, "korean" ,"benches/texts/korean.txt"); +fn graphemes_korean(bench: &mut Bencher) { + graphemes(bench, "benches/texts/korean.txt"); } -fn graphemes_mandarin(c: &mut Criterion) { - graphemes(c, "mandarin" ,"benches/texts/mandarin.txt"); +fn graphemes_mandarin(bench: &mut Bencher) { + graphemes(bench, "benches/texts/mandarin.txt"); } -fn graphemes_russian(c: &mut Criterion) { - graphemes(c, "russian" ,"benches/texts/russian.txt"); +fn graphemes_russian(bench: &mut Bencher) { + graphemes(bench, "benches/texts/russian.txt"); } -fn graphemes_source_code(c: &mut Criterion) { - graphemes(c, "source_code","benches/texts/source_code.txt"); +fn graphemes_source_code(bench: &mut Bencher) { + graphemes(bench, "benches/texts/source_code.txt"); } -criterion_group!( +benchmark_group!( benches, graphemes_arabic, graphemes_english, @@ -60,4 +61,4 @@ criterion_group!( graphemes_source_code, ); -criterion_main!(benches); +benchmark_main!(benches); diff --git a/benches/unicode_words.rs b/benches/unicode_words.rs deleted file mode 100644 index 731e325..0000000 --- a/benches/unicode_words.rs +++ /dev/null @@ -1,64 +0,0 @@ -#[macro_use] -extern crate bencher; -extern crate unicode_segmentation; - -use bencher::Bencher; -use unicode_segmentation::UnicodeSegmentation; -use std::fs; - -fn unicode_words(bench: &mut Bencher, path: &str) { - let text = fs::read_to_string(path).unwrap(); - bench.iter(|| { - for w in text.unicode_words() { - bencher::black_box(w); - } - }); - - bench.bytes = text.len() as u64; -} - -fn unicode_words_arabic(bench: &mut Bencher) { - unicode_words(bench, "benches/texts/arabic.txt"); -} - -fn unicode_words_english(bench: &mut Bencher) { - unicode_words(bench, "benches/texts/english.txt"); -} - -fn unicode_words_hindi(bench: &mut Bencher) { - unicode_words(bench, "benches/texts/hindi.txt"); -} - -fn unicode_words_japanese(bench: &mut Bencher) { - unicode_words(bench, "benches/texts/japanese.txt"); -} - -fn unicode_words_korean(bench: &mut Bencher) { - unicode_words(bench, "benches/texts/korean.txt"); -} - -fn unicode_words_mandarin(bench: &mut Bencher) { - unicode_words(bench, "benches/texts/mandarin.txt"); -} - -fn unicode_words_russian(bench: &mut Bencher) { - unicode_words(bench, "benches/texts/russian.txt"); -} - -fn unicode_words_source_code(bench: &mut Bencher) { - unicode_words(bench, "benches/texts/source_code.txt"); -} - -benchmark_group!( - benches, - unicode_words_arabic, - unicode_words_english, - unicode_words_hindi, - unicode_words_japanese, - unicode_words_korean, - unicode_words_mandarin, - unicode_words_russian, - unicode_words_source_code, -); - -benchmark_main!(benches); diff --git a/benches/word_bounds.rs b/benches/word_bounds.rs deleted file mode 100644 index 035f57e..0000000 --- a/benches/word_bounds.rs +++ /dev/null @@ -1,64 +0,0 @@ -#[macro_use] -extern crate bencher; -extern crate unicode_segmentation; - -use bencher::Bencher; -use unicode_segmentation::UnicodeSegmentation; -use std::fs; - -fn word_bounds(bench: &mut Bencher, path: &str) { - let text = fs::read_to_string(path).unwrap(); - bench.iter(|| { - for w in text.split_word_bounds() { - bencher::black_box(w); - } - }); - - bench.bytes = text.len() as u64; -} - -fn word_bounds_arabic(bench: &mut Bencher) { - word_bounds(bench, "benches/texts/arabic.txt"); -} - -fn word_bounds_english(bench: &mut Bencher) { - word_bounds(bench, "benches/texts/english.txt"); -} - -fn word_bounds_hindi(bench: &mut Bencher) { - word_bounds(bench, "benches/texts/hindi.txt"); -} - -fn word_bounds_japanese(bench: &mut Bencher) { - word_bounds(bench, "benches/texts/japanese.txt"); -} - -fn word_bounds_korean(bench: &mut Bencher) { - word_bounds(bench, "benches/texts/korean.txt"); -} - -fn word_bounds_mandarin(bench: &mut Bencher) { - word_bounds(bench, "benches/texts/mandarin.txt"); -} - -fn word_bounds_russian(bench: &mut Bencher) { - word_bounds(bench, "benches/texts/russian.txt"); -} - -fn word_bounds_source_code(bench: &mut Bencher) { - word_bounds(bench, "benches/texts/source_code.txt"); -} - -benchmark_group!( - benches, - word_bounds_arabic, - word_bounds_english, - word_bounds_hindi, - word_bounds_japanese, - word_bounds_korean, - word_bounds_mandarin, - word_bounds_russian, - word_bounds_source_code, -); - -benchmark_main!(benches); diff --git a/cargo2android.json b/cargo2android.json deleted file mode 100644 index bf78496..0000000 --- a/cargo2android.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "device": true, - "run": true -}
\ No newline at end of file diff --git a/scripts/unicode.py b/scripts/unicode.py index 16e321d..1841e35 100644 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -229,7 +229,7 @@ pub mod util { #[inline] fn is_alphabetic(c: char) -> bool { match c { - 'a' ..= 'z' | 'A' ..= 'Z' => true, + 'a' ... 'z' | 'A' ... 'Z' => true, c if c > '\x7f' => super::derived_property::Alphabetic(c), _ => false, } @@ -238,7 +238,7 @@ pub mod util { #[inline] fn is_numeric(c: char) -> bool { match c { - '0' ..= '9' => true, + '0' ... '9' => true, c if c > '\x7f' => super::general_category::N(c), _ => false, } @@ -281,6 +281,7 @@ def emit_break_module(f, break_table, break_cats, name): f.write(""" } fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> (u32, u32, %sCat) { + use core; use core::cmp::Ordering::{Equal, Less, Greater}; match r.binary_search_by(|&(lo, hi, _)| { if lo <= c && c <= hi { Equal } diff --git a/src/grapheme.rs b/src/grapheme.rs index 190b86e..e95d478 100644 --- a/src/grapheme.rs +++ b/src/grapheme.rs @@ -10,7 +10,7 @@ use core::cmp; -use crate::tables::grapheme::GraphemeCat; +use tables::grapheme::GraphemeCat; /// External iterator for grapheme clusters and byte offsets. /// @@ -73,7 +73,7 @@ impl<'a> DoubleEndedIterator for GraphemeIndices<'a> { /// /// [`graphemes`]: trait.UnicodeSegmentation.html#tymethod.graphemes /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html -#[derive(Clone, Debug)] +#[derive(Clone)] pub struct Graphemes<'a> { string: &'a str, cursor: GraphemeCursor, @@ -148,7 +148,7 @@ pub fn new_grapheme_indices<'b>(s: &'b str, is_extended: bool) -> GraphemeIndice // maybe unify with PairResult? // An enum describing information about a potential boundary. -#[derive(PartialEq, Eq, Clone, Debug)] +#[derive(PartialEq, Eq, Clone)] enum GraphemeState { // No information is known. Unknown, @@ -165,7 +165,7 @@ enum GraphemeState { } /// Cursor-based segmenter for grapheme clusters. -#[derive(Clone, Debug)] +#[derive(Clone)] pub struct GraphemeCursor { // Current cursor position. offset: usize, @@ -228,9 +228,8 @@ enum PairResult { Emoji, // a break if preceded by emoji base and (Extend)* } -#[inline] fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult { - use crate::tables::grapheme::GraphemeCat::*; + use tables::grapheme::GraphemeCat::*; use self::PairResult::*; match (before, after) { (GC_CR, GC_LF) => NotBreak, // GB3 @@ -296,8 +295,8 @@ impl GraphemeCursor { } fn grapheme_category(&mut self, ch: char) -> GraphemeCat { - use crate::tables::grapheme as gr; - use crate::tables::grapheme::GraphemeCat::*; + use tables::grapheme as gr; + use tables::grapheme::GraphemeCat::*; if ch <= '\u{7e}' { // Special-case optimization for ascii, except U+007F. This @@ -388,7 +387,7 @@ impl GraphemeCursor { /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Ok(true)); /// ``` pub fn provide_context(&mut self, chunk: &str, chunk_start: usize) { - use crate::tables::grapheme as gr; + use tables::grapheme as gr; assert!(chunk_start + chunk.len() == self.pre_context_offset.unwrap()); self.pre_context_offset = None; if self.is_extended && chunk_start + chunk.len() == self.offset { @@ -408,7 +407,6 @@ impl GraphemeCursor { } } - #[inline] fn decide(&mut self, is_break: bool) { self.state = if is_break { GraphemeState::Break @@ -417,13 +415,11 @@ impl GraphemeCursor { }; } - #[inline] fn decision(&mut self, is_break: bool) -> Result<bool, GraphemeIncomplete> { self.decide(is_break); Ok(is_break) } - #[inline] fn is_boundary_result(&self) -> Result<bool, GraphemeIncomplete> { if self.state == GraphemeState::Break { Ok(true) @@ -436,9 +432,8 @@ impl GraphemeCursor { } } - #[inline] fn handle_regional(&mut self, chunk: &str, chunk_start: usize) { - use crate::tables::grapheme as gr; + use tables::grapheme as gr; let mut ris_count = self.ris_count.unwrap_or(0); for ch in chunk.chars().rev() { if self.grapheme_category(ch) != gr::GC_Regional_Indicator { @@ -457,9 +452,8 @@ impl GraphemeCursor { self.state = GraphemeState::Regional; } - #[inline] fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) { - use crate::tables::grapheme as gr; + use tables::grapheme as gr; let mut iter = chunk.chars().rev(); if let Some(ch) = iter.next() { if self.grapheme_category(ch) != gr::GC_ZWJ { @@ -488,7 +482,6 @@ impl GraphemeCursor { self.state = GraphemeState::Emoji; } - #[inline] /// Determine whether the current cursor location is a grapheme cluster boundary. /// Only a part of the string need be supplied. If `chunk_start` is nonzero or /// the length of `chunk` is not equal to `len` on creation, then this method @@ -513,7 +506,7 @@ impl GraphemeCursor { /// assert_eq!(cursor.is_boundary(flags, 0), Ok(false)); /// ``` pub fn is_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<bool, GraphemeIncomplete> { - use crate::tables::grapheme as gr; + use tables::grapheme as gr; if self.state == GraphemeState::Break { return Ok(true) } @@ -570,7 +563,6 @@ impl GraphemeCursor { } } - #[inline] /// Find the next boundary after the current cursor position. Only a part of /// the string need be supplied. If the chunk is incomplete, then this /// method might return `GraphemeIncomplete::PreContext` or @@ -66,7 +66,7 @@ extern crate quickcheck; pub use grapheme::{Graphemes, GraphemeIndices}; pub use grapheme::{GraphemeCursor, GraphemeIncomplete}; pub use tables::UNICODE_VERSION; -pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords, UnicodeWordIndices}; +pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords}; pub use sentence::{USentenceBounds, USentenceBoundIndices, UnicodeSentences}; mod grapheme; @@ -146,30 +146,6 @@ pub trait UnicodeSegmentation { /// ``` fn unicode_words<'a>(&'a self) -> UnicodeWords<'a>; - /// Returns an iterator over the words of `self`, separated on - /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), and their - /// offsets. - /// - /// Here, "words" are just those substrings which, after splitting on - /// UAX#29 word boundaries, contain any alphanumeric characters. That is, the - /// substring must contain at least one character with the - /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) - /// property, or with - /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). - /// - /// # Example - /// - /// ``` - /// # use self::unicode_segmentation::UnicodeSegmentation; - /// let uwis = "The quick (\"brown\") fox can't jump 32.3 feet, right?"; - /// let uwi1 = uwis.unicode_word_indices().collect::<Vec<(usize, &str)>>(); - /// let b: &[_] = &[(0, "The"), (4, "quick"), (12, "brown"), (20, "fox"), (24, "can't"), - /// (30, "jump"), (35, "32.3"), (40, "feet"), (46, "right")]; - /// - /// assert_eq!(&uwi1[..], b); - /// ``` - fn unicode_word_indices<'a>(&'a self) -> UnicodeWordIndices<'a>; - /// Returns an iterator over substrings of `self` separated on /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries). /// @@ -274,11 +250,6 @@ impl UnicodeSegmentation for str { } #[inline] - fn unicode_word_indices(&self) -> UnicodeWordIndices { - word::new_unicode_word_indices(self) - } - - #[inline] fn split_word_bounds(&self) -> UWordBounds { word::new_word_bounds(self) } diff --git a/src/sentence.rs b/src/sentence.rs index 0a23abd..275da52 100644 --- a/src/sentence.rs +++ b/src/sentence.rs @@ -13,7 +13,7 @@ use core::iter::Filter; // All of the logic for forward iteration over sentences mod fwd { - use crate::tables::sentence::SentenceCat; + use tables::sentence::SentenceCat; use core::cmp; // Describe a parsed part of source string as described in this table: @@ -111,7 +111,7 @@ mod fwd { if parts[idx] == StatePart::ClosePlus { idx -= 1 } if parts[idx] == StatePart::ATerm { - use crate::tables::sentence as se; + use tables::sentence as se; for next_char in ahead.chars() { //( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower @@ -176,7 +176,7 @@ mod fwd { #[inline] fn next(&mut self) -> Option<usize> { - use crate::tables::sentence as se; + use tables::sentence as se; for next_char in self.string[self.pos..].chars() { let position_before = self.pos; @@ -331,7 +331,7 @@ pub fn new_sentence_bound_indices<'a>(source: &'a str) -> USentenceBoundIndices< #[inline] pub fn new_unicode_sentences<'b>(s: &'b str) -> UnicodeSentences<'b> { use super::UnicodeSegmentation; - use crate::tables::util::is_alphanumeric; + use tables::util::is_alphanumeric; fn has_alphanumeric(s: &&str) -> bool { s.chars().any(|c| is_alphanumeric(c)) } let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer diff --git a/src/tables.rs b/src/tables.rs index 6d09ea2..7062e36 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -30,7 +30,7 @@ pub mod util { #[inline] fn is_alphabetic(c: char) -> bool { match c { - 'a' ..= 'z' | 'A' ..= 'Z' => true, + 'a' ... 'z' | 'A' ... 'Z' => true, c if c > '' => super::derived_property::Alphabetic(c), _ => false, } @@ -39,7 +39,7 @@ pub mod util { #[inline] fn is_numeric(c: char) -> bool { match c { - '0' ..= '9' => true, + '0' ... '9' => true, c if c > '' => super::general_category::N(c), _ => false, } @@ -352,6 +352,7 @@ pub mod grapheme { } fn bsearch_range_value_table(c: char, r: &'static [(char, char, GraphemeCat)]) -> (u32, u32, GraphemeCat) { + use core; use core::cmp::Ordering::{Equal, Less, Greater}; match r.binary_search_by(|&(lo, hi, _)| { if lo <= c && c <= hi { Equal } @@ -1002,6 +1003,7 @@ pub mod word { } fn bsearch_range_value_table(c: char, r: &'static [(char, char, WordCat)]) -> (u32, u32, WordCat) { + use core; use core::cmp::Ordering::{Equal, Less, Greater}; match r.binary_search_by(|&(lo, hi, _)| { if lo <= c && c <= hi { Equal } @@ -1477,6 +1479,7 @@ pub mod emoji { } fn bsearch_range_value_table(c: char, r: &'static [(char, char, EmojiCat)]) -> (u32, u32, EmojiCat) { + use core; use core::cmp::Ordering::{Equal, Less, Greater}; match r.binary_search_by(|&(lo, hi, _)| { if lo <= c && c <= hi { Equal } @@ -1580,6 +1583,7 @@ pub mod sentence { } fn bsearch_range_value_table(c: char, r: &'static [(char, char, SentenceCat)]) -> (u32, u32, SentenceCat) { + use core; use core::cmp::Ordering::{Equal, Less, Greater}; match r.binary_search_by(|&(lo, hi, _)| { if lo <= c && c <= hi { Equal } diff --git a/src/test.rs b/src/test.rs index ae74c64..75b77c5 100644 --- a/src/test.rs +++ b/src/test.rs @@ -14,7 +14,7 @@ use std::prelude::v1::*; #[test] fn test_graphemes() { - use crate::testdata::{TEST_SAME, TEST_DIFF}; + use testdata::{TEST_SAME, TEST_DIFF}; pub const EXTRA_DIFF: &'static [(&'static str, &'static [&'static str], @@ -88,7 +88,7 @@ fn test_graphemes() { #[test] fn test_words() { - use crate::testdata::TEST_WORD; + use testdata::TEST_WORD; // Unicode's official tests don't really test longer chains of flag emoji // TODO This could be improved with more tests like flag emoji with interspersed Extend chars and ZWJ @@ -144,7 +144,7 @@ fn test_words() { #[test] fn test_sentences() { - use crate::testdata::TEST_SENTENCE; + use testdata::TEST_SENTENCE; for &(s, w) in TEST_SENTENCE.iter() { macro_rules! assert_ { diff --git a/src/word.rs b/src/word.rs index 5cfde0d..179d122 100644 --- a/src/word.rs +++ b/src/word.rs @@ -11,7 +11,7 @@ use core::cmp; use core::iter::Filter; -use crate::tables::word::WordCat; +use tables::word::WordCat; /// An iterator over the substrings of a string which, after splitting the string on /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), @@ -40,34 +40,6 @@ impl<'a> DoubleEndedIterator for UnicodeWords<'a> { fn next_back(&mut self) -> Option<&'a str> { self.inner.next_back() } } -/// An iterator over the substrings of a string which, after splitting the string on -/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), -/// contain any characters with the -/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) -/// property, or with -/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). -/// This iterator also provides the byte offsets for each substring. -/// -/// This struct is created by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See -/// its documentation for more. -/// -/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices -/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html -pub struct UnicodeWordIndices<'a> { - inner: Filter<UWordBoundIndices<'a>, fn(&(usize, &str)) -> bool>, -} - -impl<'a> Iterator for UnicodeWordIndices<'a> { - type Item = (usize, &'a str); - - #[inline] - fn next(&mut self) -> Option<(usize, &'a str)> { self.inner.next() } -} -impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> { - #[inline] - fn next_back(&mut self) -> Option<(usize, &'a str)> { self.inner.next_back() } -} - /// External iterator for a string's /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries). /// @@ -170,7 +142,7 @@ enum RegionalState { } fn is_emoji(ch: char) -> bool { - use crate::tables::emoji; + use tables::emoji; emoji::emoji_category(ch).2 == emoji::EmojiCat::EC_Extended_Pictographic } @@ -187,7 +159,7 @@ impl<'a> Iterator for UWordBounds<'a> { fn next(&mut self) -> Option<&'a str> { use self::UWordBoundsState::*; use self::FormatExtendType::*; - use crate::tables::word as wd; + use tables::word as wd; if self.string.len() == 0 { return None; } @@ -200,13 +172,14 @@ impl<'a> Iterator for UWordBounds<'a> { let mut cat = wd::WC_Any; let mut savecat = wd::WC_Any; + // Whether or not the previous category was ZWJ + // ZWJs get collapsed, so this handles precedence of WB3c over WB4 + let mut prev_zwj; // If extend/format/zwj were skipped. Handles precedence of WB3d over WB4 let mut skipped_format_extend = false; for (curr, ch) in self.string.char_indices() { idx = curr; - // Whether or not the previous category was ZWJ - // ZWJs get collapsed, so this handles precedence of WB3c over WB4 - let prev_zwj = cat == wd::WC_ZWJ; + prev_zwj = cat == wd::WC_ZWJ; // if there's a category cached, grab it cat = match self.cat { None => wd::word_category(ch).2, @@ -413,7 +386,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> { fn next_back(&mut self) -> Option<&'a str> { use self::UWordBoundsState::*; use self::FormatExtendType::*; - use crate::tables::word as wd; + use tables::word as wd; if self.string.len() == 0 { return None; } @@ -665,7 +638,7 @@ impl<'a> UWordBounds<'a> { #[inline] fn get_next_cat(&self, idx: usize) -> Option<WordCat> { - use crate::tables::word as wd; + use tables::word as wd; let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8(); if nidx < self.string.len() { let nch = self.string[nidx..].chars().next().unwrap(); @@ -677,7 +650,7 @@ impl<'a> UWordBounds<'a> { #[inline] fn get_prev_cat(&self, idx: usize) -> Option<WordCat> { - use crate::tables::word as wd; + use tables::word as wd; if idx > 0 { let nch = self.string[..idx].chars().next_back().unwrap(); Some(wd::word_category(nch).2) @@ -698,22 +671,12 @@ pub fn new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b> { } #[inline] -fn has_alphanumeric(s: &&str) -> bool { - use crate::tables::util::is_alphanumeric; - - s.chars().any(|c| is_alphanumeric(c)) -} - -#[inline] pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> { use super::UnicodeSegmentation; + use tables::util::is_alphanumeric; - UnicodeWords { inner: s.split_word_bounds().filter(has_alphanumeric) } -} + fn has_alphanumeric(s: &&str) -> bool { s.chars().any(|c| is_alphanumeric(c)) } + let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer -#[inline] -pub fn new_unicode_word_indices<'b>(s: &'b str) -> UnicodeWordIndices<'b> { - use super::UnicodeSegmentation; - - UnicodeWordIndices { inner: s.split_word_bound_indices().filter(|(_, c)| has_alphanumeric(c)) } + UnicodeWords { inner: s.split_word_bounds().filter(has_alphanumeric) } } |