aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Joel Galenson <jgalenson@google.com> 2021-08-10 17:22:07 +0000
committer: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com> 2021-08-10 17:22:07 +0000
commit: 72e8661e14e15eff76042b92405b94a8435035b1 (patch)
tree: 014fd32bf139cc7bb53787c6540380976a6a0802
parent: d983a935a60f6c1a8823ea7e9a773c1524e9f247 (diff)
parent: ab4c798bfc95938cc01c8ebaf4d7c6b3ece12ed3 (diff)
download: unicode-segmentation-72e8661e14e15eff76042b92405b94a8435035b1.tar.gz
Upgrade rust/crates/unicode-segmentation to 1.8.0 am: cdef32ccc4 am: 6cb7f46ab3 am: ab4c798bfc
Original change: https://android-review.googlesource.com/c/platform/external/rust/crates/unicode-segmentation/+/1791075
Change-Id: Id7b1e6bf54f3b65b6638402d5cc0dd10bfb4673c
-rw-r--r-- .cargo_vcs_info.json 2
-rw-r--r-- .github/workflows/rust.yml 22
-rw-r--r-- Android.bp 4
-rw-r--r-- Cargo.toml 15
-rw-r--r-- Cargo.toml.orig 13
-rw-r--r-- METADATA 8
-rw-r--r-- README.md 2
-rw-r--r-- benches/graphemes.rs 59
-rw-r--r-- benches/unicode_words.rs 64
-rw-r--r-- benches/word_bounds.rs 64
-rw-r--r-- scripts/unicode.py 5
-rw-r--r-- src/grapheme.rs 30
-rw-r--r-- src/lib.rs 31
-rw-r--r-- src/sentence.rs 8
-rw-r--r-- src/tables.rs 8
-rw-r--r-- src/test.rs 6
-rw-r--r-- src/word.rs 65
17 files changed, 321 insertions, 85 deletions
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json
index 22f2cec..e74a153 100644
--- a/.cargo_vcs_info.json
+++ b/.cargo_vcs_info.json
@@ -1,5 +1,5 @@
{
"git": {
- "sha1": "3b75ee19b3c0ddacaeec03be688a7b8766833728"
+ "sha1": "907d4d0b7e5c6f5e0f815c90a51d28b793d0c7a4"
}
}
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
new file mode 100644
index 0000000..3c13d1b
--- /dev/null
+++ b/.github/workflows/rust.yml
@@ -0,0 +1,22 @@
+name: Rust
+
+on:
+ push:
+ branches: [ master ]
+ pull_request:
+ branches: [ master ]
+
+env:
+ CARGO_TERM_COLOR: always
+
+jobs:
+ build:
+
+ runs-on: ubuntu-latest
+
+ steps:
+ - uses: actions/checkout@v2
+ - name: Build
+ run: cargo build --verbose
+ - name: Run tests
+ run: cargo test --verbose
diff --git a/Android.bp b/Android.bp
index da0cf0d..aaf1015 100644
--- a/Android.bp
+++ b/Android.bp
@@ -1,4 +1,5 @@
// This file is generated by cargo2android.py --run --device --dependencies.
+// Do not modify this file as changes will be overridden on upgrade.
package {
default_applicable_licenses: [
@@ -41,9 +42,8 @@ license {
rust_library {
name: "libunicode_segmentation",
- // has rustc warnings
host_supported: true,
crate_name: "unicode_segmentation",
srcs: ["src/lib.rs"],
- edition: "2015",
+ edition: "2018",
}
diff --git a/Cargo.toml b/Cargo.toml
index 0f21309..583df10 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -11,8 +11,9 @@
# will likely look very different (and much more reasonable)
[package]
+edition = "2018"
name = "unicode-segmentation"
-version = "1.7.1"
+version = "1.8.0"
authors = ["kwantam <kwantam@gmail.com>", "Manish Goregaokar <manishsmail@gmail.com>"]
exclude = ["target/*", "Cargo.lock", "scripts/tmp", "benches/texts/*", "*.txt"]
description = "This crate provides Grapheme Cluster, Word and Sentence boundaries\naccording to Unicode Standard Annex #29 rules.\n"
@@ -26,8 +27,16 @@ repository = "https://github.com/unicode-rs/unicode-segmentation"
[[bench]]
name = "graphemes"
harness = false
-[dev-dependencies.bencher]
-version = "0.1"
+
+[[bench]]
+name = "unicode_words"
+harness = false
+
+[[bench]]
+name = "word_bounds"
+harness = false
+[dev-dependencies.criterion]
+version = "0.3"
[dev-dependencies.quickcheck]
version = "0.7"
diff --git a/Cargo.toml.orig b/Cargo.toml.orig
index 3f55167..c1c16e6 100644
--- a/Cargo.toml.orig
+++ b/Cargo.toml.orig
@@ -1,9 +1,10 @@
[package]
name = "unicode-segmentation"
-version = "1.7.1"
+version = "1.8.0"
authors = ["kwantam <kwantam@gmail.com>", "Manish Goregaokar <manishsmail@gmail.com>"]
+edition = "2018"
homepage = "https://github.com/unicode-rs/unicode-segmentation"
repository = "https://github.com/unicode-rs/unicode-segmentation"
documentation = "https://unicode-rs.github.io/unicode-segmentation"
@@ -23,8 +24,16 @@ no_std = [] # This is a no-op, preserved for backward compatibility only.
[dev-dependencies]
quickcheck = "0.7"
-bencher = "0.1"
+criterion = "0.3"
[[bench]]
name = "graphemes"
+harness = false
+
+[[bench]]
+name = "unicode_words"
+harness = false
+
+[[bench]]
+name = "word_bounds"
harness = false
\ No newline at end of file
diff --git a/METADATA b/METADATA
index bf983b5..00df7c3 100644
--- a/METADATA
+++ b/METADATA
@@ -7,13 +7,13 @@ third_party {
}
url {
type: ARCHIVE
- value: "https://static.crates.io/crates/unicode-segmentation/unicode-segmentation-1.7.1.crate"
+ value: "https://static.crates.io/crates/unicode-segmentation/unicode-segmentation-1.8.0.crate"
}
- version: "1.7.1"
+ version: "1.8.0"
license_type: NOTICE
last_upgrade_date {
year: 2021
- month: 1
- day: 12
+ month: 8
+ day: 9
}
}
diff --git a/README.md b/README.md
index 2f3bdca..a65c0e2 100644
--- a/README.md
+++ b/README.md
@@ -38,7 +38,7 @@ to your `Cargo.toml`:
```toml
[dependencies]
-unicode-segmentation = "1.7.1"
+unicode-segmentation = "1.8.0"
```
# Change Log
diff --git a/benches/graphemes.rs b/benches/graphemes.rs
index 5f14352..8a7a379 100644
--- a/benches/graphemes.rs
+++ b/benches/graphemes.rs
@@ -1,55 +1,54 @@
-#[macro_use]
-extern crate bencher;
-extern crate unicode_segmentation;
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use unicode_segmentation;
-use bencher::Bencher;
-use unicode_segmentation::UnicodeSegmentation;
use std::fs;
+use unicode_segmentation::UnicodeSegmentation;
-fn graphemes(bench: &mut Bencher, path: &str) {
+fn graphemes(c: &mut Criterion, lang: &str, path: &str) {
let text = fs::read_to_string(path).unwrap();
- bench.iter(|| {
- for g in UnicodeSegmentation::graphemes(&*text, true) {
- bencher::black_box(g);
- }
- });
- bench.bytes = text.len() as u64;
+ c.bench_function(&format!("graphemes_{}",lang), |bench| {
+ bench.iter(|| {
+ for g in UnicodeSegmentation::graphemes(black_box(&*text), true) {
+ black_box(g);
+ }
+ })
+ });
}
-fn graphemes_arabic(bench: &mut Bencher) {
- graphemes(bench, "benches/texts/arabic.txt");
+fn graphemes_arabic(c: &mut Criterion) {
+ graphemes(c, "arabic" ,"benches/texts/arabic.txt");
}
-fn graphemes_english(bench: &mut Bencher) {
- graphemes(bench, "benches/texts/english.txt");
+fn graphemes_english(c: &mut Criterion) {
+ graphemes(c, "english" ,"benches/texts/english.txt");
}
-fn graphemes_hindi(bench: &mut Bencher) {
- graphemes(bench, "benches/texts/hindi.txt");
+fn graphemes_hindi(c: &mut Criterion) {
+ graphemes(c, "hindi" ,"benches/texts/hindi.txt");
}
-fn graphemes_japanese(bench: &mut Bencher) {
- graphemes(bench, "benches/texts/japanese.txt");
+fn graphemes_japanese(c: &mut Criterion) {
+ graphemes(c, "japanese" ,"benches/texts/japanese.txt");
}
-fn graphemes_korean(bench: &mut Bencher) {
- graphemes(bench, "benches/texts/korean.txt");
+fn graphemes_korean(c: &mut Criterion) {
+ graphemes(c, "korean" ,"benches/texts/korean.txt");
}
-fn graphemes_mandarin(bench: &mut Bencher) {
- graphemes(bench, "benches/texts/mandarin.txt");
+fn graphemes_mandarin(c: &mut Criterion) {
+ graphemes(c, "mandarin" ,"benches/texts/mandarin.txt");
}
-fn graphemes_russian(bench: &mut Bencher) {
- graphemes(bench, "benches/texts/russian.txt");
+fn graphemes_russian(c: &mut Criterion) {
+ graphemes(c, "russian" ,"benches/texts/russian.txt");
}
-fn graphemes_source_code(bench: &mut Bencher) {
- graphemes(bench, "benches/texts/source_code.txt");
+fn graphemes_source_code(c: &mut Criterion) {
+ graphemes(c, "source_code","benches/texts/source_code.txt");
}
-benchmark_group!(
+criterion_group!(
benches,
graphemes_arabic,
graphemes_english,
@@ -61,4 +60,4 @@ benchmark_group!(
graphemes_source_code,
);
-benchmark_main!(benches);
+criterion_main!(benches);
diff --git a/benches/unicode_words.rs b/benches/unicode_words.rs
new file mode 100644
index 0000000..731e325
--- /dev/null
+++ b/benches/unicode_words.rs
@@ -0,0 +1,64 @@
+#[macro_use]
+extern crate bencher;
+extern crate unicode_segmentation;
+
+use bencher::Bencher;
+use unicode_segmentation::UnicodeSegmentation;
+use std::fs;
+
+fn unicode_words(bench: &mut Bencher, path: &str) {
+ let text = fs::read_to_string(path).unwrap();
+ bench.iter(|| {
+ for w in text.unicode_words() {
+ bencher::black_box(w);
+ }
+ });
+
+ bench.bytes = text.len() as u64;
+}
+
+fn unicode_words_arabic(bench: &mut Bencher) {
+ unicode_words(bench, "benches/texts/arabic.txt");
+}
+
+fn unicode_words_english(bench: &mut Bencher) {
+ unicode_words(bench, "benches/texts/english.txt");
+}
+
+fn unicode_words_hindi(bench: &mut Bencher) {
+ unicode_words(bench, "benches/texts/hindi.txt");
+}
+
+fn unicode_words_japanese(bench: &mut Bencher) {
+ unicode_words(bench, "benches/texts/japanese.txt");
+}
+
+fn unicode_words_korean(bench: &mut Bencher) {
+ unicode_words(bench, "benches/texts/korean.txt");
+}
+
+fn unicode_words_mandarin(bench: &mut Bencher) {
+ unicode_words(bench, "benches/texts/mandarin.txt");
+}
+
+fn unicode_words_russian(bench: &mut Bencher) {
+ unicode_words(bench, "benches/texts/russian.txt");
+}
+
+fn unicode_words_source_code(bench: &mut Bencher) {
+ unicode_words(bench, "benches/texts/source_code.txt");
+}
+
+benchmark_group!(
+ benches,
+ unicode_words_arabic,
+ unicode_words_english,
+ unicode_words_hindi,
+ unicode_words_japanese,
+ unicode_words_korean,
+ unicode_words_mandarin,
+ unicode_words_russian,
+ unicode_words_source_code,
+);
+
+benchmark_main!(benches);
diff --git a/benches/word_bounds.rs b/benches/word_bounds.rs
new file mode 100644
index 0000000..035f57e
--- /dev/null
+++ b/benches/word_bounds.rs
@@ -0,0 +1,64 @@
+#[macro_use]
+extern crate bencher;
+extern crate unicode_segmentation;
+
+use bencher::Bencher;
+use unicode_segmentation::UnicodeSegmentation;
+use std::fs;
+
+fn word_bounds(bench: &mut Bencher, path: &str) {
+ let text = fs::read_to_string(path).unwrap();
+ bench.iter(|| {
+ for w in text.split_word_bounds() {
+ bencher::black_box(w);
+ }
+ });
+
+ bench.bytes = text.len() as u64;
+}
+
+fn word_bounds_arabic(bench: &mut Bencher) {
+ word_bounds(bench, "benches/texts/arabic.txt");
+}
+
+fn word_bounds_english(bench: &mut Bencher) {
+ word_bounds(bench, "benches/texts/english.txt");
+}
+
+fn word_bounds_hindi(bench: &mut Bencher) {
+ word_bounds(bench, "benches/texts/hindi.txt");
+}
+
+fn word_bounds_japanese(bench: &mut Bencher) {
+ word_bounds(bench, "benches/texts/japanese.txt");
+}
+
+fn word_bounds_korean(bench: &mut Bencher) {
+ word_bounds(bench, "benches/texts/korean.txt");
+}
+
+fn word_bounds_mandarin(bench: &mut Bencher) {
+ word_bounds(bench, "benches/texts/mandarin.txt");
+}
+
+fn word_bounds_russian(bench: &mut Bencher) {
+ word_bounds(bench, "benches/texts/russian.txt");
+}
+
+fn word_bounds_source_code(bench: &mut Bencher) {
+ word_bounds(bench, "benches/texts/source_code.txt");
+}
+
+benchmark_group!(
+ benches,
+ word_bounds_arabic,
+ word_bounds_english,
+ word_bounds_hindi,
+ word_bounds_japanese,
+ word_bounds_korean,
+ word_bounds_mandarin,
+ word_bounds_russian,
+ word_bounds_source_code,
+);
+
+benchmark_main!(benches);
diff --git a/scripts/unicode.py b/scripts/unicode.py
index 1841e35..16e321d 100644
--- a/scripts/unicode.py
+++ b/scripts/unicode.py
@@ -229,7 +229,7 @@ pub mod util {
#[inline]
fn is_alphabetic(c: char) -> bool {
match c {
- 'a' ... 'z' | 'A' ... 'Z' => true,
+ 'a' ..= 'z' | 'A' ..= 'Z' => true,
c if c > '\x7f' => super::derived_property::Alphabetic(c),
_ => false,
}
@@ -238,7 +238,7 @@ pub mod util {
#[inline]
fn is_numeric(c: char) -> bool {
match c {
- '0' ... '9' => true,
+ '0' ..= '9' => true,
c if c > '\x7f' => super::general_category::N(c),
_ => false,
}
@@ -281,7 +281,6 @@ def emit_break_module(f, break_table, break_cats, name):
f.write(""" }
fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> (u32, u32, %sCat) {
- use core;
use core::cmp::Ordering::{Equal, Less, Greater};
match r.binary_search_by(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
diff --git a/src/grapheme.rs b/src/grapheme.rs
index e95d478..190b86e 100644
--- a/src/grapheme.rs
+++ b/src/grapheme.rs
@@ -10,7 +10,7 @@
use core::cmp;
-use tables::grapheme::GraphemeCat;
+use crate::tables::grapheme::GraphemeCat;
/// External iterator for grapheme clusters and byte offsets.
///
@@ -73,7 +73,7 @@ impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
///
/// [`graphemes`]: trait.UnicodeSegmentation.html#tymethod.graphemes
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
-#[derive(Clone)]
+#[derive(Clone, Debug)]
pub struct Graphemes<'a> {
string: &'a str,
cursor: GraphemeCursor,
@@ -148,7 +148,7 @@ pub fn new_grapheme_indices<'b>(s: &'b str, is_extended: bool) -> GraphemeIndice
// maybe unify with PairResult?
// An enum describing information about a potential boundary.
-#[derive(PartialEq, Eq, Clone)]
+#[derive(PartialEq, Eq, Clone, Debug)]
enum GraphemeState {
// No information is known.
Unknown,
@@ -165,7 +165,7 @@ enum GraphemeState {
}
/// Cursor-based segmenter for grapheme clusters.
-#[derive(Clone)]
+#[derive(Clone, Debug)]
pub struct GraphemeCursor {
// Current cursor position.
offset: usize,
@@ -228,8 +228,9 @@ enum PairResult {
Emoji, // a break if preceded by emoji base and (Extend)*
}
+#[inline]
fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
- use tables::grapheme::GraphemeCat::*;
+ use crate::tables::grapheme::GraphemeCat::*;
use self::PairResult::*;
match (before, after) {
(GC_CR, GC_LF) => NotBreak, // GB3
@@ -295,8 +296,8 @@ impl GraphemeCursor {
}
fn grapheme_category(&mut self, ch: char) -> GraphemeCat {
- use tables::grapheme as gr;
- use tables::grapheme::GraphemeCat::*;
+ use crate::tables::grapheme as gr;
+ use crate::tables::grapheme::GraphemeCat::*;
if ch <= '\u{7e}' {
// Special-case optimization for ascii, except U+007F. This
@@ -387,7 +388,7 @@ impl GraphemeCursor {
/// assert_eq!(cursor.is_boundary(&flags[8..], 8), Ok(true));
/// ```
pub fn provide_context(&mut self, chunk: &str, chunk_start: usize) {
- use tables::grapheme as gr;
+ use crate::tables::grapheme as gr;
assert!(chunk_start + chunk.len() == self.pre_context_offset.unwrap());
self.pre_context_offset = None;
if self.is_extended && chunk_start + chunk.len() == self.offset {
@@ -407,6 +408,7 @@ impl GraphemeCursor {
}
}
+ #[inline]
fn decide(&mut self, is_break: bool) {
self.state = if is_break {
GraphemeState::Break
@@ -415,11 +417,13 @@ impl GraphemeCursor {
};
}
+ #[inline]
fn decision(&mut self, is_break: bool) -> Result<bool, GraphemeIncomplete> {
self.decide(is_break);
Ok(is_break)
}
+ #[inline]
fn is_boundary_result(&self) -> Result<bool, GraphemeIncomplete> {
if self.state == GraphemeState::Break {
Ok(true)
@@ -432,8 +436,9 @@ impl GraphemeCursor {
}
}
+ #[inline]
fn handle_regional(&mut self, chunk: &str, chunk_start: usize) {
- use tables::grapheme as gr;
+ use crate::tables::grapheme as gr;
let mut ris_count = self.ris_count.unwrap_or(0);
for ch in chunk.chars().rev() {
if self.grapheme_category(ch) != gr::GC_Regional_Indicator {
@@ -452,8 +457,9 @@ impl GraphemeCursor {
self.state = GraphemeState::Regional;
}
+ #[inline]
fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) {
- use tables::grapheme as gr;
+ use crate::tables::grapheme as gr;
let mut iter = chunk.chars().rev();
if let Some(ch) = iter.next() {
if self.grapheme_category(ch) != gr::GC_ZWJ {
@@ -482,6 +488,7 @@ impl GraphemeCursor {
self.state = GraphemeState::Emoji;
}
+ #[inline]
/// Determine whether the current cursor location is a grapheme cluster boundary.
/// Only a part of the string need be supplied. If `chunk_start` is nonzero or
/// the length of `chunk` is not equal to `len` on creation, then this method
@@ -506,7 +513,7 @@ impl GraphemeCursor {
/// assert_eq!(cursor.is_boundary(flags, 0), Ok(false));
/// ```
pub fn is_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<bool, GraphemeIncomplete> {
- use tables::grapheme as gr;
+ use crate::tables::grapheme as gr;
if self.state == GraphemeState::Break {
return Ok(true)
}
@@ -563,6 +570,7 @@ impl GraphemeCursor {
}
}
+ #[inline]
/// Find the next boundary after the current cursor position. Only a part of
/// the string need be supplied. If the chunk is incomplete, then this
/// method might return `GraphemeIncomplete::PreContext` or
diff --git a/src/lib.rs b/src/lib.rs
index 571e33a..6077bbd 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -66,7 +66,7 @@ extern crate quickcheck;
pub use grapheme::{Graphemes, GraphemeIndices};
pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
pub use tables::UNICODE_VERSION;
-pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords};
+pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords, UnicodeWordIndices};
pub use sentence::{USentenceBounds, USentenceBoundIndices, UnicodeSentences};
mod grapheme;
@@ -146,6 +146,30 @@ pub trait UnicodeSegmentation {
/// ```
fn unicode_words<'a>(&'a self) -> UnicodeWords<'a>;
+ /// Returns an iterator over the words of `self`, separated on
+ /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), and their
+ /// offsets.
+ ///
+ /// Here, "words" are just those substrings which, after splitting on
+ /// UAX#29 word boundaries, contain any alphanumeric characters. That is, the
+ /// substring must contain at least one character with the
+ /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
+ /// property, or with
+ /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// # use self::unicode_segmentation::UnicodeSegmentation;
+ /// let uwis = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
+ /// let uwi1 = uwis.unicode_word_indices().collect::<Vec<(usize, &str)>>();
+ /// let b: &[_] = &[(0, "The"), (4, "quick"), (12, "brown"), (20, "fox"), (24, "can't"),
+ /// (30, "jump"), (35, "32.3"), (40, "feet"), (46, "right")];
+ ///
+ /// assert_eq!(&uwi1[..], b);
+ /// ```
+ fn unicode_word_indices<'a>(&'a self) -> UnicodeWordIndices<'a>;
+
/// Returns an iterator over substrings of `self` separated on
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
///
@@ -250,6 +274,11 @@ impl UnicodeSegmentation for str {
}
#[inline]
+ fn unicode_word_indices(&self) -> UnicodeWordIndices {
+ word::new_unicode_word_indices(self)
+ }
+
+ #[inline]
fn split_word_bounds(&self) -> UWordBounds {
word::new_word_bounds(self)
}
diff --git a/src/sentence.rs b/src/sentence.rs
index 275da52..0a23abd 100644
--- a/src/sentence.rs
+++ b/src/sentence.rs
@@ -13,7 +13,7 @@ use core::iter::Filter;
// All of the logic for forward iteration over sentences
mod fwd {
- use tables::sentence::SentenceCat;
+ use crate::tables::sentence::SentenceCat;
use core::cmp;
// Describe a parsed part of source string as described in this table:
@@ -111,7 +111,7 @@ mod fwd {
if parts[idx] == StatePart::ClosePlus { idx -= 1 }
if parts[idx] == StatePart::ATerm {
- use tables::sentence as se;
+ use crate::tables::sentence as se;
for next_char in ahead.chars() {
//( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
@@ -176,7 +176,7 @@ mod fwd {
#[inline]
fn next(&mut self) -> Option<usize> {
- use tables::sentence as se;
+ use crate::tables::sentence as se;
for next_char in self.string[self.pos..].chars() {
let position_before = self.pos;
@@ -331,7 +331,7 @@ pub fn new_sentence_bound_indices<'a>(source: &'a str) -> USentenceBoundIndices<
#[inline]
pub fn new_unicode_sentences<'b>(s: &'b str) -> UnicodeSentences<'b> {
use super::UnicodeSegmentation;
- use tables::util::is_alphanumeric;
+ use crate::tables::util::is_alphanumeric;
fn has_alphanumeric(s: &&str) -> bool { s.chars().any(|c| is_alphanumeric(c)) }
let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer
diff --git a/src/tables.rs b/src/tables.rs
index 7062e36..6d09ea2 100644
--- a/src/tables.rs
+++ b/src/tables.rs
@@ -30,7 +30,7 @@ pub mod util {
#[inline]
fn is_alphabetic(c: char) -> bool {
match c {
- 'a' ... 'z' | 'A' ... 'Z' => true,
+ 'a' ..= 'z' | 'A' ..= 'Z' => true,
c if c > '\x7f' => super::derived_property::Alphabetic(c),
_ => false,
}
@@ -39,7 +39,7 @@ pub mod util {
#[inline]
fn is_numeric(c: char) -> bool {
match c {
- '0' ... '9' => true,
+ '0' ..= '9' => true,
c if c > '\x7f' => super::general_category::N(c),
_ => false,
}
@@ -352,7 +352,6 @@ pub mod grapheme {
}
fn bsearch_range_value_table(c: char, r: &'static [(char, char, GraphemeCat)]) -> (u32, u32, GraphemeCat) {
- use core;
use core::cmp::Ordering::{Equal, Less, Greater};
match r.binary_search_by(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
@@ -1003,7 +1002,6 @@ pub mod word {
}
fn bsearch_range_value_table(c: char, r: &'static [(char, char, WordCat)]) -> (u32, u32, WordCat) {
- use core;
use core::cmp::Ordering::{Equal, Less, Greater};
match r.binary_search_by(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
@@ -1479,7 +1477,6 @@ pub mod emoji {
}
fn bsearch_range_value_table(c: char, r: &'static [(char, char, EmojiCat)]) -> (u32, u32, EmojiCat) {
- use core;
use core::cmp::Ordering::{Equal, Less, Greater};
match r.binary_search_by(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
@@ -1583,7 +1580,6 @@ pub mod sentence {
}
fn bsearch_range_value_table(c: char, r: &'static [(char, char, SentenceCat)]) -> (u32, u32, SentenceCat) {
- use core;
use core::cmp::Ordering::{Equal, Less, Greater};
match r.binary_search_by(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
diff --git a/src/test.rs b/src/test.rs
index 75b77c5..ae74c64 100644
--- a/src/test.rs
+++ b/src/test.rs
@@ -14,7 +14,7 @@ use std::prelude::v1::*;
#[test]
fn test_graphemes() {
- use testdata::{TEST_SAME, TEST_DIFF};
+ use crate::testdata::{TEST_SAME, TEST_DIFF};
pub const EXTRA_DIFF: &'static [(&'static str,
&'static [&'static str],
@@ -88,7 +88,7 @@ fn test_graphemes() {
#[test]
fn test_words() {
- use testdata::TEST_WORD;
+ use crate::testdata::TEST_WORD;
// Unicode's official tests don't really test longer chains of flag emoji
// TODO This could be improved with more tests like flag emoji with interspersed Extend chars and ZWJ
@@ -144,7 +144,7 @@ fn test_words() {
#[test]
fn test_sentences() {
- use testdata::TEST_SENTENCE;
+ use crate::testdata::TEST_SENTENCE;
for &(s, w) in TEST_SENTENCE.iter() {
macro_rules! assert_ {
diff --git a/src/word.rs b/src/word.rs
index 179d122..5cfde0d 100644
--- a/src/word.rs
+++ b/src/word.rs
@@ -11,7 +11,7 @@
use core::cmp;
use core::iter::Filter;
-use tables::word::WordCat;
+use crate::tables::word::WordCat;
/// An iterator over the substrings of a string which, after splitting the string on
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
@@ -40,6 +40,34 @@ impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
fn next_back(&mut self) -> Option<&'a str> { self.inner.next_back() }
}
+/// An iterator over the substrings of a string which, after splitting the string on
+/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
+/// contain any characters with the
+/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
+/// property, or with
+/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
+/// This iterator also provides the byte offsets for each substring.
+///
+/// This struct is created by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See
+/// its documentation for more.
+///
+/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices
+/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
+pub struct UnicodeWordIndices<'a> {
+ inner: Filter<UWordBoundIndices<'a>, fn(&(usize, &str)) -> bool>,
+}
+
+impl<'a> Iterator for UnicodeWordIndices<'a> {
+ type Item = (usize, &'a str);
+
+ #[inline]
+ fn next(&mut self) -> Option<(usize, &'a str)> { self.inner.next() }
+}
+impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> {
+ #[inline]
+ fn next_back(&mut self) -> Option<(usize, &'a str)> { self.inner.next_back() }
+}
+
/// External iterator for a string's
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
///
@@ -142,7 +170,7 @@ enum RegionalState {
}
fn is_emoji(ch: char) -> bool {
- use tables::emoji;
+ use crate::tables::emoji;
emoji::emoji_category(ch).2 == emoji::EmojiCat::EC_Extended_Pictographic
}
@@ -159,7 +187,7 @@ impl<'a> Iterator for UWordBounds<'a> {
fn next(&mut self) -> Option<&'a str> {
use self::UWordBoundsState::*;
use self::FormatExtendType::*;
- use tables::word as wd;
+ use crate::tables::word as wd;
if self.string.len() == 0 {
return None;
}
@@ -172,14 +200,13 @@ impl<'a> Iterator for UWordBounds<'a> {
let mut cat = wd::WC_Any;
let mut savecat = wd::WC_Any;
- // Whether or not the previous category was ZWJ
- // ZWJs get collapsed, so this handles precedence of WB3c over WB4
- let mut prev_zwj;
// If extend/format/zwj were skipped. Handles precedence of WB3d over WB4
let mut skipped_format_extend = false;
for (curr, ch) in self.string.char_indices() {
idx = curr;
- prev_zwj = cat == wd::WC_ZWJ;
+ // Whether or not the previous category was ZWJ
+ // ZWJs get collapsed, so this handles precedence of WB3c over WB4
+ let prev_zwj = cat == wd::WC_ZWJ;
// if there's a category cached, grab it
cat = match self.cat {
None => wd::word_category(ch).2,
@@ -386,7 +413,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
fn next_back(&mut self) -> Option<&'a str> {
use self::UWordBoundsState::*;
use self::FormatExtendType::*;
- use tables::word as wd;
+ use crate::tables::word as wd;
if self.string.len() == 0 {
return None;
}
@@ -638,7 +665,7 @@ impl<'a> UWordBounds<'a> {
#[inline]
fn get_next_cat(&self, idx: usize) -> Option<WordCat> {
- use tables::word as wd;
+ use crate::tables::word as wd;
let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
if nidx < self.string.len() {
let nch = self.string[nidx..].chars().next().unwrap();
@@ -650,7 +677,7 @@ impl<'a> UWordBounds<'a> {
#[inline]
fn get_prev_cat(&self, idx: usize) -> Option<WordCat> {
- use tables::word as wd;
+ use crate::tables::word as wd;
if idx > 0 {
let nch = self.string[..idx].chars().next_back().unwrap();
Some(wd::word_category(nch).2)
@@ -671,12 +698,22 @@ pub fn new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b> {
}
#[inline]
+fn has_alphanumeric(s: &&str) -> bool {
+ use crate::tables::util::is_alphanumeric;
+
+ s.chars().any(|c| is_alphanumeric(c))
+}
+
+#[inline]
pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> {
use super::UnicodeSegmentation;
- use tables::util::is_alphanumeric;
-
- fn has_alphanumeric(s: &&str) -> bool { s.chars().any(|c| is_alphanumeric(c)) }
- let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer
UnicodeWords { inner: s.split_word_bounds().filter(has_alphanumeric) }
}
+
+#[inline]
+pub fn new_unicode_word_indices<'b>(s: &'b str) -> UnicodeWordIndices<'b> {
+ use super::UnicodeSegmentation;
+
+ UnicodeWordIndices { inner: s.split_word_bound_indices().filter(|(_, c)| has_alphanumeric(c)) }
+}