aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Jeff Vander Stoep <jeffv@google.com> 2023-02-22 00:18:19 +0000
committer Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com> 2023-02-22 00:18:19 +0000
commit fdafdc4b1f0a3e23f015a8af9631bf73ac5dd8eb (patch)
tree fb9d5538736d1dc8e521d57d80211eb91f95f8cd
parent 84d24275bdccd022929eea40bd56b6308e6114d0 (diff)
parent 3ab9517db750d1a17f1d6b3f83dabeea9bd96e72 (diff)
download unicode-segmentation-fdafdc4b1f0a3e23f015a8af9631bf73ac5dd8eb.tar.gz
Upgrade unicode-segmentation to 1.10.1 am: af4da18ee6 am: a3413ee32f am: b53e9a1bb0 am: 3ab9517db7
Original change: https://android-review.googlesource.com/c/platform/external/rust/crates/unicode-segmentation/+/2441309
Change-Id: I546ebb11eb84ad2fabd457bba7bef5c4dea01f49
Signed-off-by: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>
-rw-r--r-- .cargo_vcs_info.json 2
-rw-r--r-- Android.bp 2
-rw-r--r-- Cargo.toml 2
-rw-r--r-- Cargo.toml.orig 2
-rw-r--r-- METADATA 10
-rw-r--r-- README.md 2
-rw-r--r-- benches/unicode_words.rs 55
-rw-r--r-- benches/word_bounds.rs 55
-rwxr-xr-x scripts/unicode.py 62
-rw-r--r-- src/tables.rs 346
10 files changed, 449 insertions, 89 deletions
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json
index 6df8781..57ba2a9 100644
--- a/.cargo_vcs_info.json
+++ b/.cargo_vcs_info.json
@@ -1,6 +1,6 @@
{
"git": {
- "sha1": "6ee83831a3086d509b2a87158685000dab83be65"
+ "sha1": "d02ab6e5d14f009cde8c152ef19dcb55d0b594fc"
},
"path_in_vcs": ""
}
\ No newline at end of file
diff --git a/Android.bp b/Android.bp
index 1735760..858aba0 100644
--- a/Android.bp
+++ b/Android.bp
@@ -45,7 +45,7 @@ rust_library {
host_supported: true,
crate_name: "unicode_segmentation",
cargo_env_compat: true,
- cargo_pkg_version: "1.10.0",
+ cargo_pkg_version: "1.10.1",
srcs: ["src/lib.rs"],
edition: "2018",
apex_available: [
diff --git a/Cargo.toml b/Cargo.toml
index 0da56c8..83dea28 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -12,7 +12,7 @@
[package]
edition = "2018"
name = "unicode-segmentation"
-version = "1.10.0"
+version = "1.10.1"
authors = [
"kwantam <kwantam@gmail.com>",
"Manish Goregaokar <manishsmail@gmail.com>",
diff --git a/Cargo.toml.orig b/Cargo.toml.orig
index ac21095..84ef138 100644
--- a/Cargo.toml.orig
+++ b/Cargo.toml.orig
@@ -1,7 +1,7 @@
[package]
name = "unicode-segmentation"
-version = "1.10.0"
+version = "1.10.1"
authors = ["kwantam <kwantam@gmail.com>", "Manish Goregaokar <manishsmail@gmail.com>"]
edition = "2018"
diff --git a/METADATA b/METADATA
index c54b937..860e2a9 100644
--- a/METADATA
+++ b/METADATA
@@ -11,13 +11,13 @@ third_party {
}
url {
type: ARCHIVE
- value: "https://static.crates.io/crates/unicode-segmentation/unicode-segmentation-1.10.0.crate"
+ value: "https://static.crates.io/crates/unicode-segmentation/unicode-segmentation-1.10.1.crate"
}
- version: "1.10.0"
+ version: "1.10.1"
license_type: NOTICE
last_upgrade_date {
- year: 2022
- month: 12
- day: 19
+ year: 2023
+ month: 2
+ day: 17
}
}
diff --git a/README.md b/README.md
index 48d9a92..ef61ebd 100644
--- a/README.md
+++ b/README.md
@@ -38,7 +38,7 @@ to your `Cargo.toml`:
```toml
[dependencies]
-unicode-segmentation = "1.9.0"
+unicode-segmentation = "1.10.1"
```
# Change Log
diff --git a/benches/unicode_words.rs b/benches/unicode_words.rs
index c87851a..a7f8f41 100644
--- a/benches/unicode_words.rs
+++ b/benches/unicode_words.rs
@@ -1,55 +1,52 @@
-#[macro_use]
-extern crate bencher;
-extern crate unicode_segmentation;
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
-use bencher::Bencher;
use std::fs;
use unicode_segmentation::UnicodeSegmentation;
-fn unicode_words(bench: &mut Bencher, path: &str) {
+fn unicode_words(c: &mut Criterion, lang: &str, path: &str) {
let text = fs::read_to_string(path).unwrap();
- bench.iter(|| {
- for w in text.unicode_words() {
- bencher::black_box(w);
- }
+ c.bench_function(&format!("unicode_words_{}", lang), |bench| {
+ bench.iter(|| {
+ for w in text.unicode_words() {
+ black_box(w);
+ }
+ })
});
-
- bench.bytes = text.len() as u64;
}
-fn unicode_words_arabic(bench: &mut Bencher) {
- unicode_words(bench, "benches/texts/arabic.txt");
+fn unicode_words_arabic(c: &mut Criterion) {
+ unicode_words(c, "arabic", "benches/texts/arabic.txt");
}
-fn unicode_words_english(bench: &mut Bencher) {
- unicode_words(bench, "benches/texts/english.txt");
+fn unicode_words_english(c: &mut Criterion) {
+ unicode_words(c, "english", "benches/texts/english.txt");
}
-fn unicode_words_hindi(bench: &mut Bencher) {
- unicode_words(bench, "benches/texts/hindi.txt");
+fn unicode_words_hindi(c: &mut Criterion) {
+ unicode_words(c, "hindi", "benches/texts/hindi.txt");
}
-fn unicode_words_japanese(bench: &mut Bencher) {
- unicode_words(bench, "benches/texts/japanese.txt");
+fn unicode_words_japanese(c: &mut Criterion) {
+ unicode_words(c, "japanese", "benches/texts/japanese.txt");
}
-fn unicode_words_korean(bench: &mut Bencher) {
- unicode_words(bench, "benches/texts/korean.txt");
+fn unicode_words_korean(c: &mut Criterion) {
+ unicode_words(c, "korean", "benches/texts/korean.txt");
}
-fn unicode_words_mandarin(bench: &mut Bencher) {
- unicode_words(bench, "benches/texts/mandarin.txt");
+fn unicode_words_mandarin(c: &mut Criterion) {
+ unicode_words(c, "mandarin", "benches/texts/mandarin.txt");
}
-fn unicode_words_russian(bench: &mut Bencher) {
- unicode_words(bench, "benches/texts/russian.txt");
+fn unicode_words_russian(c: &mut Criterion) {
+ unicode_words(c, "russian", "benches/texts/russian.txt");
}
-fn unicode_words_source_code(bench: &mut Bencher) {
- unicode_words(bench, "benches/texts/source_code.txt");
+fn unicode_words_source_code(c: &mut Criterion) {
+ unicode_words(c, "source_code", "benches/texts/source_code.txt");
}
-benchmark_group!(
+criterion_group!(
benches,
unicode_words_arabic,
unicode_words_english,
@@ -61,4 +58,4 @@ benchmark_group!(
unicode_words_source_code,
);
-benchmark_main!(benches);
+criterion_main!(benches);
diff --git a/benches/word_bounds.rs b/benches/word_bounds.rs
index 6b01ddb..cae7a88 100644
--- a/benches/word_bounds.rs
+++ b/benches/word_bounds.rs
@@ -1,55 +1,52 @@
-#[macro_use]
-extern crate bencher;
-extern crate unicode_segmentation;
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
-use bencher::Bencher;
use std::fs;
use unicode_segmentation::UnicodeSegmentation;
-fn word_bounds(bench: &mut Bencher, path: &str) {
+fn word_bounds(c: &mut Criterion, lang: &str, path: &str) {
let text = fs::read_to_string(path).unwrap();
- bench.iter(|| {
- for w in text.split_word_bounds() {
- bencher::black_box(w);
- }
+ c.bench_function(&format!("word_bounds_{}", lang), |bench| {
+ bench.iter(|| {
+ for w in text.split_word_bounds() {
+ black_box(w);
+ }
+ });
});
-
- bench.bytes = text.len() as u64;
}
-fn word_bounds_arabic(bench: &mut Bencher) {
- word_bounds(bench, "benches/texts/arabic.txt");
+fn word_bounds_arabic(c: &mut Criterion) {
+ word_bounds(c, "arabic", "benches/texts/arabic.txt");
}
-fn word_bounds_english(bench: &mut Bencher) {
- word_bounds(bench, "benches/texts/english.txt");
+fn word_bounds_english(c: &mut Criterion) {
+ word_bounds(c, "english", "benches/texts/english.txt");
}
-fn word_bounds_hindi(bench: &mut Bencher) {
- word_bounds(bench, "benches/texts/hindi.txt");
+fn word_bounds_hindi(c: &mut Criterion) {
+ word_bounds(c, "hindi", "benches/texts/hindi.txt");
}
-fn word_bounds_japanese(bench: &mut Bencher) {
- word_bounds(bench, "benches/texts/japanese.txt");
+fn word_bounds_japanese(c: &mut Criterion) {
+ word_bounds(c, "japanese", "benches/texts/japanese.txt");
}
-fn word_bounds_korean(bench: &mut Bencher) {
- word_bounds(bench, "benches/texts/korean.txt");
+fn word_bounds_korean(c: &mut Criterion) {
+ word_bounds(c, "korean", "benches/texts/korean.txt");
}
-fn word_bounds_mandarin(bench: &mut Bencher) {
- word_bounds(bench, "benches/texts/mandarin.txt");
+fn word_bounds_mandarin(c: &mut Criterion) {
+ word_bounds(c, "mandarin", "benches/texts/mandarin.txt");
}
-fn word_bounds_russian(bench: &mut Bencher) {
- word_bounds(bench, "benches/texts/russian.txt");
+fn word_bounds_russian(c: &mut Criterion) {
+ word_bounds(c, "russian", "benches/texts/russian.txt");
}
-fn word_bounds_source_code(bench: &mut Bencher) {
- word_bounds(bench, "benches/texts/source_code.txt");
+fn word_bounds_source_code(c: &mut Criterion) {
+ word_bounds(c, "source_code", "benches/texts/source_code.txt");
}
-benchmark_group!(
+criterion_group!(
benches,
word_bounds_arabic,
word_bounds_english,
@@ -61,4 +58,4 @@ benchmark_group!(
word_bounds_source_code,
);
-benchmark_main!(benches);
+criterion_main!(benches);
diff --git a/scripts/unicode.py b/scripts/unicode.py
index 7aed85e..18cea99 100755
--- a/scripts/unicode.py
+++ b/scripts/unicode.py
@@ -274,13 +274,36 @@ def emit_break_module(f, break_table, break_cats, name):
pub enum %sCat {
""" % (name, Name, Name))
+ # We don't want the lookup table to be too large so choose a reasonable
+ # cutoff. 0x20000 is selected because most of the range table entries are
+ # within the interval of [0x0, 0x20000]
+ lookup_value_cutoff = 0x20000
+
+ # Length of lookup table. It has to be a divisor of `lookup_value_cutoff`.
+ lookup_table_len = 0x400
+
+ lookup_interval = round(lookup_value_cutoff / lookup_table_len)
+
+ # Lookup table is a mapping from `character code / lookup_interval` to
+ # the index in the range table that covers the `character code`.
+ lookup_table = [0] * lookup_table_len
+ j = 0
+ for i in range(0, lookup_table_len):
+ lookup_from = i * lookup_interval
+ while j < len(break_table):
+ (_, entry_to, _) = break_table[j]
+ if entry_to >= lookup_from:
+ break
+ j += 1
+ lookup_table[i] = j
+
break_cats.append("Any")
break_cats.sort()
for cat in break_cats:
f.write((" %sC_" % Name[0]) + cat + ",\n")
f.write(""" }
- fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> (u32, u32, %sCat) {
+ fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)], default_lower: u32, default_upper: u32) -> (u32, u32, %sCat) {
use core::cmp::Ordering::{Equal, Less, Greater};
match r.binary_search_by(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
@@ -293,8 +316,8 @@ def emit_break_module(f, break_table, break_cats, name):
}
Err(idx) => {
(
- if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
- r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
+ if idx > 0 { r[idx-1].1 as u32 + 1 } else { default_lower },
+ r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(default_upper),
%sC_Any,
)
}
@@ -302,10 +325,39 @@ def emit_break_module(f, break_table, break_cats, name):
}
pub fn %s_category(c: char) -> (u32, u32, %sCat) {
- bsearch_range_value_table(c, %s_cat_table)
+ // Perform a quick O(1) lookup in a precomputed table to determine
+ // the slice of the range table to search in.
+ let lookup_interval = 0x%x;
+ let idx = (c as u32 / lookup_interval) as usize;
+ let range = %s_cat_lookup.get(idx..(idx + 2)).map_or(
+ // If the `idx` is outside of the precomputed table - use the slice
+ // starting from the last covered index in the precomputed table and
+ // ending with the length of the range table.
+ %d..%d,
+ |r| (r[0] as usize)..((r[1] + 1) as usize)
+ );
+
+ // Compute pessimistic default lower and upper bounds on the category.
+ // If character doesn't map to any range and there is no adjacent range
+ // in the table slice - these bounds has to apply.
+ let lower = idx as u32 * lookup_interval;
+ let upper = lower + lookup_interval - 1;
+ bsearch_range_value_table(c, &%s_cat_table[range], lower, upper)
}
-""" % (Name, Name, Name[0], name, Name, name))
+""" % (Name, Name, Name[0], name, Name, lookup_interval, name, j, len(break_table), name))
+
+
+ if len(break_table) <= 0xff:
+ lookup_type = "u8"
+ elif len(break_table) <= 0xffff:
+ lookup_type = "u16"
+ else:
+ lookup_type = "u32"
+
+ emit_table(f, "%s_cat_lookup" % name, lookup_table, "&'static [%s]" % lookup_type,
+ pfun=lambda x: "%d" % x,
+ is_pub=False, is_const=True)
emit_table(f, "%s_cat_table" % name, break_table, "&'static [(char, char, %sCat)]" % Name,
pfun=lambda x: "(%s,%s,%sC_%s)" % (escape_char(x[0]), escape_char(x[1]), Name[0], x[2]),
diff --git a/src/tables.rs b/src/tables.rs
index 5a811c9..ca83b50 100644
--- a/src/tables.rs
+++ b/src/tables.rs
@@ -365,7 +365,7 @@ pub mod grapheme {
GC_ZWJ,
}
- fn bsearch_range_value_table(c: char, r: &'static [(char, char, GraphemeCat)]) -> (u32, u32, GraphemeCat) {
+ fn bsearch_range_value_table(c: char, r: &'static [(char, char, GraphemeCat)], default_lower: u32, default_upper: u32) -> (u32, u32, GraphemeCat) {
use core::cmp::Ordering::{Equal, Less, Greater};
match r.binary_search_by(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
@@ -378,8 +378,8 @@ pub mod grapheme {
}
Err(idx) => {
(
- if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
- r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
+ if idx > 0 { r[idx-1].1 as u32 + 1 } else { default_lower },
+ r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(default_upper),
GC_Any,
)
}
@@ -387,9 +387,93 @@ pub mod grapheme {
}
pub fn grapheme_category(c: char) -> (u32, u32, GraphemeCat) {
- bsearch_range_value_table(c, grapheme_cat_table)
+ // Perform a quick O(1) lookup in a precomputed table to determine
+ // the slice of the range table to search in.
+ let lookup_interval = 0x80;
+ let idx = (c as u32 / lookup_interval) as usize;
+ let range = grapheme_cat_lookup.get(idx..(idx + 2)).map_or(
+ // If the `idx` is outside of the precomputed table - use the slice
+ // starting from the last covered index in the precomputed table and
+ // ending with the length of the range table.
+ 1443..1449,
+ |r| (r[0] as usize)..((r[1] + 1) as usize)
+ );
+
+ // Compute pessimistic default lower and upper bounds on the category.
+ // If character doesn't map to any range and there is no adjacent range
+ // in the table slice - these bounds has to apply.
+ let lower = idx as u32 * lookup_interval;
+ let upper = lower + lookup_interval - 1;
+ bsearch_range_value_table(c, &grapheme_cat_table[range], lower, upper)
}
+ const grapheme_cat_lookup: &'static [u16] = &[
+ 0, 5, 9, 9, 9, 9, 9, 10, 10, 10, 11, 11, 16, 21, 26, 29, 32, 37, 41, 53, 65, 75, 86, 97,
+ 106, 116, 131, 143, 153, 157, 161, 168, 173, 183, 188, 189, 191, 191, 191, 192, 192, 192,
+ 192, 192, 192, 192, 192, 198, 206, 209, 211, 219, 219, 232, 233, 242, 258, 262, 270, 270,
+ 271, 271, 271, 271, 271, 279, 280, 282, 284, 284, 284, 286, 290, 290, 291, 291, 295, 297,
+ 298, 313, 317, 317, 317, 318, 318, 318, 318, 322, 322, 322, 323, 324, 325, 325, 325, 325,
+ 325, 328, 329, 329, 329, 329, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331,
+ 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331,
+ 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331,
+ 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331,
+ 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331,
+ 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331,
+ 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331,
+ 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331,
+ 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331,
+ 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331,
+ 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331,
+ 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331,
+ 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331,
+ 331, 331, 331, 333, 335, 335, 335, 342, 347, 351, 360, 369, 379, 379, 386, 395, 405, 413,
+ 423, 431, 441, 450, 459, 469, 477, 487, 495, 505, 514, 523, 533, 541, 551, 559, 569, 578,
+ 587, 597, 605, 615, 623, 633, 642, 651, 661, 669, 679, 687, 697, 706, 715, 725, 733, 743,
+ 751, 761, 770, 779, 789, 797, 807, 815, 825, 834, 843, 853, 861, 871, 879, 889, 898, 907,
+ 917, 925, 935, 943, 953, 962, 971, 981, 989, 999, 1007, 1017, 1026, 1035, 1045, 1053, 1063,
+ 1071, 1081, 1090, 1099, 1109, 1117, 1127, 1135, 1145, 1154, 1163, 1173, 1181, 1186, 1186,
+ 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186,
+ 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186,
+ 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186,
+ 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186,
+ 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1187, 1187, 1187, 1187, 1187, 1187,
+ 1189, 1190, 1190, 1192, 1192, 1192, 1192, 1193, 1193, 1194, 1195, 1195, 1195, 1195, 1195,
+ 1195, 1195, 1195, 1195, 1195, 1195, 1195, 1195, 1195, 1200, 1201, 1201, 1201, 1201, 1201,
+ 1202, 1202, 1202, 1204, 1205, 1206, 1212, 1221, 1227, 1236, 1244, 1247, 1260, 1260, 1267,
+ 1278, 1278, 1286, 1292, 1299, 1303, 1303, 1307, 1307, 1318, 1324, 1333, 1337, 1337, 1337,
+ 1342, 1349, 1355, 1361, 1361, 1363, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372,
+ 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372,
+ 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372,
+ 1372, 1372, 1372, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375,
+ 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375,
+ 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375,
+ 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375,
+ 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375,
+ 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375,
+ 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375,
+ 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1376, 1377, 1377, 1377, 1377, 1377, 1377, 1377,
+ 1377, 1378, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382,
+ 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382,
+ 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382,
+ 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382,
+ 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382,
+ 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382,
+ 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382,
+ 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382,
+ 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382,
+ 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382,
+ 1382, 1382, 1382, 1382, 1382, 1382, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384,
+ 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384,
+ 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1386, 1386,
+ 1386, 1386, 1392, 1395, 1396, 1396, 1396, 1396, 1396, 1396, 1396, 1396, 1396, 1396, 1396,
+ 1396, 1396, 1396, 1396, 1396, 1399, 1402, 1402, 1402, 1402, 1402, 1402, 1402, 1402, 1402,
+ 1402, 1402, 1407, 1408, 1409, 1409, 1409, 1411, 1411, 1411, 1411, 1412, 1412, 1412, 1412,
+ 1412, 1412, 1412, 1412, 1413, 1414, 1414, 1414, 1414, 1414, 1414, 1414, 1414, 1414, 1414,
+ 1414, 1414, 1414, 1414, 1414, 1415, 1419, 1423, 1428, 1428, 1428, 1430, 1430, 1430, 1431,
+ 1431, 1432, 1433, 1434, 1435, 1438, 1440, 1442, 1442, 1442, 1443, 1443, 1443, 1443, 1443,
+ 1443, 1443, 1443, 1443, 1443
+ ];
+
const grapheme_cat_table: &'static [(char, char, GraphemeCat)] = &[
('\u{0}', '\u{9}', GC_Control), ('\u{a}', '\u{a}', GC_LF), ('\u{b}', '\u{c}', GC_Control),
('\u{d}', '\u{d}', GC_CR), ('\u{e}', '\u{1f}', GC_Control), ('\u{7f}', '\u{9f}',
@@ -1028,7 +1112,7 @@ pub mod word {
WC_ZWJ,
}
- fn bsearch_range_value_table(c: char, r: &'static [(char, char, WordCat)]) -> (u32, u32, WordCat) {
+ fn bsearch_range_value_table(c: char, r: &'static [(char, char, WordCat)], default_lower: u32, default_upper: u32) -> (u32, u32, WordCat) {
use core::cmp::Ordering::{Equal, Less, Greater};
match r.binary_search_by(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
@@ -1041,8 +1125,8 @@ pub mod word {
}
Err(idx) => {
(
- if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
- r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
+ if idx > 0 { r[idx-1].1 as u32 + 1 } else { default_lower },
+ r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(default_upper),
WC_Any,
)
}
@@ -1050,9 +1134,87 @@ pub mod word {
}
pub fn word_category(c: char) -> (u32, u32, WordCat) {
- bsearch_range_value_table(c, word_cat_table)
+ // Perform a quick O(1) lookup in a precomputed table to determine
+ // the slice of the range table to search in.
+ let lookup_interval = 0x80;
+ let idx = (c as u32 / lookup_interval) as usize;
+ let range = word_cat_lookup.get(idx..(idx + 2)).map_or(
+ // If the `idx` is outside of the precomputed table - use the slice
+ // starting from the last covered index in the precomputed table and
+ // ending with the length of the range table.
+ 1050..1053,
+ |r| (r[0] as usize)..((r[1] + 1) as usize)
+ );
+
+ // Compute pessimistic default lower and upper bounds on the category.
+ // If character doesn't map to any range and there is no adjacent range
+ // in the table slice - these bounds has to apply.
+ let lower = idx as u32 * lookup_interval;
+ let upper = lower + lookup_interval - 1;
+ bsearch_range_value_table(c, &word_cat_table[range], lower, upper)
}
+ const word_cat_lookup: &'static [u16] = &[
+ 0, 14, 22, 22, 22, 22, 24, 30, 36, 36, 38, 43, 55, 66, 78, 83, 93, 104, 111, 121, 143, 162,
+ 180, 198, 215, 231, 250, 266, 278, 282, 286, 295, 301, 308, 316, 316, 316, 321, 329, 333,
+ 336, 336, 336, 336, 336, 338, 342, 351, 354, 359, 365, 369, 370, 375, 378, 384, 391, 397,
+ 409, 409, 411, 411, 411, 420, 430, 449, 451, 464, 465, 465, 465, 465, 465, 465, 466, 466,
+ 466, 466, 466, 466, 466, 466, 466, 466, 466, 466, 466, 466, 466, 466, 470, 476, 486, 487,
+ 487, 487, 487, 492, 496, 497, 500, 500, 501, 502, 502, 502, 502, 502, 502, 502, 502, 502,
+ 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502,
+ 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502,
+ 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502,
+ 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502,
+ 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502,
+ 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502,
+ 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502,
+ 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502,
+ 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502,
+ 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502,
+ 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502,
+ 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502,
+ 502, 502, 504, 504, 504, 511, 515, 515, 519, 529, 538, 544, 551, 559, 568, 574, 578, 578,
+ 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578,
+ 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578,
+ 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578,
+ 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578,
+ 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 581, 581, 581, 581,
+ 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581,
+ 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581,
+ 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581,
+ 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 592, 593, 593, 593, 594,
+ 597, 609, 611, 620, 628, 634, 635, 636, 637, 637, 640, 644, 648, 648, 652, 655, 662, 662,
+ 662, 665, 668, 675, 678, 680, 682, 692, 696, 699, 700, 701, 703, 706, 706, 706, 710, 714,
+ 718, 726, 734, 744, 753, 759, 767, 785, 785, 791, 796, 796, 801, 805, 809, 811, 811, 813,
+ 815, 828, 835, 844, 848, 848, 848, 854, 857, 869, 875, 875, 877, 885, 886, 886, 886, 886,
+ 886, 886, 886, 886, 887, 888, 888, 889, 889, 889, 889, 889, 889, 889, 889, 889, 889, 889,
+ 889, 889, 889, 889, 889, 889, 889, 889, 889, 889, 890, 890, 890, 890, 890, 890, 890, 890,
+ 890, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895,
+ 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895,
+ 895, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896,
+ 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896,
+ 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896,
+ 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896,
+ 896, 899, 903, 908, 909, 909, 909, 909, 909, 910, 910, 913, 920, 920, 920, 920, 920, 920,
+ 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920,
+ 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920,
+ 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920,
+ 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920,
+ 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920,
+ 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920,
+ 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 923, 924, 924, 927,
+ 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927,
+ 927, 927, 927, 929, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933,
+ 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933,
+ 933, 933, 933, 933, 933, 935, 935, 935, 935, 938, 941, 942, 942, 942, 942, 943, 951, 960,
+ 960, 960, 964, 968, 973, 973, 973, 973, 973, 976, 979, 979, 979, 979, 979, 979, 979, 979,
+ 979, 981, 981, 987, 988, 993, 993, 993, 998, 998, 998, 998, 1001, 1001, 1001, 1001, 1001,
+ 1001, 1005, 1005, 1007, 1011, 1011, 1011, 1011, 1011, 1011, 1011, 1011, 1011, 1011, 1039,
+ 1044, 1044, 1044, 1044, 1044, 1046, 1048, 1048, 1048, 1048, 1049, 1049, 1049, 1049, 1049,
+ 1049, 1049, 1049, 1049, 1049, 1049, 1049, 1049, 1049, 1049, 1049, 1050, 1050, 1050, 1050,
+ 1050, 1050, 1050, 1050
+ ];
+
const word_cat_table: &'static [(char, char, WordCat)] = &[
('\u{a}', '\u{a}', WC_LF), ('\u{b}', '\u{c}', WC_Newline), ('\u{d}', '\u{d}', WC_CR),
('\u{20}', '\u{20}', WC_WSegSpace), ('\u{22}', '\u{22}', WC_Double_Quote), ('\u{27}',
@@ -1530,7 +1692,7 @@ pub mod emoji {
EC_Extended_Pictographic,
}
- fn bsearch_range_value_table(c: char, r: &'static [(char, char, EmojiCat)]) -> (u32, u32, EmojiCat) {
+ fn bsearch_range_value_table(c: char, r: &'static [(char, char, EmojiCat)], default_lower: u32, default_upper: u32) -> (u32, u32, EmojiCat) {
use core::cmp::Ordering::{Equal, Less, Greater};
match r.binary_search_by(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
@@ -1543,8 +1705,8 @@ pub mod emoji {
}
Err(idx) => {
(
- if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
- r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
+ if idx > 0 { r[idx-1].1 as u32 + 1 } else { default_lower },
+ r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(default_upper),
EC_Any,
)
}
@@ -1552,9 +1714,73 @@ pub mod emoji {
}
pub fn emoji_category(c: char) -> (u32, u32, EmojiCat) {
- bsearch_range_value_table(c, emoji_cat_table)
+ // Perform a quick O(1) lookup in a precomputed table to determine
+ // the slice of the range table to search in.
+ let lookup_interval = 0x80;
+ let idx = (c as u32 / lookup_interval) as usize;
+ let range = emoji_cat_lookup.get(idx..(idx + 2)).map_or(
+ // If the `idx` is outside of the precomputed table - use the slice
+ // starting from the last covered index in the precomputed table and
+ // ending with the length of the range table.
+ 77..78,
+ |r| (r[0] as usize)..((r[1] + 1) as usize)
+ );
+
+ // Compute pessimistic default lower and upper bounds on the category.
+ // If character doesn't map to any range and there is no adjacent range
+ // in the table slice - these bounds has to apply.
+ let lower = idx as u32 * lookup_interval;
+ let upper = lower + lookup_interval - 1;
+ bsearch_range_value_table(c, &emoji_cat_table[range], lower, upper)
}
+ const emoji_cat_lookup: &'static [u8] = &[
+ 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 4, 4, 6, 8, 8, 8, 10, 14, 14, 15, 15, 19, 21, 22, 37, 41, 41, 41, 42, 42, 42, 42,
+ 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 48, 48, 48, 48, 48, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 51, 55, 58, 63, 63, 63, 64, 64, 64, 65, 65, 66, 67,
+ 68, 69, 72, 74, 76, 76, 76, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77
+ ];
+
const emoji_cat_table: &'static [(char, char, EmojiCat)] = &[
('\u{a9}', '\u{a9}', EC_Extended_Pictographic), ('\u{ae}', '\u{ae}',
EC_Extended_Pictographic), ('\u{203c}', '\u{203c}', EC_Extended_Pictographic), ('\u{2049}',
@@ -1633,7 +1859,7 @@ pub mod sentence {
SC_Upper,
}
- fn bsearch_range_value_table(c: char, r: &'static [(char, char, SentenceCat)]) -> (u32, u32, SentenceCat) {
+ fn bsearch_range_value_table(c: char, r: &'static [(char, char, SentenceCat)], default_lower: u32, default_upper: u32) -> (u32, u32, SentenceCat) {
use core::cmp::Ordering::{Equal, Less, Greater};
match r.binary_search_by(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
@@ -1646,8 +1872,8 @@ pub mod sentence {
}
Err(idx) => {
(
- if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
- r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
+ if idx > 0 { r[idx-1].1 as u32 + 1 } else { default_lower },
+ r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(default_upper),
SC_Any,
)
}
@@ -1655,9 +1881,97 @@ pub mod sentence {
}
pub fn sentence_category(c: char) -> (u32, u32, SentenceCat) {
- bsearch_range_value_table(c, sentence_cat_table)
+ // Perform a quick O(1) lookup in a precomputed table to determine
+ // the slice of the range table to search in.
+ let lookup_interval = 0x80;
+ let idx = (c as u32 / lookup_interval) as usize;
+ let range = sentence_cat_lookup.get(idx..(idx + 2)).map_or(
+ // If the `idx` is outside of the precomputed table - use the slice
+ // starting from the last covered index in the precomputed table and
+ // ending with the length of the range table.
+ 2410..2421,
+ |r| (r[0] as usize)..((r[1] + 1) as usize)
+ );
+
+ // Compute pessimistic default lower and upper bounds on the category.
+ // If the character doesn't map to any range and there is no adjacent range
+ // in the table slice, these bounds have to apply.
+ let lower = idx as u32 * lookup_interval;
+ let upper = lower + lookup_interval - 1;
+ bsearch_range_value_table(c, &sentence_cat_table[range], lower, upper)
}
+ const sentence_cat_lookup: &'static [u16] = &[
+ 0, 19, 31, 154, 247, 314, 323, 333, 375, 409, 528, 579, 588, 599, 612, 618, 629, 643, 650,
+ 661, 683, 702, 720, 738, 755, 771, 790, 806, 818, 825, 840, 850, 856, 871, 882, 882, 882,
+ 887, 895, 901, 904, 904, 904, 904, 904, 907, 912, 922, 928, 937, 943, 950, 953, 959, 964,
+ 973, 980, 988, 1000, 1000, 1002, 1130, 1249, 1267, 1288, 1308, 1311, 1336, 1340, 1340, 1340,
+ 1342, 1342, 1342, 1344, 1344, 1344, 1344, 1344, 1346, 1348, 1348, 1348, 1348, 1351, 1351,
+ 1351, 1351, 1351, 1369, 1476, 1482, 1492, 1501, 1501, 1501, 1501, 1512, 1517, 1518, 1521,
+ 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521,
+ 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521,
+ 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521,
+ 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1522, 1522, 1522, 1522, 1522,
+ 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522,
+ 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522,
+ 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522,
+ 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522,
+ 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522,
+ 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522,
+ 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522,
+ 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522,
+ 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522,
+ 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522,
+ 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522,
+ 1522, 1522, 1522, 1522, 1525, 1525, 1525, 1580, 1613, 1696, 1769, 1780, 1790, 1797, 1808,
+ 1819, 1836, 1843, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849,
+ 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849,
+ 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849,
+ 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849,
+ 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849,
+ 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849,
+ 1849, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852,
+ 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852,
+ 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852,
+ 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852,
+ 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1853, 1854, 1864, 1865, 1865,
+ 1865, 1867, 1870, 1886, 1888, 1905, 1913, 1919, 1920, 1921, 1922, 1922, 1925, 1929, 1933,
+ 1935, 1939, 1942, 1949, 1949, 1949, 1952, 1957, 1964, 1967, 1969, 1971, 1982, 1986, 1989,
+ 1990, 1991, 1993, 1996, 1996, 1996, 2000, 2005, 2010, 2019, 2028, 2039, 2051, 2059, 2068,
+ 2086, 2086, 2093, 2098, 2098, 2105, 2110, 2114, 2119, 2119, 2121, 2124, 2139, 2146, 2156,
+ 2161, 2161, 2161, 2168, 2171, 2183, 2189, 2189, 2192, 2201, 2202, 2202, 2202, 2202, 2202,
+ 2202, 2202, 2202, 2203, 2204, 2204, 2205, 2205, 2205, 2205, 2205, 2205, 2205, 2205, 2205,
+ 2205, 2205, 2205, 2205, 2205, 2205, 2205, 2205, 2205, 2205, 2205, 2205, 2206, 2206, 2206,
+ 2206, 2206, 2206, 2206, 2206, 2206, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211,
+ 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211,
+ 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2212, 2212, 2212,
+ 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212,
+ 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212,
+ 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212,
+ 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212,
+ 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2216, 2221, 2228, 2229, 2229, 2229,
+ 2229, 2229, 2231, 2232, 2235, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242,
+ 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242,
+ 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242,
+ 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2243, 2243, 2243, 2243, 2243, 2243, 2243,
+ 2243, 2243, 2243, 2244, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245,
+ 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245,
+ 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245,
+ 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245,
+ 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2248, 2248,
+ 2248, 2253, 2253, 2253, 2254, 2254, 2254, 2254, 2254, 2254, 2254, 2254, 2254, 2254, 2254,
+ 2254, 2254, 2254, 2254, 2254, 2254, 2254, 2254, 2256, 2261, 2261, 2261, 2261, 2261, 2261,
+ 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261,
+ 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261,
+ 2261, 2263, 2263, 2263, 2263, 2266, 2269, 2270, 2270, 2270, 2270, 2275, 2288, 2300, 2305,
+ 2310, 2316, 2322, 2330, 2330, 2330, 2330, 2330, 2333, 2337, 2337, 2337, 2337, 2337, 2337,
+ 2337, 2337, 2337, 2341, 2341, 2347, 2348, 2353, 2353, 2353, 2358, 2358, 2358, 2358, 2361,
+ 2361, 2361, 2361, 2361, 2361, 2365, 2365, 2367, 2372, 2372, 2372, 2372, 2372, 2372, 2372,
+ 2372, 2372, 2372, 2400, 2405, 2405, 2405, 2405, 2405, 2407, 2408, 2408, 2408, 2408, 2408,
+ 2408, 2408, 2408, 2408, 2409, 2409, 2409, 2409, 2409, 2409, 2409, 2409, 2409, 2409, 2409,
+ 2410, 2410, 2410, 2410, 2410, 2410, 2410, 2410
+ ];
+
const sentence_cat_table: &'static [(char, char, SentenceCat)] = &[
('\u{9}', '\u{9}', SC_Sp), ('\u{a}', '\u{a}', SC_LF), ('\u{b}', '\u{c}', SC_Sp), ('\u{d}',
'\u{d}', SC_CR), ('\u{20}', '\u{20}', SC_Sp), ('\u{21}', '\u{21}', SC_STerm), ('\u{22}',