Upgrade unicode-segmentation to 1.10.1 am: af4da18ee6 am: a3413ee32f am: b53e9a1bb0 am: 3ab9517db7

Original change: https://android-review.googlesource.com/c/platform/external/rust/crates/unicode-segmentation/+/2441309

Change-Id: I546ebb11eb84ad2fabd457bba7bef5c4dea01f49
Signed-off-by: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>
author: Jeff Vander Stoep <jeffv@google.com> 2023-02-22 00:18:19 +0000
committer: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com> 2023-02-22 00:18:19 +0000
commit: fdafdc4b1f0a3e23f015a8af9631bf73ac5dd8eb (patch)
tree: fb9d5538736d1dc8e521d57d80211eb91f95f8cd
parent: 84d24275bdccd022929eea40bd56b6308e6114d0 (diff)
parent: 3ab9517db750d1a17f1d6b3f83dabeea9bd96e72 (diff)
download: unicode-segmentation-fdafdc4b1f0a3e23f015a8af9631bf73ac5dd8eb.tar.gz
10 files changed, 449 insertions, 89 deletions
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json
index 6df8781..57ba2a9 100644
--- a/.cargo_vcs_info.json
+++ b/.cargo_vcs_info.json
@@ -1,6 +1,6 @@
 {
   "git": {
-    "sha1": "6ee83831a3086d509b2a87158685000dab83be65"
+    "sha1": "d02ab6e5d14f009cde8c152ef19dcb55d0b594fc"
   },
   "path_in_vcs": ""
 }
 \ No newline at end of file
diff --git a/Android.bp b/Android.bp
index 1735760..858aba0 100644
--- a/Android.bp
+++ b/Android.bp
@@ -45,7 +45,7 @@ rust_library {
     host_supported: true,
     crate_name: "unicode_segmentation",
     cargo_env_compat: true,
-    cargo_pkg_version: "1.10.0",
+    cargo_pkg_version: "1.10.1",
     srcs: ["src/lib.rs"],
     edition: "2018",
     apex_available: [
diff --git a/Cargo.toml b/Cargo.toml
index 0da56c8..83dea28 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -12,7 +12,7 @@
 [package]
 edition = "2018"
 name = "unicode-segmentation"
-version = "1.10.0"
+version = "1.10.1"
 authors = [
     "kwantam <kwantam@gmail.com>",
     "Manish Goregaokar <manishsmail@gmail.com>",
diff --git a/Cargo.toml.orig b/Cargo.toml.orig
index ac21095..84ef138 100644
--- a/Cargo.toml.orig
+++ b/Cargo.toml.orig
@@ -1,7 +1,7 @@
 [package]
 
 name = "unicode-segmentation"
-version = "1.10.0"
+version = "1.10.1"
 authors = ["kwantam <kwantam@gmail.com>", "Manish Goregaokar <manishsmail@gmail.com>"]
 
 edition = "2018"
diff --git a/METADATA b/METADATA
index c54b937..860e2a9 100644
--- a/METADATA
+++ b/METADATA
@@ -11,13 +11,13 @@ third_party {
   }
   url {
     type: ARCHIVE
-    value: "https://static.crates.io/crates/unicode-segmentation/unicode-segmentation-1.10.0.crate"
+    value: "https://static.crates.io/crates/unicode-segmentation/unicode-segmentation-1.10.1.crate"
   }
-  version: "1.10.0"
+  version: "1.10.1"
   license_type: NOTICE
   last_upgrade_date {
-    year: 2022
-    month: 12
-    day: 19
+    year: 2023
+    month: 2
+    day: 17
   }
 }
diff --git a/README.md b/README.md
index 48d9a92..ef61ebd 100644
--- a/README.md
+++ b/README.md
@@ -38,7 +38,7 @@ to your `Cargo.toml`:
 
 ```toml
 [dependencies]
-unicode-segmentation = "1.9.0"
+unicode-segmentation = "1.10.1"
 ```
 
 # Change Log
diff --git a/benches/unicode_words.rs b/benches/unicode_words.rs
index c87851a..a7f8f41 100644
--- a/benches/unicode_words.rs
+++ b/benches/unicode_words.rs
@@ -1,55 +1,52 @@
-#[macro_use]
-extern crate bencher;
-extern crate unicode_segmentation;
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
 
-use bencher::Bencher;
 use std::fs;
 use unicode_segmentation::UnicodeSegmentation;
 
-fn unicode_words(bench: &mut Bencher, path: &str) {
+fn unicode_words(c: &mut Criterion, lang: &str, path: &str) {
     let text = fs::read_to_string(path).unwrap();
-    bench.iter(|| {
-        for w in text.unicode_words() {
-            bencher::black_box(w);
-        }
+    c.bench_function(&format!("unicode_words_{}", lang), |bench| {
+        bench.iter(|| {
+            for w in text.unicode_words() {
+                black_box(w);
+            }
+        })
     });
-
-    bench.bytes = text.len() as u64;
 }
 
-fn unicode_words_arabic(bench: &mut Bencher) {
-    unicode_words(bench, "benches/texts/arabic.txt");
+fn unicode_words_arabic(c: &mut Criterion) {
+    unicode_words(c, "arabic", "benches/texts/arabic.txt");
 }
 
-fn unicode_words_english(bench: &mut Bencher) {
-    unicode_words(bench, "benches/texts/english.txt");
+fn unicode_words_english(c: &mut Criterion) {
+    unicode_words(c, "english", "benches/texts/english.txt");
 }
 
-fn unicode_words_hindi(bench: &mut Bencher) {
-    unicode_words(bench, "benches/texts/hindi.txt");
+fn unicode_words_hindi(c: &mut Criterion) {
+    unicode_words(c, "hindi", "benches/texts/hindi.txt");
 }
 
-fn unicode_words_japanese(bench: &mut Bencher) {
-    unicode_words(bench, "benches/texts/japanese.txt");
+fn unicode_words_japanese(c: &mut Criterion) {
+    unicode_words(c, "japanese", "benches/texts/japanese.txt");
 }
 
-fn unicode_words_korean(bench: &mut Bencher) {
-    unicode_words(bench, "benches/texts/korean.txt");
+fn unicode_words_korean(c: &mut Criterion) {
+    unicode_words(c, "korean", "benches/texts/korean.txt");
 }
 
-fn unicode_words_mandarin(bench: &mut Bencher) {
-    unicode_words(bench, "benches/texts/mandarin.txt");
+fn unicode_words_mandarin(c: &mut Criterion) {
+    unicode_words(c, "mandarin", "benches/texts/mandarin.txt");
 }
 
-fn unicode_words_russian(bench: &mut Bencher) {
-    unicode_words(bench, "benches/texts/russian.txt");
+fn unicode_words_russian(c: &mut Criterion) {
+    unicode_words(c, "russian", "benches/texts/russian.txt");
 }
 
-fn unicode_words_source_code(bench: &mut Bencher) {
-    unicode_words(bench, "benches/texts/source_code.txt");
+fn unicode_words_source_code(c: &mut Criterion) {
+    unicode_words(c, "source_code", "benches/texts/source_code.txt");
 }
 
-benchmark_group!(
+criterion_group!(
     benches,
     unicode_words_arabic,
     unicode_words_english,
@@ -61,4 +58,4 @@ benchmark_group!(
     unicode_words_source_code,
 );
 
-benchmark_main!(benches);
+criterion_main!(benches);
diff --git a/benches/word_bounds.rs b/benches/word_bounds.rs
index 6b01ddb..cae7a88 100644
--- a/benches/word_bounds.rs
+++ b/benches/word_bounds.rs
@@ -1,55 +1,52 @@
-#[macro_use]
-extern crate bencher;
-extern crate unicode_segmentation;
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
 
-use bencher::Bencher;
 use std::fs;
 use unicode_segmentation::UnicodeSegmentation;
 
-fn word_bounds(bench: &mut Bencher, path: &str) {
+fn word_bounds(c: &mut Criterion, lang: &str, path: &str) {
     let text = fs::read_to_string(path).unwrap();
-    bench.iter(|| {
-        for w in text.split_word_bounds() {
-            bencher::black_box(w);
-        }
+    c.bench_function(&format!("word_bounds_{}", lang), |bench| {
+        bench.iter(|| {
+            for w in text.split_word_bounds() {
+                black_box(w);
+            }
+        });
     });
-
-    bench.bytes = text.len() as u64;
 }
 
-fn word_bounds_arabic(bench: &mut Bencher) {
-    word_bounds(bench, "benches/texts/arabic.txt");
+fn word_bounds_arabic(c: &mut Criterion) {
+    word_bounds(c, "arabic", "benches/texts/arabic.txt");
 }
 
-fn word_bounds_english(bench: &mut Bencher) {
-    word_bounds(bench, "benches/texts/english.txt");
+fn word_bounds_english(c: &mut Criterion) {
+    word_bounds(c, "english", "benches/texts/english.txt");
 }
 
-fn word_bounds_hindi(bench: &mut Bencher) {
-    word_bounds(bench, "benches/texts/hindi.txt");
+fn word_bounds_hindi(c: &mut Criterion) {
+    word_bounds(c, "hindi", "benches/texts/hindi.txt");
 }
 
-fn word_bounds_japanese(bench: &mut Bencher) {
-    word_bounds(bench, "benches/texts/japanese.txt");
+fn word_bounds_japanese(c: &mut Criterion) {
+    word_bounds(c, "japanese", "benches/texts/japanese.txt");
 }
 
-fn word_bounds_korean(bench: &mut Bencher) {
-    word_bounds(bench, "benches/texts/korean.txt");
+fn word_bounds_korean(c: &mut Criterion) {
+    word_bounds(c, "korean", "benches/texts/korean.txt");
 }
 
-fn word_bounds_mandarin(bench: &mut Bencher) {
-    word_bounds(bench, "benches/texts/mandarin.txt");
+fn word_bounds_mandarin(c: &mut Criterion) {
+    word_bounds(c, "mandarin", "benches/texts/mandarin.txt");
 }
 
-fn word_bounds_russian(bench: &mut Bencher) {
-    word_bounds(bench, "benches/texts/russian.txt");
+fn word_bounds_russian(c: &mut Criterion) {
+    word_bounds(c, "russian", "benches/texts/russian.txt");
 }
 
-fn word_bounds_source_code(bench: &mut Bencher) {
-    word_bounds(bench, "benches/texts/source_code.txt");
+fn word_bounds_source_code(c: &mut Criterion) {
+    word_bounds(c, "source_code", "benches/texts/source_code.txt");
 }
 
-benchmark_group!(
+criterion_group!(
     benches,
     word_bounds_arabic,
     word_bounds_english,
@@ -61,4 +58,4 @@ benchmark_group!(
     word_bounds_source_code,
 );
 
-benchmark_main!(benches);
+criterion_main!(benches);
diff --git a/scripts/unicode.py b/scripts/unicode.py
index 7aed85e..18cea99 100755
--- a/scripts/unicode.py
+++ b/scripts/unicode.py
@@ -274,13 +274,36 @@ def emit_break_module(f, break_table, break_cats, name):
     pub enum %sCat {
 """ % (name, Name, Name))
 
+    # We don't want the lookup table to be too large so choose a reasonable
+    # cutoff. 0x20000 is selected because most of the range table entries are
+    # within the interval of [0x0, 0x20000]
+    lookup_value_cutoff = 0x20000
+
+    # Length of lookup table. It has to be a divisor of `lookup_value_cutoff`.
+    lookup_table_len = 0x400
+
+    lookup_interval = round(lookup_value_cutoff / lookup_table_len)
+
+    # Lookup table is a mapping from `character code / lookup_interval` to
+    # the index in the range table that covers the `character code`.
+    lookup_table = [0] * lookup_table_len
+    j = 0
+    for i in range(0, lookup_table_len):
+      lookup_from = i * lookup_interval
+      while j < len(break_table):
+        (_, entry_to, _) = break_table[j]
+        if entry_to >= lookup_from:
+          break
+        j += 1
+      lookup_table[i] = j
+
     break_cats.append("Any")
     break_cats.sort()
     for cat in break_cats:
         f.write(("        %sC_" % Name[0]) + cat + ",\n")
     f.write("""    }
 
-    fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> (u32, u32, %sCat) {
+    fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)], default_lower: u32, default_upper: u32) -> (u32, u32, %sCat) {
         use core::cmp::Ordering::{Equal, Less, Greater};
         match r.binary_search_by(|&(lo, hi, _)| {
             if lo <= c && c <= hi { Equal }
@@ -293,8 +316,8 @@ def emit_break_module(f, break_table, break_cats, name):
             }
             Err(idx) => {
                 (
-                    if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
-                    r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
+                    if idx > 0 { r[idx-1].1 as u32 + 1 } else { default_lower },
+                    r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(default_upper),
                     %sC_Any,
                 )
             }
@@ -302,10 +325,39 @@ def emit_break_module(f, break_table, break_cats, name):
     }
 
     pub fn %s_category(c: char) -> (u32, u32, %sCat) {
-        bsearch_range_value_table(c, %s_cat_table)
+        // Perform a quick O(1) lookup in a precomputed table to determine
+        // the slice of the range table to search in.
+        let lookup_interval = 0x%x;
+        let idx = (c as u32 / lookup_interval) as usize;
+        let range = %s_cat_lookup.get(idx..(idx + 2)).map_or(
+          // If the `idx` is outside of the precomputed table - use the slice
+          // starting from the last covered index in the precomputed table and
+          // ending with the length of the range table.
+          %d..%d,
+          |r| (r[0] as usize)..((r[1] + 1) as usize)
+        );
+
+        // Compute pessimistic default lower and upper bounds on the category.
+        // If character doesn't map to any range and there is no adjacent range
+        // in the table slice - these bounds have to apply.
+        let lower = idx as u32 * lookup_interval;
+        let upper = lower + lookup_interval - 1;
+        bsearch_range_value_table(c, &%s_cat_table[range], lower, upper)
     }
 
-""" % (Name, Name, Name[0], name, Name, name))
+""" % (Name, Name, Name[0], name, Name, lookup_interval, name, j, len(break_table), name))
+
+
+    if len(break_table) <= 0xff:
+      lookup_type = "u8"
+    elif len(break_table) <= 0xffff:
+      lookup_type = "u16"
+    else:
+      lookup_type = "u32"
+
+    emit_table(f, "%s_cat_lookup" % name, lookup_table, "&'static [%s]" % lookup_type,
+        pfun=lambda x: "%d" % x,
+        is_pub=False, is_const=True)
 
     emit_table(f, "%s_cat_table" % name, break_table, "&'static [(char, char, %sCat)]" % Name,
         pfun=lambda x: "(%s,%s,%sC_%s)" % (escape_char(x[0]), escape_char(x[1]), Name[0], x[2]),
diff --git a/src/tables.rs b/src/tables.rs
index 5a811c9..ca83b50 100644
--- a/src/tables.rs
+++ b/src/tables.rs
@@ -365,7 +365,7 @@ pub mod grapheme {
         GC_ZWJ,
     }
 
-    fn bsearch_range_value_table(c: char, r: &'static [(char, char, GraphemeCat)]) -> (u32, u32, GraphemeCat) {
+    fn bsearch_range_value_table(c: char, r: &'static [(char, char, GraphemeCat)], default_lower: u32, default_upper: u32) -> (u32, u32, GraphemeCat) {
         use core::cmp::Ordering::{Equal, Less, Greater};
         match r.binary_search_by(|&(lo, hi, _)| {
             if lo <= c && c <= hi { Equal }
@@ -378,8 +378,8 @@ pub mod grapheme {
             }
             Err(idx) => {
                 (
-                    if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
-                    r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
+                    if idx > 0 { r[idx-1].1 as u32 + 1 } else { default_lower },
+                    r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(default_upper),
                     GC_Any,
                 )
             }
@@ -387,9 +387,93 @@ pub mod grapheme {
     }
 
     pub fn grapheme_category(c: char) -> (u32, u32, GraphemeCat) {
-        bsearch_range_value_table(c, grapheme_cat_table)
+        // Perform a quick O(1) lookup in a precomputed table to determine
+        // the slice of the range table to search in.
+        let lookup_interval = 0x80;
+        let idx = (c as u32 / lookup_interval) as usize;
+        let range = grapheme_cat_lookup.get(idx..(idx + 2)).map_or(
+          // If the `idx` is outside of the precomputed table - use the slice
+          // starting from the last covered index in the precomputed table and
+          // ending with the length of the range table.
+          1443..1449,
+          |r| (r[0] as usize)..((r[1] + 1) as usize)
+        );
+
+        // Compute pessimistic default lower and upper bounds on the category.
+        // If character doesn't map to any range and there is no adjacent range
+        // in the table slice - these bounds have to apply.
+        let lower = idx as u32 * lookup_interval;
+        let upper = lower + lookup_interval - 1;
+        bsearch_range_value_table(c, &grapheme_cat_table[range], lower, upper)
     }
 
+    const grapheme_cat_lookup: &'static [u16] = &[
+        0, 5, 9, 9, 9, 9, 9, 10, 10, 10, 11, 11, 16, 21, 26, 29, 32, 37, 41, 53, 65, 75, 86, 97,
+        106, 116, 131, 143, 153, 157, 161, 168, 173, 183, 188, 189, 191, 191, 191, 192, 192, 192,
+        192, 192, 192, 192, 192, 198, 206, 209, 211, 219, 219, 232, 233, 242, 258, 262, 270, 270,
+        271, 271, 271, 271, 271, 279, 280, 282, 284, 284, 284, 286, 290, 290, 291, 291, 295, 297,
+        298, 313, 317, 317, 317, 318, 318, 318, 318, 322, 322, 322, 323, 324, 325, 325, 325, 325,
+        325, 328, 329, 329, 329, 329, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331,
+        331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331,
+        331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331,
+        331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331,
+        331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331,
+        331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331,
+        331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331,
+        331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331,
+        331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331,
+        331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331,
+        331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331,
+        331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331,
+        331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331,
+        331, 331, 331, 333, 335, 335, 335, 342, 347, 351, 360, 369, 379, 379, 386, 395, 405, 413,
+        423, 431, 441, 450, 459, 469, 477, 487, 495, 505, 514, 523, 533, 541, 551, 559, 569, 578,
+        587, 597, 605, 615, 623, 633, 642, 651, 661, 669, 679, 687, 697, 706, 715, 725, 733, 743,
+        751, 761, 770, 779, 789, 797, 807, 815, 825, 834, 843, 853, 861, 871, 879, 889, 898, 907,
+        917, 925, 935, 943, 953, 962, 971, 981, 989, 999, 1007, 1017, 1026, 1035, 1045, 1053, 1063,
+        1071, 1081, 1090, 1099, 1109, 1117, 1127, 1135, 1145, 1154, 1163, 1173, 1181, 1186, 1186,
+        1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186,
+        1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186,
+        1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186,
+        1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186,
+        1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1187, 1187, 1187, 1187, 1187, 1187,
+        1189, 1190, 1190, 1192, 1192, 1192, 1192, 1193, 1193, 1194, 1195, 1195, 1195, 1195, 1195,
+        1195, 1195, 1195, 1195, 1195, 1195, 1195, 1195, 1195, 1200, 1201, 1201, 1201, 1201, 1201,
+        1202, 1202, 1202, 1204, 1205, 1206, 1212, 1221, 1227, 1236, 1244, 1247, 1260, 1260, 1267,
+        1278, 1278, 1286, 1292, 1299, 1303, 1303, 1307, 1307, 1318, 1324, 1333, 1337, 1337, 1337,
+        1342, 1349, 1355, 1361, 1361, 1363, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372,
+        1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372,
+        1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372,
+        1372, 1372, 1372, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375,
+        1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375,
+        1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375,
+        1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375,
+        1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375,
+        1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375,
+        1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375,
+        1375, 1375, 1375, 1375, 1375, 1375, 1375, 1376, 1377, 1377, 1377, 1377, 1377, 1377, 1377,
+        1377, 1378, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382,
+        1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382,
+        1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382,
+        1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382,
+        1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382,
+        1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382,
+        1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382,
+        1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382,
+        1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382,
+        1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382,
+        1382, 1382, 1382, 1382, 1382, 1382, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384,
+        1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384,
+        1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1386, 1386,
+        1386, 1386, 1392, 1395, 1396, 1396, 1396, 1396, 1396, 1396, 1396, 1396, 1396, 1396, 1396,
+        1396, 1396, 1396, 1396, 1396, 1399, 1402, 1402, 1402, 1402, 1402, 1402, 1402, 1402, 1402,
+        1402, 1402, 1407, 1408, 1409, 1409, 1409, 1411, 1411, 1411, 1411, 1412, 1412, 1412, 1412,
+        1412, 1412, 1412, 1412, 1413, 1414, 1414, 1414, 1414, 1414, 1414, 1414, 1414, 1414, 1414,
+        1414, 1414, 1414, 1414, 1414, 1415, 1419, 1423, 1428, 1428, 1428, 1430, 1430, 1430, 1431,
+        1431, 1432, 1433, 1434, 1435, 1438, 1440, 1442, 1442, 1442, 1443, 1443, 1443, 1443, 1443,
+        1443, 1443, 1443, 1443, 1443
+    ];
+
     const grapheme_cat_table: &'static [(char, char, GraphemeCat)] = &[
         ('\u{0}', '\u{9}', GC_Control), ('\u{a}', '\u{a}', GC_LF), ('\u{b}', '\u{c}', GC_Control),
         ('\u{d}', '\u{d}', GC_CR), ('\u{e}', '\u{1f}', GC_Control), ('\u{7f}', '\u{9f}',
@@ -1028,7 +1112,7 @@ pub mod word {
         WC_ZWJ,
     }
 
-    fn bsearch_range_value_table(c: char, r: &'static [(char, char, WordCat)]) -> (u32, u32, WordCat) {
+    fn bsearch_range_value_table(c: char, r: &'static [(char, char, WordCat)], default_lower: u32, default_upper: u32) -> (u32, u32, WordCat) {
         use core::cmp::Ordering::{Equal, Less, Greater};
         match r.binary_search_by(|&(lo, hi, _)| {
             if lo <= c && c <= hi { Equal }
@@ -1041,8 +1125,8 @@ pub mod word {
             }
             Err(idx) => {
                 (
-                    if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
-                    r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
+                    if idx > 0 { r[idx-1].1 as u32 + 1 } else { default_lower },
+                    r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(default_upper),
                     WC_Any,
                 )
             }
@@ -1050,9 +1134,87 @@ pub mod word {
     }
 
     pub fn word_category(c: char) -> (u32, u32, WordCat) {
-        bsearch_range_value_table(c, word_cat_table)
+        // Perform a quick O(1) lookup in a precomputed table to determine
+        // the slice of the range table to search in.
+        let lookup_interval = 0x80;
+        let idx = (c as u32 / lookup_interval) as usize;
+        let range = word_cat_lookup.get(idx..(idx + 2)).map_or(
+          // If the `idx` is outside of the precomputed table - use the slice
+          // starting from the last covered index in the precomputed table and
+          // ending with the length of the range table.
+          1050..1053,
+          |r| (r[0] as usize)..((r[1] + 1) as usize)
+        );
+
+        // Compute pessimistic default lower and upper bounds on the category.
+        // If character doesn't map to any range and there is no adjacent range
+        // in the table slice - these bounds have to apply.
+        let lower = idx as u32 * lookup_interval;
+        let upper = lower + lookup_interval - 1;
+        bsearch_range_value_table(c, &word_cat_table[range], lower, upper)
     }
 
+    const word_cat_lookup: &'static [u16] = &[
+        0, 14, 22, 22, 22, 22, 24, 30, 36, 36, 38, 43, 55, 66, 78, 83, 93, 104, 111, 121, 143, 162,
+        180, 198, 215, 231, 250, 266, 278, 282, 286, 295, 301, 308, 316, 316, 316, 321, 329, 333,
+        336, 336, 336, 336, 336, 338, 342, 351, 354, 359, 365, 369, 370, 375, 378, 384, 391, 397,
+        409, 409, 411, 411, 411, 420, 430, 449, 451, 464, 465, 465, 465, 465, 465, 465, 466, 466,
+        466, 466, 466, 466, 466, 466, 466, 466, 466, 466, 466, 466, 466, 466, 470, 476, 486, 487,
+        487, 487, 487, 492, 496, 497, 500, 500, 501, 502, 502, 502, 502, 502, 502, 502, 502, 502,
+        502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502,
+        502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502,
+        502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502,
+        502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502,
+        502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502,
+        502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502,
+        502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502,
+        502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502,
+        502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502,
+        502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502,
+        502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502,
+        502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502,
+        502, 502, 504, 504, 504, 511, 515, 515, 519, 529, 538, 544, 551, 559, 568, 574, 578, 578,
+        578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578,
+        578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578,
+        578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578,
+        578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578,
+        578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 581, 581, 581, 581,
+        581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581,
+        581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581,
+        581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581,
+        581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 592, 593, 593, 593, 594,
+        597, 609, 611, 620, 628, 634, 635, 636, 637, 637, 640, 644, 648, 648, 652, 655, 662, 662,
+        662, 665, 668, 675, 678, 680, 682, 692, 696, 699, 700, 701, 703, 706, 706, 706, 710, 714,
+        718, 726, 734, 744, 753, 759, 767, 785, 785, 791, 796, 796, 801, 805, 809, 811, 811, 813,
+        815, 828, 835, 844, 848, 848, 848, 854, 857, 869, 875, 875, 877, 885, 886, 886, 886, 886,
+        886, 886, 886, 886, 887, 888, 888, 889, 889, 889, 889, 889, 889, 889, 889, 889, 889, 889,
+        889, 889, 889, 889, 889, 889, 889, 889, 889, 889, 890, 890, 890, 890, 890, 890, 890, 890,
+        890, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895,
+        895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895,
+        895, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896,
+        896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896,
+        896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896,
+        896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896,
+        896, 899, 903, 908, 909, 909, 909, 909, 909, 910, 910, 913, 920, 920, 920, 920, 920, 920,
+        920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920,
+        920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920,
+        920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920,
+        920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920,
+        920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920,
+        920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920,
+        920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 923, 924, 924, 927,
+        927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927,
+        927, 927, 927, 929, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933,
+        933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933,
+        933, 933, 933, 933, 933, 935, 935, 935, 935, 938, 941, 942, 942, 942, 942, 943, 951, 960,
+        960, 960, 964, 968, 973, 973, 973, 973, 973, 976, 979, 979, 979, 979, 979, 979, 979, 979,
+        979, 981, 981, 987, 988, 993, 993, 993, 998, 998, 998, 998, 1001, 1001, 1001, 1001, 1001,
+        1001, 1005, 1005, 1007, 1011, 1011, 1011, 1011, 1011, 1011, 1011, 1011, 1011, 1011, 1039,
+        1044, 1044, 1044, 1044, 1044, 1046, 1048, 1048, 1048, 1048, 1049, 1049, 1049, 1049, 1049,
+        1049, 1049, 1049, 1049, 1049, 1049, 1049, 1049, 1049, 1049, 1049, 1050, 1050, 1050, 1050,
+        1050, 1050, 1050, 1050
+    ];
+
     const word_cat_table: &'static [(char, char, WordCat)] = &[
         ('\u{a}', '\u{a}', WC_LF), ('\u{b}', '\u{c}', WC_Newline), ('\u{d}', '\u{d}', WC_CR),
         ('\u{20}', '\u{20}', WC_WSegSpace), ('\u{22}', '\u{22}', WC_Double_Quote), ('\u{27}',
@@ -1530,7 +1692,7 @@ pub mod emoji {
         EC_Extended_Pictographic,
     }
 
-    fn bsearch_range_value_table(c: char, r: &'static [(char, char, EmojiCat)]) -> (u32, u32, EmojiCat) {
+    fn bsearch_range_value_table(c: char, r: &'static [(char, char, EmojiCat)], default_lower: u32, default_upper: u32) -> (u32, u32, EmojiCat) {
         use core::cmp::Ordering::{Equal, Less, Greater};
         match r.binary_search_by(|&(lo, hi, _)| {
             if lo <= c && c <= hi { Equal }
@@ -1543,8 +1705,8 @@ pub mod emoji {
             }
             Err(idx) => {
                 (
-                    if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
-                    r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
+                    if idx > 0 { r[idx-1].1 as u32 + 1 } else { default_lower },
+                    r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(default_upper),
                     EC_Any,
                 )
             }
@@ -1552,9 +1714,73 @@ pub mod emoji {
     }
 
     pub fn emoji_category(c: char) -> (u32, u32, EmojiCat) {
-        bsearch_range_value_table(c, emoji_cat_table)
+        // Perform a quick O(1) lookup in a precomputed table to determine
+        // the slice of the range table to search in.
+        let lookup_interval = 0x80;
+        let idx = (c as u32 / lookup_interval) as usize;
+        let range = emoji_cat_lookup.get(idx..(idx + 2)).map_or(
+          // If the `idx` is outside of the precomputed table - use the slice
+          // starting from the last covered index in the precomputed table and
+          // ending with the length of the range table.
+          77..78,
+          |r| (r[0] as usize)..((r[1] + 1) as usize)
+        );
+
+        // Compute pessimistic default lower and upper bounds on the category.
+        // If the character doesn't map to any range and there is no adjacent
+        // range in the table slice, these bounds have to apply.
+        let lower = idx as u32 * lookup_interval;
+        let upper = lower + lookup_interval - 1;
+        bsearch_range_value_table(c, &emoji_cat_table[range], lower, upper)
     }
 
+    const emoji_cat_lookup: &'static [u8] = &[
+        0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+        2, 2, 2, 4, 4, 6, 8, 8, 8, 10, 14, 14, 15, 15, 19, 21, 22, 37, 41, 41, 41, 42, 42, 42, 42,
+        46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 48, 48, 48, 48, 48, 50, 50, 50, 50, 50, 50, 50, 50,
+        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 51, 55, 58, 63, 63, 63, 64, 64, 64, 65, 65, 66, 67,
+        68, 69, 72, 74, 76, 76, 76, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77
+    ];
+
     const emoji_cat_table: &'static [(char, char, EmojiCat)] = &[
         ('\u{a9}', '\u{a9}', EC_Extended_Pictographic), ('\u{ae}', '\u{ae}',
         EC_Extended_Pictographic), ('\u{203c}', '\u{203c}', EC_Extended_Pictographic), ('\u{2049}',
@@ -1633,7 +1859,7 @@ pub mod sentence {
         SC_Upper,
     }
 
-    fn bsearch_range_value_table(c: char, r: &'static [(char, char, SentenceCat)]) -> (u32, u32, SentenceCat) {
+    fn bsearch_range_value_table(c: char, r: &'static [(char, char, SentenceCat)], default_lower: u32, default_upper: u32) -> (u32, u32, SentenceCat) {
         use core::cmp::Ordering::{Equal, Less, Greater};
         match r.binary_search_by(|&(lo, hi, _)| {
             if lo <= c && c <= hi { Equal }
@@ -1646,8 +1872,8 @@ pub mod sentence {
             }
             Err(idx) => {
                 (
-                    if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
-                    r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
+                    if idx > 0 { r[idx-1].1 as u32 + 1 } else { default_lower },
+                    r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(default_upper),
                     SC_Any,
                 )
             }
@@ -1655,9 +1881,97 @@ pub mod sentence {
     }
 
     pub fn sentence_category(c: char) -> (u32, u32, SentenceCat) {
-        bsearch_range_value_table(c, sentence_cat_table)
+        // Perform a quick O(1) lookup in a precomputed table to determine
+        // the slice of the range table to search in.
+        let lookup_interval = 0x80;
+        let idx = (c as u32 / lookup_interval) as usize;
+        let range = sentence_cat_lookup.get(idx..(idx + 2)).map_or(
+          // If the `idx` is outside of the precomputed table - use the slice
+          // starting from the last covered index in the precomputed table and
+          // ending with the length of the range table.
+          2410..2421,
+          |r| (r[0] as usize)..((r[1] + 1) as usize)
+        );
+
+        // Compute pessimistic default lower and upper bounds on the category.
+        // If the character doesn't map to any range and there is no adjacent
+        // range in the table slice, these bounds have to apply.
+        let lower = idx as u32 * lookup_interval;
+        let upper = lower + lookup_interval - 1;
+        bsearch_range_value_table(c, &sentence_cat_table[range], lower, upper)
     }
 
+    const sentence_cat_lookup: &'static [u16] = &[
+        0, 19, 31, 154, 247, 314, 323, 333, 375, 409, 528, 579, 588, 599, 612, 618, 629, 643, 650,
+        661, 683, 702, 720, 738, 755, 771, 790, 806, 818, 825, 840, 850, 856, 871, 882, 882, 882,
+        887, 895, 901, 904, 904, 904, 904, 904, 907, 912, 922, 928, 937, 943, 950, 953, 959, 964,
+        973, 980, 988, 1000, 1000, 1002, 1130, 1249, 1267, 1288, 1308, 1311, 1336, 1340, 1340, 1340,
+        1342, 1342, 1342, 1344, 1344, 1344, 1344, 1344, 1346, 1348, 1348, 1348, 1348, 1351, 1351,
+        1351, 1351, 1351, 1369, 1476, 1482, 1492, 1501, 1501, 1501, 1501, 1512, 1517, 1518, 1521,
+        1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521,
+        1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521,
+        1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521,
+        1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1522, 1522, 1522, 1522, 1522,
+        1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522,
+        1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522,
+        1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522,
+        1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522,
+        1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522,
+        1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522,
+        1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522,
+        1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522,
+        1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522,
+        1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522,
+        1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522,
+        1522, 1522, 1522, 1522, 1525, 1525, 1525, 1580, 1613, 1696, 1769, 1780, 1790, 1797, 1808,
+        1819, 1836, 1843, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849,
+        1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849,
+        1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849,
+        1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849,
+        1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849,
+        1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849,
+        1849, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852,
+        1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852,
+        1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852,
+        1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852,
+        1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1853, 1854, 1864, 1865, 1865,
+        1865, 1867, 1870, 1886, 1888, 1905, 1913, 1919, 1920, 1921, 1922, 1922, 1925, 1929, 1933,
+        1935, 1939, 1942, 1949, 1949, 1949, 1952, 1957, 1964, 1967, 1969, 1971, 1982, 1986, 1989,
+        1990, 1991, 1993, 1996, 1996, 1996, 2000, 2005, 2010, 2019, 2028, 2039, 2051, 2059, 2068,
+        2086, 2086, 2093, 2098, 2098, 2105, 2110, 2114, 2119, 2119, 2121, 2124, 2139, 2146, 2156,
+        2161, 2161, 2161, 2168, 2171, 2183, 2189, 2189, 2192, 2201, 2202, 2202, 2202, 2202, 2202,
+        2202, 2202, 2202, 2203, 2204, 2204, 2205, 2205, 2205, 2205, 2205, 2205, 2205, 2205, 2205,
+        2205, 2205, 2205, 2205, 2205, 2205, 2205, 2205, 2205, 2205, 2205, 2205, 2206, 2206, 2206,
+        2206, 2206, 2206, 2206, 2206, 2206, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211,
+        2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211,
+        2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2212, 2212, 2212,
+        2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212,
+        2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212,
+        2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212,
+        2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212,
+        2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2216, 2221, 2228, 2229, 2229, 2229,
+        2229, 2229, 2231, 2232, 2235, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242,
+        2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242,
+        2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242,
+        2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2243, 2243, 2243, 2243, 2243, 2243, 2243,
+        2243, 2243, 2243, 2244, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245,
+        2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245,
+        2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245,
+        2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245,
+        2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2248, 2248,
+        2248, 2253, 2253, 2253, 2254, 2254, 2254, 2254, 2254, 2254, 2254, 2254, 2254, 2254, 2254,
+        2254, 2254, 2254, 2254, 2254, 2254, 2254, 2254, 2256, 2261, 2261, 2261, 2261, 2261, 2261,
+        2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261,
+        2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261,
+        2261, 2263, 2263, 2263, 2263, 2266, 2269, 2270, 2270, 2270, 2270, 2275, 2288, 2300, 2305,
+        2310, 2316, 2322, 2330, 2330, 2330, 2330, 2330, 2333, 2337, 2337, 2337, 2337, 2337, 2337,
+        2337, 2337, 2337, 2341, 2341, 2347, 2348, 2353, 2353, 2353, 2358, 2358, 2358, 2358, 2361,
+        2361, 2361, 2361, 2361, 2361, 2365, 2365, 2367, 2372, 2372, 2372, 2372, 2372, 2372, 2372,
+        2372, 2372, 2372, 2400, 2405, 2405, 2405, 2405, 2405, 2407, 2408, 2408, 2408, 2408, 2408,
+        2408, 2408, 2408, 2408, 2409, 2409, 2409, 2409, 2409, 2409, 2409, 2409, 2409, 2409, 2409,
+        2410, 2410, 2410, 2410, 2410, 2410, 2410, 2410
+    ];
+
     const sentence_cat_table: &'static [(char, char, SentenceCat)] = &[
         ('\u{9}', '\u{9}', SC_Sp), ('\u{a}', '\u{a}', SC_LF), ('\u{b}', '\u{c}', SC_Sp), ('\u{d}',
         '\u{d}', SC_CR), ('\u{20}', '\u{20}', SC_Sp), ('\u{21}', '\u{21}', SC_STerm), ('\u{22}',
author	Jeff Vander Stoep <jeffv@google.com>	2023-02-22 00:18:19 +0000
committer	Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>	2023-02-22 00:18:19 +0000
commit	fdafdc4b1f0a3e23f015a8af9631bf73ac5dd8eb (patch)
tree	fb9d5538736d1dc8e521d57d80211eb91f95f8cd
parent	84d24275bdccd022929eea40bd56b6308e6114d0 (diff)
parent	3ab9517db750d1a17f1d6b3f83dabeea9bd96e72 (diff)
download	unicode-segmentation-fdafdc4b1f0a3e23f015a8af9631bf73ac5dd8eb.tar.gz