1 files changed, 58 insertions, 6 deletions
diff --git a/scripts/unicode.py b/scripts/unicode.py
index 4976d62..18cea99 100755
--- a/scripts/unicode.py
+++ b/scripts/unicode.py
@@ -54,7 +54,7 @@ expanded_categories = {
 # these are the surrogate codepoints, which are not valid rust characters
 surrogate_codepoints = (0xd800, 0xdfff)
 
-UNICODE_VERSION = (14, 0, 0)
+UNICODE_VERSION = (15, 0, 0)
 
 UNICODE_VERSION_NUMBER = "%s.%s.%s" %UNICODE_VERSION
 
@@ -274,13 +274,36 @@ def emit_break_module(f, break_table, break_cats, name):
     pub enum %sCat {
 """ % (name, Name, Name))
 
+    # We don't want the lookup table to be too large so choose a reasonable
+    # cutoff. 0x20000 is selected because most of the range table entries are
+    # within the interval of [0x0, 0x20000]
+    lookup_value_cutoff = 0x20000
+
+    # Length of lookup table. It has to be a divisor of `lookup_value_cutoff`.
+    lookup_table_len = 0x400
+
+    lookup_interval = round(lookup_value_cutoff / lookup_table_len)
+
+    # Lookup table is a mapping from `character code / lookup_interval` to
+    # the index in the range table that covers the `character code`.
+    lookup_table = [0] * lookup_table_len
+    j = 0
+    for i in range(0, lookup_table_len):
+      lookup_from = i * lookup_interval
+      while j < len(break_table):
+        (_, entry_to, _) = break_table[j]
+        if entry_to >= lookup_from:
+          break
+        j += 1
+      lookup_table[i] = j
+
     break_cats.append("Any")
     break_cats.sort()
     for cat in break_cats:
         f.write(("        %sC_" % Name[0]) + cat + ",\n")
     f.write("""    }
 
-    fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> (u32, u32, %sCat) {
+    fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)], default_lower: u32, default_upper: u32) -> (u32, u32, %sCat) {
         use core::cmp::Ordering::{Equal, Less, Greater};
         match r.binary_search_by(|&(lo, hi, _)| {
             if lo <= c && c <= hi { Equal }
@@ -293,8 +316,8 @@ def emit_break_module(f, break_table, break_cats, name):
             }
             Err(idx) => {
                 (
-                    if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
-                    r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
+                    if idx > 0 { r[idx-1].1 as u32 + 1 } else { default_lower },
+                    r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(default_upper),
                     %sC_Any,
                 )
             }
@@ -302,10 +325,39 @@ def emit_break_module(f, break_table, break_cats, name):
     }
 
     pub fn %s_category(c: char) -> (u32, u32, %sCat) {
-        bsearch_range_value_table(c, %s_cat_table)
+        // Perform a quick O(1) lookup in a precomputed table to determine
+        // the slice of the range table to search in.
+        let lookup_interval = 0x%x;
+        let idx = (c as u32 / lookup_interval) as usize;
+        let range = %s_cat_lookup.get(idx..(idx + 2)).map_or(
+          // If the `idx` is outside of the precomputed table - use the slice
+          // starting from the last covered index in the precomputed table and
+          // ending with the length of the range table.
+          %d..%d,
+          |r| (r[0] as usize)..((r[1] + 1) as usize)
+        );
+
+        // Compute pessimistic default lower and upper bounds on the category.
+        // If character doesn't map to any range and there is no adjacent range
+        // in the table slice - these bounds has to apply.
+        let lower = idx as u32 * lookup_interval;
+        let upper = lower + lookup_interval - 1;
+        bsearch_range_value_table(c, &%s_cat_table[range], lower, upper)
     }
 
-""" % (Name, Name, Name[0], name, Name, name))
+""" % (Name, Name, Name[0], name, Name, lookup_interval, name, j, len(break_table), name))
+
+
+    if len(break_table) <= 0xff:
+      lookup_type = "u8"
+    elif len(break_table) <= 0xffff:
+      lookup_type = "u16"
+    else:
+      lookup_type = "u32"
+
+    emit_table(f, "%s_cat_lookup" % name, lookup_table, "&'static [%s]" % lookup_type,
+        pfun=lambda x: "%d" % x,
+        is_pub=False, is_const=True)
 
     emit_table(f, "%s_cat_table" % name, break_table, "&'static [(char, char, %sCat)]" % Name,
         pfun=lambda x: "(%s,%s,%sC_%s)" % (escape_char(x[0]), escape_char(x[1]), Name[0], x[2]),