diff options
Diffstat (limited to 'scripts/unicode.py')
-rwxr-xr-x | scripts/unicode.py | 64 |
1 files changed, 58 insertions, 6 deletions
diff --git a/scripts/unicode.py b/scripts/unicode.py index 4976d62..18cea99 100755 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -54,7 +54,7 @@ expanded_categories = { # these are the surrogate codepoints, which are not valid rust characters surrogate_codepoints = (0xd800, 0xdfff) -UNICODE_VERSION = (14, 0, 0) +UNICODE_VERSION = (15, 0, 0) UNICODE_VERSION_NUMBER = "%s.%s.%s" %UNICODE_VERSION @@ -274,13 +274,36 @@ def emit_break_module(f, break_table, break_cats, name): pub enum %sCat { """ % (name, Name, Name)) + # We don't want the lookup table to be too large so choose a reasonable + # cutoff. 0x20000 is selected because most of the range table entries are + # within the interval of [0x0, 0x20000] + lookup_value_cutoff = 0x20000 + + # Length of lookup table. It has to be a divisor of `lookup_value_cutoff`. + lookup_table_len = 0x400 + + lookup_interval = round(lookup_value_cutoff / lookup_table_len) + + # Lookup table is a mapping from `character code / lookup_interval` to + # the index in the range table that covers the `character code`. + lookup_table = [0] * lookup_table_len + j = 0 + for i in range(0, lookup_table_len): + lookup_from = i * lookup_interval + while j < len(break_table): + (_, entry_to, _) = break_table[j] + if entry_to >= lookup_from: + break + j += 1 + lookup_table[i] = j + break_cats.append("Any") break_cats.sort() for cat in break_cats: f.write((" %sC_" % Name[0]) + cat + ",\n") f.write(""" } - fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> (u32, u32, %sCat) { + fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)], default_lower: u32, default_upper: u32) -> (u32, u32, %sCat) { use core::cmp::Ordering::{Equal, Less, Greater}; match r.binary_search_by(|&(lo, hi, _)| { if lo <= c && c <= hi { Equal } @@ -293,8 +316,8 @@ def emit_break_module(f, break_table, break_cats, name): } Err(idx) => { ( - if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 }, - r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX), + if idx > 0 { r[idx-1].1 as u32 + 1 } else { default_lower }, + r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(default_upper), %sC_Any, ) } @@ -302,10 +325,39 @@ def emit_break_module(f, break_table, break_cats, name): } pub fn %s_category(c: char) -> (u32, u32, %sCat) { - bsearch_range_value_table(c, %s_cat_table) + // Perform a quick O(1) lookup in a precomputed table to determine + // the slice of the range table to search in. + let lookup_interval = 0x%x; + let idx = (c as u32 / lookup_interval) as usize; + let range = %s_cat_lookup.get(idx..(idx + 2)).map_or( + // If the `idx` is outside of the precomputed table - use the slice + // starting from the last covered index in the precomputed table and + // ending with the length of the range table. + %d..%d, + |r| (r[0] as usize)..((r[1] + 1) as usize) + ); + + // Compute pessimistic default lower and upper bounds on the category. + // If character doesn't map to any range and there is no adjacent range + // in the table slice - these bounds has to apply. + let lower = idx as u32 * lookup_interval; + let upper = lower + lookup_interval - 1; + bsearch_range_value_table(c, &%s_cat_table[range], lower, upper) } -""" % (Name, Name, Name[0], name, Name, name)) +""" % (Name, Name, Name[0], name, Name, lookup_interval, name, j, len(break_table), name)) + + + if len(break_table) <= 0xff: + lookup_type = "u8" + elif len(break_table) <= 0xffff: + lookup_type = "u16" + else: + lookup_type = "u32" + + emit_table(f, "%s_cat_lookup" % name, lookup_table, "&'static [%s]" % lookup_type, + pfun=lambda x: "%d" % x, + is_pub=False, is_const=True) emit_table(f, "%s_cat_table" % name, break_table, "&'static [(char, char, %sCat)]" % Name, pfun=lambda x: "(%s,%s,%sC_%s)" % (escape_char(x[0]), escape_char(x[1]), Name[0], x[2]), |