diff options
-rw-r--r-- | .gitignore | 3 | ||||
-rw-r--r-- | .travis.yml | 24 | ||||
-rw-r--r-- | Cargo.toml | 12 | ||||
-rw-r--r-- | README.md | 31 | ||||
-rwxr-xr-x | scripts/unicode.py | 130 | ||||
-rw-r--r-- | src/lib.rs | 2 |
6 files changed, 51 insertions, 151 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5cdcdba --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +target +Cargo.lock +scripts/tmp diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..296ac17 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,24 @@ +language: rust +sudo: false +script: + - cargo build --verbose --features no_std + - cargo test --verbose --features no_std + - cargo clean + - cargo build --verbose --features default + - cargo test --verbose --features default + - cargo bench --verbose --features default + - rustdoc --test README.md -L target/debug -L target/debug/deps + - cargo doc +after_success: | + [ $TRAVIS_BRANCH = master ] && + [ $TRAVIS_PULL_REQUEST = false ] && + echo '<meta http-equiv=refresh content=0;url=unicode_xid/index.html>' > target/doc/index.html && + pip install ghp-import --user $USER && + $HOME/.local/bin/ghp-import -n target/doc && + git push -qf https://${TOKEN}@github.com/${TRAVIS_REPO_SLUG}.git gh-pages +env: + global: + secure: gTlge+/OQlVkV0R+RThWXeN0aknmS7iUTPBMYKJyRdLz7T2vubw3w80a2CVE87JlpV87A5cVGD+LgR+AhYrhKtvqHb1brMDd99gylBBi2DfV7YapDSwSCuFgVR+FjZfJRcXBtI8po5urUZ84V0WLzRX8SyWqWgoD3oCkSL3Wp3w= +notifications: + email: + on_success: never @@ -1,8 +1,10 @@ [package] name = "unicode-xid" -version = "0.1.0" -authors = ["erick.tryzelaar <erick.tryzelaar@gmail.com>"] +version = "0.0.1" +authors = ["erick.tryzelaar <erick.tryzelaar@gmail.com>", + "kwantam <kwantam@gmail.com>", + ] homepage = "https://github.com/unicode-rs/unicode-xid" repository = "https://github.com/unicode-rs/unicode-xid" @@ -11,9 +13,9 @@ license = "MIT/Apache-2.0" keywords = ["text", "unicode", "xid"] readme = "README.md" description = """ -Determine -Determine displayed width of `char` and `str` types -according to Unicode Standard Annex #11 rules. +Determine whether characters have the XID_Start +or XID_Continue properties according to +Unicode Standard Annex #31. """ exclude = [ "target/*", "Cargo.lock" ] @@ -1,39 +1,34 @@ -# unicode-derived-property +# unicode-xid -Determine displayed width of `char` and `str` types according to -[Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/) -rules. +Determine if a `char` is a valid identifier for a parser and/or lexer according to +[Unicode Standard Annex #31](http://www.unicode.org/reports/tr31/) rules. -[![Build Status](https://travis-ci.org/unicode-rs/unicode-derived-property.svg)](https://travis-ci.org/unicode-rs/unicode-derived-property) +[![Build Status](https://travis-ci.org/unicode-rs/unicode-xid.svg)](https://travis-ci.org/unicode-rs/unicode-xid) -[Documentation](https://unicode-rs.github.io/unicode-width/unicode_width/index.html) +[Documentation](https://unicode-rs.github.io/unicode-xid/unicode_xid/index.html) ```rust -extern crate unicode_width; +extern crate unicode_xid; -use unicode_width::UnicodeWidthStr; +use unicode_xid::UnicodeXID; fn main() { - let teststr = "Hello, world!"; - let width = UnicodeWidthStr::width(teststr); - println!("{}", teststr); - println!("The above string is {} columns wide.", width); - let width = teststr.width_cjk(); - println!("The above string is {} columns wide (CJK).", width); + let ch = 'a'; + println!("Is {} a valid start of an identifier? {}", ch, UnicodeXID::is_xid_start(ch)); } ``` -## features +# features -unicode-width supports a `no_std` feature. This eliminates dependence +unicode-xid supports a `no_std` feature. This eliminates dependence on std, and instead uses equivalent functions from core. -## crates.io +# crates.io You can use this package in your project by adding the following to your `Cargo.toml`: ```toml [dependencies] -unicode-derived-property = "0.1.1" +unicode-derived_property = "0.0.1" ``` diff --git a/scripts/unicode.py b/scripts/unicode.py index 6098c33..a9d58d8 100755 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -13,12 +13,11 @@ # This script uses the following Unicode tables: # - DerivedCoreProperties.txt # - ReadMe.txt -# - UnicodeData.txt # # Since this should not require frequent updates, we just store this # out-of-line and check the unicode.rs file into git. -import fileinput, re, os, sys, operator +import fileinput, re, os, sys preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT // file at the top-level directory of this distribution and at @@ -35,23 +34,6 @@ preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRI #![allow(missing_docs, non_upper_case_globals, non_snake_case)] ''' -# Mapping taken from Table 12 from: -# http://www.unicode.org/reports/tr44/#General_Category_Values -expanded_categories = { - 'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'], - 'Lm': ['L'], 'Lo': ['L'], - 'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'], - 'Nd': ['N'], 'Nl': ['N'], 'No': ['No'], - 'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'], - 'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'], - 'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'], - 'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'], - 'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'], -} - -# these are the surrogate codepoints, which are not valid rust characters -surrogate_codepoints = (0xd800, 0xdfff) - def fetch(f): if not os.path.exists(os.path.basename(f)): os.system("curl -O http://www.unicode.org/Public/UNIDATA/%s" @@ -61,92 +43,6 @@ def fetch(f): sys.stderr.write("cannot load %s" % f) exit(1) -def is_surrogate(n): - return surrogate_codepoints[0] <= n <= surrogate_codepoints[1] - -def load_unicode_data(f): - fetch(f) - gencats = {} - upperlower = {} - lowerupper = {} - combines = {} - canon_decomp = {} - compat_decomp = {} - - udict = {}; - range_start = -1; - for line in fileinput.input(f): - data = line.split(';'); - if len(data) != 15: - continue - cp = int(data[0], 16); - if is_surrogate(cp): - continue - if range_start >= 0: - for i in xrange(range_start, cp): - udict[i] = data; - range_start = -1; - if data[1].endswith(", First>"): - range_start = cp; - continue; - udict[cp] = data; - - for code in udict: - [code_org, name, gencat, combine, bidi, - decomp, deci, digit, num, mirror, - old, iso, upcase, lowcase, titlecase ] = udict[code]; - - # generate char to char direct common and simple conversions - # uppercase to lowercase - if gencat == "Lu" and lowcase != "" and code_org != lowcase: - upperlower[code] = int(lowcase, 16) - - # lowercase to uppercase - if gencat == "Ll" and upcase != "" and code_org != upcase: - lowerupper[code] = int(upcase, 16) - - # store decomposition, if given - if decomp != "": - if decomp.startswith('<'): - seq = [] - for i in decomp.split()[1:]: - seq.append(int(i, 16)) - compat_decomp[code] = seq - else: - seq = [] - for i in decomp.split(): - seq.append(int(i, 16)) - canon_decomp[code] = seq - - # place letter in categories as appropriate - for cat in [gencat, "Assigned"] + expanded_categories.get(gencat, []): - if cat not in gencats: - gencats[cat] = [] - gencats[cat].append(code) - - # record combining class, if any - if combine != "0": - if combine not in combines: - combines[combine] = [] - combines[combine].append(code) - - # generate Not_Assigned from Assigned - gencats["Cn"] = gen_unassigned(gencats["Assigned"]) - # Assigned is not a real category - del(gencats["Assigned"]) - # Other contains Not_Assigned - gencats["C"].extend(gencats["Cn"]) - gencats = group_cats(gencats) - combines = to_combines(group_cats(combines)) - - return (canon_decomp, compat_decomp, gencats, combines, lowerupper, upperlower) - -def group_cats(cats): - cats_out = {} - for cat in cats: - cats_out[cat] = group_cat(cats[cat]) - return cats_out - def group_cat(cat): cat_out = [] letters = sorted(set(cat)) @@ -171,19 +67,6 @@ def ungroup_cat(cat): lo += 1 return cat_out -def gen_unassigned(assigned): - assigned = set(assigned) - return ([i for i in range(0, 0xd800) if i not in assigned] + - [i for i in range(0xe000, 0x110000) if i not in assigned]) - -def to_combines(combs): - combs_out = [] - for comb in combs: - for (lo, hi) in combs[comb]: - combs_out.append((lo, hi, comb)) - combs_out.sort(key=lambda comb: comb[0]) - return combs_out - def format_table_content(f, content, indent): line = " "*indent first = True @@ -304,15 +187,8 @@ if __name__ == "__main__": /// that this version of unicode-derived-property is based on. pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s); """ % unicode_version) - (canon_decomp, compat_decomp, gencats, combines, - lowerupper, upperlower) = load_unicode_data("UnicodeData.txt") - want_derived = ["XID_Start", "XID_Continue"] - derived = load_properties("DerivedCoreProperties.txt", want_derived) - props = load_properties("PropList.txt", - ["White_Space", "Join_Control", "Noncharacter_Code_Point"]) - - # bsearch_range_table is used in all the property modules below emit_bsearch_range_table(rf) - # category tables + want_derived = ["XID_Start", "XID_Continue"] + derived = load_properties("DerivedCoreProperties.txt", want_derived) emit_property_module(rf, "derived_property", derived, want_derived) @@ -34,7 +34,7 @@ //! //! ```toml //! [dependencies] -//! unicode-derived_property = "0.1.1" +//! unicode-derived_property = "0.0.1" //! ``` #![deny(missing_docs, unsafe_code)] |