aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore3
-rw-r--r--.travis.yml24
-rw-r--r--Cargo.toml12
-rw-r--r--README.md31
-rwxr-xr-xscripts/unicode.py130
-rw-r--r--src/lib.rs2
6 files changed, 51 insertions, 151 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..5cdcdba
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+target
+Cargo.lock
+scripts/tmp
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..296ac17
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,24 @@
+language: rust
+sudo: false
+script:
+ - cargo build --verbose --features no_std
+ - cargo test --verbose --features no_std
+ - cargo clean
+ - cargo build --verbose --features default
+ - cargo test --verbose --features default
+ - cargo bench --verbose --features default
+ - rustdoc --test README.md -L target/debug -L target/debug/deps
+ - cargo doc
+after_success: |
+ [ $TRAVIS_BRANCH = master ] &&
+ [ $TRAVIS_PULL_REQUEST = false ] &&
+ echo '<meta http-equiv=refresh content=0;url=unicode_xid/index.html>' > target/doc/index.html &&
+ pip install ghp-import --user $USER &&
+ $HOME/.local/bin/ghp-import -n target/doc &&
+ git push -qf https://${TOKEN}@github.com/${TRAVIS_REPO_SLUG}.git gh-pages
+env:
+ global:
+ secure: gTlge+/OQlVkV0R+RThWXeN0aknmS7iUTPBMYKJyRdLz7T2vubw3w80a2CVE87JlpV87A5cVGD+LgR+AhYrhKtvqHb1brMDd99gylBBi2DfV7YapDSwSCuFgVR+FjZfJRcXBtI8po5urUZ84V0WLzRX8SyWqWgoD3oCkSL3Wp3w=
+notifications:
+ email:
+ on_success: never
diff --git a/Cargo.toml b/Cargo.toml
index ae01d15..40a4787 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,8 +1,10 @@
[package]
name = "unicode-xid"
-version = "0.1.0"
-authors = ["erick.tryzelaar <erick.tryzelaar@gmail.com>"]
+version = "0.0.1"
+authors = ["erick.tryzelaar <erick.tryzelaar@gmail.com>",
+ "kwantam <kwantam@gmail.com>",
+ ]
homepage = "https://github.com/unicode-rs/unicode-xid"
repository = "https://github.com/unicode-rs/unicode-xid"
@@ -11,9 +13,9 @@ license = "MIT/Apache-2.0"
keywords = ["text", "unicode", "xid"]
readme = "README.md"
description = """
-Determine
-Determine displayed width of `char` and `str` types
-according to Unicode Standard Annex #11 rules.
+Determine whether characters have the XID_Start
+or XID_Continue properties according to
+Unicode Standard Annex #31.
"""
exclude = [ "target/*", "Cargo.lock" ]
diff --git a/README.md b/README.md
index 5c3acbe..66a57da 100644
--- a/README.md
+++ b/README.md
@@ -1,39 +1,34 @@
-# unicode-derived-property
+# unicode-xid
-Determine displayed width of `char` and `str` types according to
-[Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
-rules.
+Determine whether a `char` may start or continue an identifier (e.g. in a parser and/or lexer) according to
+[Unicode Standard Annex #31](http://www.unicode.org/reports/tr31/) rules.
-[![Build Status](https://travis-ci.org/unicode-rs/unicode-derived-property.svg)](https://travis-ci.org/unicode-rs/unicode-derived-property)
+[![Build Status](https://travis-ci.org/unicode-rs/unicode-xid.svg)](https://travis-ci.org/unicode-rs/unicode-xid)
-[Documentation](https://unicode-rs.github.io/unicode-width/unicode_width/index.html)
+[Documentation](https://unicode-rs.github.io/unicode-xid/unicode_xid/index.html)
```rust
-extern crate unicode_width;
+extern crate unicode_xid;
-use unicode_width::UnicodeWidthStr;
+use unicode_xid::UnicodeXID;
fn main() {
- let teststr = "Hello, world!";
- let width = UnicodeWidthStr::width(teststr);
- println!("{}", teststr);
- println!("The above string is {} columns wide.", width);
- let width = teststr.width_cjk();
- println!("The above string is {} columns wide (CJK).", width);
+ let ch = 'a';
+ println!("Is {} a valid start of an identifier? {}", ch, UnicodeXID::is_xid_start(ch));
}
```
-## features
+# features
-unicode-width supports a `no_std` feature. This eliminates dependence
+unicode-xid supports a `no_std` feature. This eliminates dependence
on std, and instead uses equivalent functions from core.
-## crates.io
+# crates.io
You can use this package in your project by adding the following
to your `Cargo.toml`:
```toml
[dependencies]
-unicode-derived-property = "0.1.1"
+unicode-xid = "0.0.1"
```
diff --git a/scripts/unicode.py b/scripts/unicode.py
index 6098c33..a9d58d8 100755
--- a/scripts/unicode.py
+++ b/scripts/unicode.py
@@ -13,12 +13,11 @@
# This script uses the following Unicode tables:
# - DerivedCoreProperties.txt
# - ReadMe.txt
-# - UnicodeData.txt
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the unicode.rs file into git.
-import fileinput, re, os, sys, operator
+import fileinput, re, os, sys
preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
@@ -35,23 +34,6 @@ preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRI
#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
'''
-# Mapping taken from Table 12 from:
-# http://www.unicode.org/reports/tr44/#General_Category_Values
-expanded_categories = {
- 'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
- 'Lm': ['L'], 'Lo': ['L'],
- 'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
- 'Nd': ['N'], 'Nl': ['N'], 'No': ['No'],
- 'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
- 'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
- 'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
- 'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
- 'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
-}
-
-# these are the surrogate codepoints, which are not valid rust characters
-surrogate_codepoints = (0xd800, 0xdfff)
-
def fetch(f):
if not os.path.exists(os.path.basename(f)):
os.system("curl -O http://www.unicode.org/Public/UNIDATA/%s"
@@ -61,92 +43,6 @@ def fetch(f):
sys.stderr.write("cannot load %s" % f)
exit(1)
-def is_surrogate(n):
- return surrogate_codepoints[0] <= n <= surrogate_codepoints[1]
-
-def load_unicode_data(f):
- fetch(f)
- gencats = {}
- upperlower = {}
- lowerupper = {}
- combines = {}
- canon_decomp = {}
- compat_decomp = {}
-
- udict = {};
- range_start = -1;
- for line in fileinput.input(f):
- data = line.split(';');
- if len(data) != 15:
- continue
- cp = int(data[0], 16);
- if is_surrogate(cp):
- continue
- if range_start >= 0:
- for i in xrange(range_start, cp):
- udict[i] = data;
- range_start = -1;
- if data[1].endswith(", First>"):
- range_start = cp;
- continue;
- udict[cp] = data;
-
- for code in udict:
- [code_org, name, gencat, combine, bidi,
- decomp, deci, digit, num, mirror,
- old, iso, upcase, lowcase, titlecase ] = udict[code];
-
- # generate char to char direct common and simple conversions
- # uppercase to lowercase
- if gencat == "Lu" and lowcase != "" and code_org != lowcase:
- upperlower[code] = int(lowcase, 16)
-
- # lowercase to uppercase
- if gencat == "Ll" and upcase != "" and code_org != upcase:
- lowerupper[code] = int(upcase, 16)
-
- # store decomposition, if given
- if decomp != "":
- if decomp.startswith('<'):
- seq = []
- for i in decomp.split()[1:]:
- seq.append(int(i, 16))
- compat_decomp[code] = seq
- else:
- seq = []
- for i in decomp.split():
- seq.append(int(i, 16))
- canon_decomp[code] = seq
-
- # place letter in categories as appropriate
- for cat in [gencat, "Assigned"] + expanded_categories.get(gencat, []):
- if cat not in gencats:
- gencats[cat] = []
- gencats[cat].append(code)
-
- # record combining class, if any
- if combine != "0":
- if combine not in combines:
- combines[combine] = []
- combines[combine].append(code)
-
- # generate Not_Assigned from Assigned
- gencats["Cn"] = gen_unassigned(gencats["Assigned"])
- # Assigned is not a real category
- del(gencats["Assigned"])
- # Other contains Not_Assigned
- gencats["C"].extend(gencats["Cn"])
- gencats = group_cats(gencats)
- combines = to_combines(group_cats(combines))
-
- return (canon_decomp, compat_decomp, gencats, combines, lowerupper, upperlower)
-
-def group_cats(cats):
- cats_out = {}
- for cat in cats:
- cats_out[cat] = group_cat(cats[cat])
- return cats_out
-
def group_cat(cat):
cat_out = []
letters = sorted(set(cat))
@@ -171,19 +67,6 @@ def ungroup_cat(cat):
lo += 1
return cat_out
-def gen_unassigned(assigned):
- assigned = set(assigned)
- return ([i for i in range(0, 0xd800) if i not in assigned] +
- [i for i in range(0xe000, 0x110000) if i not in assigned])
-
-def to_combines(combs):
- combs_out = []
- for comb in combs:
- for (lo, hi) in combs[comb]:
- combs_out.append((lo, hi, comb))
- combs_out.sort(key=lambda comb: comb[0])
- return combs_out
-
def format_table_content(f, content, indent):
line = " "*indent
first = True
@@ -304,15 +187,8 @@ if __name__ == "__main__":
/// that this version of unicode-derived-property is based on.
pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
""" % unicode_version)
- (canon_decomp, compat_decomp, gencats, combines,
- lowerupper, upperlower) = load_unicode_data("UnicodeData.txt")
- want_derived = ["XID_Start", "XID_Continue"]
- derived = load_properties("DerivedCoreProperties.txt", want_derived)
- props = load_properties("PropList.txt",
- ["White_Space", "Join_Control", "Noncharacter_Code_Point"])
-
- # bsearch_range_table is used in all the property modules below
emit_bsearch_range_table(rf)
- # category tables
+ want_derived = ["XID_Start", "XID_Continue"]
+ derived = load_properties("DerivedCoreProperties.txt", want_derived)
emit_property_module(rf, "derived_property", derived, want_derived)
diff --git a/src/lib.rs b/src/lib.rs
index 46f3466..c952bd1 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -34,7 +34,7 @@
//!
//! ```toml
//! [dependencies]
-//! unicode-derived_property = "0.1.1"
+//! unicode-xid = "0.0.1"
//! ```
#![deny(missing_docs, unsafe_code)]