diff options
author | Ivan Lozano <ivanlozano@google.com> | 2021-08-20 22:27:47 +0000 |
---|---|---|
committer | Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com> | 2021-08-20 22:27:47 +0000 |
commit | 9a771a31e3b075cae3bae31d5fc4ba0b6e114667 (patch) | |
tree | 90d87dcb0faf6c6e2805cab833d9f173ac4ed786 | |
parent | d22939409e8930f2426b1b6fe55634859e66104b (diff) | |
parent | 82f44c6e928cff3577ebea54be9ab227c4b64cbd (diff) | |
download | cesu8-9a771a31e3b075cae3bae31d5fc4ba0b6e114667.tar.gz |
Initial commit for cesu8 crate. am: a7e4bc004f am: e0d2931950 am: 82f44c6e92
Original change: https://android-review.googlesource.com/c/platform/external/rust/crates/cesu8/+/1804098
Change-Id: I8ba7a929e5a57c4540aaf37569e99bfe53fa2c77
-rw-r--r-- | COPYRIGHT-RUST.txt | 422 | ||||
-rw-r--r-- | Cargo.toml | 17 | ||||
l--------- | LICENSE | 1 | ||||
-rw-r--r-- | METADATA | 19 | ||||
-rw-r--r-- | MODULE_LICENSE_APACHE2 | 0 | ||||
-rw-r--r-- | OWNERS | 1 | ||||
-rw-r--r-- | README.md | 33 | ||||
-rw-r--r-- | src/lib.rs | 453 | ||||
-rw-r--r-- | src/unicode.rs | 37 |
9 files changed, 983 insertions, 0 deletions
diff --git a/COPYRIGHT-RUST.txt b/COPYRIGHT-RUST.txt new file mode 100644 index 0000000..52039ea --- /dev/null +++ b/COPYRIGHT-RUST.txt @@ -0,0 +1,422 @@ +Short version for non-lawyers: + +The Rust Project is dual-licensed under Apache 2.0 and MIT +terms. + + +Longer version: + +The Rust Project is copyright 2014, The Rust Project +Developers (given in the file AUTHORS.txt). + +Licensed under the Apache License, Version 2.0 +<LICENSE-APACHE or +http://www.apache.org/licenses/LICENSE-2.0> or the MIT +license <LICENSE-MIT or http://opensource.org/licenses/MIT>, +at your option. All files in the project carrying such +notice may not be copied, modified, or distributed except +according to those terms. + + +The Rust Project includes packages written by third parties. +The following third party packages are included, and carry +their own copyright notices and license terms: + +* Two header files that are part of the Valgrind + package. These files are found at src/rt/vg/valgrind.h and + src/rt/vg/memcheck.h, within this distribution. These files + are redistributed under the following terms, as noted in + them: + + for src/rt/vg/valgrind.h: + + This file is part of Valgrind, a dynamic binary + instrumentation framework. + + Copyright (C) 2000-2010 Julian Seward. All rights + reserved. + + Redistribution and use in source and binary forms, with + or without modification, are permitted provided that the + following conditions are met: + + 1. Redistributions of source code must retain the above + copyright notice, this list of conditions and the + following disclaimer. + + 2. The origin of this software must not be + misrepresented; you must not claim that you wrote the + original software. If you use this software in a + product, an acknowledgment in the product + documentation would be appreciated but is not + required. + + 3. Altered source versions must be plainly marked as + such, and must not be misrepresented as being the + original software. + + 4. The name of the author may not be used to endorse or + promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN + NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY + OF SUCH DAMAGE. + + for src/rt/vg/memcheck.h: + + This file is part of MemCheck, a heavyweight Valgrind + tool for detecting memory errors. + + Copyright (C) 2000-2010 Julian Seward. All rights + reserved. + + Redistribution and use in source and binary forms, with + or without modification, are permitted provided that the + following conditions are met: + + 1. Redistributions of source code must retain the above + copyright notice, this list of conditions and the + following disclaimer. + + 2. The origin of this software must not be + misrepresented; you must not claim that you wrote the + original software. If you use this software in a + product, an acknowledgment in the product + documentation would be appreciated but is not + required. + + 3. Altered source versions must be plainly marked as + such, and must not be misrepresented as being the + original software. + + 4. The name of the author may not be used to endorse or + promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN + NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY + OF SUCH DAMAGE. + +* The auxiliary file src/etc/pkg/modpath.iss contains a + library routine compiled, by Inno Setup, into the Windows + installer binary. This file is licensed under the LGPL, + version 3, but, in our legal interpretation, this does not + affect the aggregate "collected work" license of the Rust + distribution (MIT/ASL2) nor any other components of it. We + believe that the terms governing distribution of the + binary Windows installer built from modpath.iss are + therefore LGPL, but not the terms governing distribution + of any of the files installed by such an installer (such + as the Rust compiler or runtime libraries themselves). + +* The src/rt/miniz.c file, carrying an implementation of + RFC1950/RFC1951 DEFLATE, by Rich Geldreich + <richgel99@gmail.com>. All uses of this file are + permitted by the embedded "unlicense" notice + (effectively: public domain with warranty disclaimer). + +* LLVM. Code for this package is found in src/llvm. + + Copyright (c) 2003-2013 University of Illinois at + Urbana-Champaign. All rights reserved. + + Developed by: + + LLVM Team + + University of Illinois at Urbana-Champaign + + http://llvm.org + + Permission is hereby granted, free of charge, to any + person obtaining a copy of this software and associated + documentation files (the "Software"), to deal with the + Software without restriction, including without + limitation the rights to use, copy, modify, merge, + publish, distribute, sublicense, and/or sell copies of + the Software, and to permit persons to whom the Software + is furnished to do so, subject to the following + conditions: + + * Redistributions of source code must retain the + above copyright notice, this list of conditions + and the following disclaimers. + + * Redistributions in binary form must reproduce the + above copyright notice, this list of conditions + and the following disclaimers in the documentation + and/or other materials provided with the + distribution. + + * Neither the names of the LLVM Team, University of + Illinois at Urbana-Champaign, nor the names of its + contributors may be used to endorse or promote + products derived from this Software without + specific prior written permission. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF + ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED + TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A + PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT + SHALL THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE + FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT + OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + OTHER DEALINGS WITH THE SOFTWARE. + +* Additional libraries included in LLVM carry separate + BSD-compatible licenses. See src/llvm/LICENSE.txt for + details. + +* compiler-rt, in src/compiler-rt is dual licensed under + LLVM's license and MIT: + + Copyright (c) 2009-2014 by the contributors listed in + CREDITS.TXT + + All rights reserved. + + Developed by: + + LLVM Team + + University of Illinois at Urbana-Champaign + + http://llvm.org + + Permission is hereby granted, free of charge, to any + person obtaining a copy of this software and associated + documentation files (the "Software"), to deal with the + Software without restriction, including without + limitation the rights to use, copy, modify, merge, + publish, distribute, sublicense, and/or sell copies of + the Software, and to permit persons to whom the Software + is furnished to do so, subject to the following + conditions: + + * Redistributions of source code must retain the + above copyright notice, this list of conditions + and the following disclaimers. + + * Redistributions in binary form must reproduce the + above copyright notice, this list of conditions + and the following disclaimers in the documentation + and/or other materials provided with the + distribution. + + * Neither the names of the LLVM Team, University of + Illinois at Urbana-Champaign, nor the names of its + contributors may be used to endorse or promote + products derived from this Software without + specific prior written permission. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF + ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED + TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A + PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT + SHALL THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE + FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT + OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + OTHER DEALINGS WITH THE SOFTWARE. + + ======================================================== + + Copyright (c) 2009-2014 by the contributors listed in + CREDITS.TXT + + Permission is hereby granted, free of charge, to any + person obtaining a copy of this software and associated + documentation files (the "Software"), to deal in the + Software without restriction, including without + limitation the rights to use, copy, modify, merge, + publish, distribute, sublicense, and/or sell copies of + the Software, and to permit persons to whom the Software + is furnished to do so, subject to the following + conditions: + + The above copyright notice and this permission notice + shall be included in all copies or substantial portions + of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF + ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED + TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A + PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT + SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR + IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + DEALINGS IN THE SOFTWARE. + +* Portions of the FFI code for interacting with the native ABI + is derived from the Clay programming language, which carries + the following license. + + Copyright (C) 2008-2010 Tachyon Technologies. + All rights reserved. + + Redistribution and use in source and binary forms, with + or without modification, are permitted provided that the + following conditions are met: + + 1. Redistributions of source code must retain the above + copyright notice, this list of conditions and the + following disclaimer. + + 2. Redistributions in binary form must reproduce the + above copyright notice, this list of conditions and + the following disclaimer in the documentation and/or + other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + DEVELOPERS AND CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY + OF SUCH DAMAGE. + +* Hoedown, the markdown parser, under src/rt/hoedown, is + licensed as follows. + + Copyright (c) 2008, Natacha Porté + Copyright (c) 2011, Vicent Martí + Copyright (c) 2013, Devin Torres and the Hoedown authors + + Permission to use, copy, modify, and distribute this + software for any purpose with or without fee is hereby + granted, provided that the above copyright notice and + this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR + DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE + INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR + ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA + OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +* libbacktrace, under src/libbacktrace: + + Copyright (C) 2012-2014 Free Software Foundation, Inc. + Written by Ian Lance Taylor, Google. + + Redistribution and use in source and binary forms, with + or without modification, are permitted provided that the + following conditions are met: + + (1) Redistributions of source code must retain the + above copyright notice, this list of conditions and + the following disclaimer. + + (2) Redistributions in binary form must reproduce + the above copyright notice, this list of conditions + and the following disclaimer in the documentation + and/or other materials provided with the + distribution. + + (3) The name of the author may not be used to + endorse or promote products derived from this + software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN + NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY + OF SUCH DAMAGE. */ + +* jemalloc, under src/jemalloc: + + Copyright (C) 2002-2014 Jason Evans + <jasone@canonware.com>. All rights reserved. + Copyright (C) 2007-2012 Mozilla Foundation. + All rights reserved. + Copyright (C) 2009-2014 Facebook, Inc. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + 1. Redistributions of source code must retain the above copyright notice(s), + this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright notice(s), + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, + INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) + BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY + OF SUCH DAMAGE. + +* Additional copyright may be retained by contributors other + than Mozilla, the Rust Project Developers, or the parties + enumerated in this file. Such copyright can be determined + on a case-by-case basis by examining the author of each + portion of a file in the revision-control commit records + of the project, or by consulting representative comments + claiming copyright ownership for a file. + + For example, the text: + + "Copyright (c) 2011 Google Inc." + + appears in some files, and these files thereby denote + that their author and copyright-holder is Google Inc. + + In all such cases, the absence of explicit licensing text + indicates that the contributor chose to license their work + for distribution under identical terms to those Mozilla + has chosen for the collective work, enumerated at the top + of this file. The only difference is the retention of + copyright itself, held by the contributor. diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..0d8bd9a --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,17 @@ +[package] + +name = "cesu8" +version = "1.1.0" +authors = ["Eric Kidd <git@randomhacks.net>"] + +description = "Convert to and from CESU-8 encoding (similar to UTF-8)" +license = "Apache-2.0/MIT" +readme = "README.md" + +repository = "https://github.com/emk/cesu8-rs" +documentation = "http://emk.github.io/cesu8-rs/cesu8/index.html" + +[features] +# Allow access to unstable features when being built with a nightly compiler, +# to keep travis-cargo happy and enable access to benchmarks if we want them. +unstable = [] @@ -0,0 +1 @@ +COPYRIGHT-RUST.txt
\ No newline at end of file diff --git a/METADATA b/METADATA new file mode 100644 index 0000000..ae7ceba --- /dev/null +++ b/METADATA @@ -0,0 +1,19 @@ +name: "cesu8" +description: "Convert to and from CESU-8 encoding (similar to UTF-8)" +third_party { + url { + type: HOMEPAGE + value: "https://crates.io/crates/cesu8" + } + url { + type: ARCHIVE + value: "https://static.crates.io/crates/cesu8/cesu8-1.1.0.crate" + } + version: "1.1.0" + license_type: NOTICE + last_upgrade_date { + year: 2021 + month: 7 + day: 30 + } +} diff --git a/MODULE_LICENSE_APACHE2 b/MODULE_LICENSE_APACHE2 new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/MODULE_LICENSE_APACHE2 @@ -0,0 +1 @@ +include platform/prebuilts/rust:master:/OWNERS diff --git a/README.md b/README.md new file mode 100644 index 0000000..161f865 --- /dev/null +++ b/README.md @@ -0,0 +1,33 @@ +# CESU-8 encoder/decoder for Rust + +[![Build Status](https://travis-ci.org/emk/cesu8-rs.svg)](https://travis-ci.org/emk/cesu8-rs) [![Latest version](https://img.shields.io/crates/v/cesu8.svg)](https://crates.io/crates/cesu8) [![License](https://img.shields.io/crates/l/cesu8.svg)](https://crates.io/crates/cesu8) + +[Documentation][apidoc]. + +[apidoc]: http://emk.github.io/cesu8-rs/cesu8/index.html + +Convert between ordinary UTF-8 and [CESU-8][] encodings. + +CESU-8 encodes characters outside the Basic Multilingual Plane as two +UTF-16 surrogate chacaters, which are then further re-encoded as invalid, +3-byte UTF-8 characters. This means that 4-byte UTF-8 sequences become +6-byte CESU-8 sequences. + +**Note that CESU-8 is only intended for internal use within tightly-coupled +systems, and not for data interchange.** + +This encoding is sometimes needed when working with Java, Oracle or MySQL, +and when trying to store emoji, hieroglyphs, or other characters on the +Supplementary Multilingual Plane or the Supplementary Ideographic Plane. + +[CESU-8]: http://www.unicode.org/reports/tr26/tr26-2.html + +## License + +Some of this code is adapted from Rust's [`src/libcore/str.rs` file][str.rs]. +This code is covered by LICENSE-RUST.txt and copyright by The Rust Project +Developers and individual Rust contributors, as described in that file. + +The new code in this project is distributed under the same terms. + +[str.rs]: https://github.com/rust-lang/rust/blob/master/src/libcore/str.rs diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..7fac5ff --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,453 @@ +// Copyright 2012-2014 The Rust Project Developers and Eric Kidd. See the +// COPYRIGHT-RUST.txt file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed except +// according to those terms. + + +//! A simple library implementing the [CESU-8 compatibility encoding +//! scheme](http://www.unicode.org/reports/tr26/tr26-2.html). This is a +//! non-standard variant of UTF-8 that is used internally by some systems +//! that need to represent UTF-16 data as 8-bit characters. Yes, this is +//! ugly. +//! +//! Use of this encoding is discouraged by the Unicode Consortium. It's OK +//! for working with existing internal APIs, but it should not be used for +//! transmitting or storing data. +//! +//! ``` +//! use std::borrow::Cow; +//! use cesu8::{from_cesu8, to_cesu8}; +//! +//! // 16-bit Unicode characters are the same in UTF-8 and CESU-8. +//! assert_eq!(Cow::Borrowed("aé日".as_bytes()), +//! to_cesu8("aé日")); +//! assert_eq!(Cow::Borrowed("aé日"), +//! from_cesu8("aé日".as_bytes()).unwrap()); +//! +//! // This string is CESU-8 data containing a 6-byte surrogate pair, +//! // which decodes to a 4-byte UTF-8 string. +//! let data = &[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81]; +//! assert_eq!(Cow::Borrowed("\u{10401}"), +//! from_cesu8(data).unwrap()); +//! ``` +//! +//! ### A note about security +//! +//! As a general rule, this library is intended to fail on malformed or +//! unexpected input. CESU-8 is supposed to be an internal-only format, +//! and if we're seeing malformed data, we assume that it's either a bug in +//! somebody's code, or an attacker is trying to improperly encode data to +//! evade security checks. +//! +//! If you have a use case for lossy conversion to UTF-8, or conversion +//! from mixed UTF-8/CESU-8 data, please feel free to submit a pull request +//! for `from_cesu8_lossy_permissive` with appropriate behavior. +//! +//! ### Java and U+0000, and other variants +//! +//! Java uses the CESU-8 encoding as described above, but with one +//! difference: The null character U+0000 is represented as an overlong +//! UTF-8 sequence `C0 80`. This is supported by the `from_java_cesu8` and +//! `to_java_cesu8` methods. +//! +//! ### Surrogate pairs and UTF-8 +//! +//! The UTF-16 encoding uses "surrogate pairs" to represent Unicode code +//! points in the range from U+10000 to U+10FFFF. These are 16-bit numbers +//! in the range 0xD800 to 0xDFFF. +//! +//! * 0xD800 to 0xDBFF: First half of surrogate pair. When encoded as +//! CESU-8, these become **1110**1101 **10**100000 **10**000000 to +//! **1110**1101 **10**101111 **10**111111. +//! +//! * 0xDC00 to 0xDFFF: Second half of surrogate pair. These become +//! **1110**1101 **10**110000 **10**000000 to +//! **1110**1101 **10**111111 **10**111111. +//! +//! Wikipedia [explains](http://en.wikipedia.org/wiki/UTF-16) the +//! code point to UTF-16 conversion process: +//! +//! > Consider the encoding of U+10437 (𐐷): +//! > +//! > * Subtract 0x10000 from 0x10437. The result is 0x00437, 0000 0000 0100 +//! > 0011 0111. +//! > * Split this into the high 10-bit value and the low 10-bit value: +//! > 0000000001 and 0000110111. +//! > * Add 0xD800 to the high value to form the high surrogate: 0xD800 + +//! > 0x0001 = 0xD801. +//! > * Add 0xDC00 to the low value to form the low surrogate: 0xDC00 + +//! > 0x0037 = 0xDC37. + +#![warn(missing_docs)] + + +use std::borrow::Cow; +use std::error::Error; +use std::fmt; +use std::result::Result; +use std::slice; +use std::str::{from_utf8, from_utf8_unchecked}; +use unicode::utf8_char_width; + +mod unicode; + +/// Mask of the value bits of a continuation byte. +const CONT_MASK: u8 = 0b0011_1111u8; +/// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte. +const TAG_CONT_U8: u8 = 0b1000_0000u8; + +/// The CESU-8 data could not be decoded as valid UTF-8 data. +#[derive(Clone, Copy, Debug)] +pub struct Cesu8DecodingError; + +impl Error for Cesu8DecodingError { + fn description(&self) -> &str { "decoding error" } + fn cause(&self) -> Option<&Error> { None } +} + +impl fmt::Display for Cesu8DecodingError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "could not convert CESU-8 data to UTF-8") + } +} + +/// Which variant of the encoding are we working with? +#[derive(PartialEq, Eq)] +enum Variant { + /// Regular CESU-8, with '\0' represented by itself. + Standard, + /// This is technically Java's "Modified UTF-8", which is supposedly + /// like CESU-8, except that it UTF-8 encodes the '\0' byte. I'm sure + /// it seemed like a good idea at the time. + Java, +} + +/// Convert CESU-8 data to a Rust string, re-encoding only if necessary. +/// Returns an error if the data cannot be represented as valid UTF-8. +/// +/// ``` +/// use std::borrow::Cow; +/// use cesu8::from_cesu8; +/// +/// // This string is valid as UTF-8 or CESU-8, so it doesn't change, +/// // and we can convert it without allocating memory. +/// assert_eq!(Cow::Borrowed("aé日"), +/// from_cesu8("aé日".as_bytes()).unwrap()); +/// +/// // This string is CESU-8 data containing a 6-byte surrogate pair, +/// // which becomes a 4-byte UTF-8 string. +/// let data = &[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81]; +/// assert_eq!(Cow::Borrowed("\u{10401}"), +/// from_cesu8(data).unwrap()); +/// ``` +pub fn from_cesu8(bytes: &[u8]) -> Result<Cow<str>, Cesu8DecodingError> { + from_cesu8_internal(bytes, Variant::Standard) +} + +/// Convert Java's modified UTF-8 data to a Rust string, re-encoding only if +/// necessary. Returns an error if the data cannot be represented as valid +/// UTF-8. +/// +/// ``` +/// use std::borrow::Cow; +/// use cesu8::from_java_cesu8; +/// +/// // This string is valid as UTF-8 or modified UTF-8, so it doesn't change, +/// // and we can convert it without allocating memory. +/// assert_eq!(Cow::Borrowed("aé日"), +/// from_java_cesu8("aé日".as_bytes()).unwrap()); +/// +/// // This string is modified UTF-8 data containing a 6-byte surrogate pair, +/// // which becomes a 4-byte UTF-8 string. +/// let data = &[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81]; +/// assert_eq!(Cow::Borrowed("\u{10401}"), +/// from_java_cesu8(data).unwrap()); +/// +/// // This string is modified UTF-8 data containing null code-points. +/// let data = &[0xC0, 0x80, 0xC0, 0x80]; +/// assert_eq!(Cow::Borrowed("\0\0"), +/// from_java_cesu8(data).unwrap()); +/// ``` +pub fn from_java_cesu8(bytes: &[u8]) -> Result<Cow<str>, Cesu8DecodingError> { + from_cesu8_internal(bytes, Variant::Java) +} + +/// Do the actual work of decoding. +fn from_cesu8_internal(bytes: &[u8], variant: Variant) -> + Result<Cow<str>, Cesu8DecodingError> +{ + match from_utf8(bytes) { + Ok(str) => Ok(Cow::Borrowed(str)), + _ => { + let mut decoded = Vec::with_capacity(bytes.len()); + if decode_from_iter(&mut decoded, &mut bytes.iter(), variant) { + // Keep this assertion in debug mode only. It's important + // that this assertion is true, because Rust assumes that + // all UTF-8 strings are valid. + debug_assert!(from_utf8(&decoded[..]).is_ok()); + Ok(Cow::Owned(unsafe { String::from_utf8_unchecked(decoded) })) + } else { + Err(Cesu8DecodingError) + } + } + } +} + +#[test] +fn test_from_cesu8() { + // The surrogate-encoded character below is from the ICU library's + // icu/source/test/testdata/conversion.txt test case. + let data = &[0x4D, 0xE6, 0x97, 0xA5, 0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81, 0x7F]; + assert_eq!(Cow::Borrowed("M日\u{10401}\u{7F}"), + from_cesu8(data).unwrap()); + + // We used to have test data from the CESU-8 specification, but when we + // worked it through manually, we got the wrong answer: + // + // Input: [0xED, 0xAE, 0x80, 0xED, 0xB0, 0x80] + // Binary: 11101101 10101110 10000000 11101101 10110000 10000000 + // + // 0b1101_101110_000000 -> 0xDB80 + // 0b1101_110000_000000 -> 0xDC00 + // + // ((0xDB80 - 0xD800) << 10) | (0xDC00 - 0xDC00) -> 0xE0000 + // 0x10000 + 0xE0000 -> 0xF0000 + // + // The spec claims that we are supposed to get 0x10000, not 0xF0000. + // Since I can't reconcile this example data with the text of the + // specification, I decided to use a test character from ICU instead. +} + +// Our internal decoder, based on Rust's is_utf8 implementation. +fn decode_from_iter( + decoded: &mut Vec<u8>, iter: &mut slice::Iter<u8>, variant: Variant) + -> bool +{ + macro_rules! err { + () => { return false } + } + macro_rules! next { + () => { + match iter.next() { + Some(a) => *a, + // We needed data, but there was none: error! + None => err!() + } + } + } + macro_rules! next_cont { + () => { + { + let byte = next!(); + if (byte) & !CONT_MASK == TAG_CONT_U8 { byte } else { err!() } + } + } + } + + loop { + let first = match iter.next() { + Some(&b) => b, + // We're at the end of the iterator and a codepoint boundary at + // the same time, so this string is valid. + None => return true + }; + + if variant == Variant::Java && first == 0 { + // Java's modified UTF-8 should never contain \0 directly. + err!(); + } else if first < 128 { + // Pass ASCII through directly. + decoded.push(first); + } else if first == 0xc0 && variant == Variant::Java { + match next!() { + 0x80 => decoded.push(0), + _ => err!(), + } + } else { + let w = utf8_char_width(first); + let second = next_cont!(); + match w { + // Two-byte sequences can be used directly. + 2 => { decoded.extend([first, second].iter().cloned()); } + 3 => { + let third = next_cont!(); + match (first, second) { + // These are valid UTF-8, so pass them through. + (0xE0 , 0xA0 ... 0xBF) | + (0xE1 ... 0xEC, 0x80 ... 0xBF) | + (0xED , 0x80 ... 0x9F) | + (0xEE ... 0xEF, 0x80 ... 0xBF) => { + decoded.extend([first, second, third].iter() + .cloned()) + } + // First half a surrogate pair, so decode. + (0xED , 0xA0 ... 0xAF) => { + if next!() != 0xED { err!() } + let fifth = next_cont!(); + if fifth < 0xB0 || 0xBF < fifth { err!() } + let sixth = next_cont!(); + let s = dec_surrogates(second, third, fifth, sixth); + decoded.extend(s.iter().cloned()); + } + _ => err!() + } + } + _ => err!() + } + } + } +} + +/// Convert the two trailing bytes from a CESU-8 surrogate to a regular +/// surrogate value. +fn dec_surrogate(second: u8, third: u8) -> u32 { + 0xD000u32 | ((second & CONT_MASK) as u32) << 6 | (third & CONT_MASK) as u32 +} + +/// Convert the bytes from a CESU-8 surrogate pair into a valid UTF-8 +/// sequence. Assumes input is valid. +fn dec_surrogates(second: u8, third: u8, fifth: u8, sixth: u8) -> [u8; 4] { + // Convert to a 32-bit code point. + let s1 = dec_surrogate(second, third); + let s2 = dec_surrogate(fifth, sixth); + let c = 0x10000 + (((s1 - 0xD800) << 10) | (s2 - 0xDC00)); + //println!("{:0>8b} {:0>8b} {:0>8b} -> {:0>16b}", 0xEDu8, second, third, s1); + //println!("{:0>8b} {:0>8b} {:0>8b} -> {:0>16b}", 0xEDu8, fifth, sixth, s2); + //println!("-> {:0>32b}", c); + assert!(0x010000 <= c && c <= 0x10FFFF); + + // Convert to UTF-8. + // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + [0b1111_0000u8 | ((c & 0b1_1100_0000_0000_0000_0000) >> 18) as u8, + TAG_CONT_U8 | ((c & 0b0_0011_1111_0000_0000_0000) >> 12) as u8, + TAG_CONT_U8 | ((c & 0b0_0000_0000_1111_1100_0000) >> 6) as u8, + TAG_CONT_U8 | ((c & 0b0_0000_0000_0000_0011_1111) ) as u8] +} + +/// Convert a Rust `&str` to CESU-8 bytes. +/// +/// ``` +/// use std::borrow::Cow; +/// use cesu8::to_cesu8; +/// +/// // This string is valid as UTF-8 or CESU-8, so it doesn't change, +/// // and we can convert it without allocating memory. +/// assert_eq!(Cow::Borrowed("aé日".as_bytes()), to_cesu8("aé日")); +/// +/// // This string is a 4-byte UTF-8 string, which becomes a 6-byte CESU-8 +/// // vector. +/// assert_eq!(Cow::Borrowed(&[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81]), +/// to_cesu8("\u{10401}")); +/// ``` +pub fn to_cesu8(text: &str) -> Cow<[u8]> { + if is_valid_cesu8(text) { + Cow::Borrowed(text.as_bytes()) + } else { + Cow::Owned(to_cesu8_internal(text, Variant::Standard)) + } +} + +/// Convert a Rust `&str` to Java's modified UTF-8 bytes. +/// +/// ``` +/// use std::borrow::Cow; +/// use cesu8::to_java_cesu8; +/// +/// // This string is valid as UTF-8 or CESU-8, so it doesn't change, +/// // and we can convert it without allocating memory. +/// assert_eq!(Cow::Borrowed("aé日".as_bytes()), to_java_cesu8("aé日")); +/// +/// // This string is a 4-byte UTF-8 string, which becomes a 6-byte modified +/// // UTF-8 vector. +/// assert_eq!(Cow::Borrowed(&[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81]), +/// to_java_cesu8("\u{10401}")); +/// +/// // This string contains null, which becomes 2-byte modified UTF-8 encoding +/// assert_eq!(Cow::Borrowed(&[0xC0, 0x80, 0xC0, 0x80]), +/// to_java_cesu8("\0\0")); +/// ``` +pub fn to_java_cesu8(text: &str) -> Cow<[u8]> { + if is_valid_java_cesu8(text) { + Cow::Borrowed(text.as_bytes()) + } else { + Cow::Owned(to_cesu8_internal(text, Variant::Java)) + } +} + +fn to_cesu8_internal(text: &str, variant: Variant) -> Vec<u8> { + let bytes = text.as_bytes(); + let mut encoded = Vec::with_capacity(bytes.len() + bytes.len() >> 2); + let mut i = 0; + while i < bytes.len() { + let b = bytes[i]; + if variant == Variant::Java && b == 0 { + encoded.push(0xc0); + encoded.push(0x80); + i += 1; + } else if b < 128 { + // Pass ASCII through quickly. + encoded.push(b); + i += 1; + } else { + // Figure out how many bytes we need for this character. + let w = utf8_char_width(b); + assert!(w <= 4); + assert!(i + w <= bytes.len()); + if w != 4 { + // Pass through short UTF-8 sequences unmodified. + encoded.extend(bytes[i..i+w].iter().cloned()); + } else { + // Encode 4-byte sequences as 6 bytes. + let s = unsafe { from_utf8_unchecked(&bytes[i..i+w]) }; + let c = s.chars().next().unwrap() as u32 - 0x10000; + let mut s: [u16; 2] = [0; 2]; + s[0] = ((c >> 10) as u16) | 0xD800; + s[1] = ((c & 0x3FF) as u16) | 0xDC00; + encoded.extend(enc_surrogate(s[0]).iter().cloned()); + encoded.extend(enc_surrogate(s[1]).iter().cloned()); + } + i += w; + } + } + encoded +} + +/// Check whether a Rust string contains valid CESU-8 data. +pub fn is_valid_cesu8(text: &str) -> bool { + // We rely on the fact that Rust strings are guaranteed to be valid + // UTF-8. + for b in text.bytes() { + if (b & !CONT_MASK) == TAG_CONT_U8 { continue; } + if utf8_char_width(b) > 3 { return false; } + } + true +} + +/// Check whether a Rust string contains valid Java's modified UTF-8 data. +pub fn is_valid_java_cesu8(text: &str) -> bool { + !text.contains('\0') && is_valid_cesu8(text) +} + +#[test] +fn test_valid_cesu8() { + assert!(is_valid_cesu8("aé日")); + assert!(is_valid_java_cesu8("aé日")); + assert!(!is_valid_cesu8("\u{10401}")); + assert!(!is_valid_java_cesu8("\u{10401}")); + assert!(is_valid_cesu8("\0\0")); + assert!(!is_valid_java_cesu8("\0\0")); +} + + +/// Encode a single surrogate as CESU-8. +fn enc_surrogate(surrogate: u16) -> [u8; 3] { + assert!(0xD800 <= surrogate && surrogate <= 0xDFFF); + // 1110xxxx 10xxxxxx 10xxxxxx + [0b11100000 | ((surrogate & 0b11110000_00000000) >> 12) as u8, + TAG_CONT_U8 | ((surrogate & 0b00001111_11000000) >> 6) as u8, + TAG_CONT_U8 | ((surrogate & 0b00000000_00111111) ) as u8] +} diff --git a/src/unicode.rs b/src/unicode.rs new file mode 100644 index 0000000..a236d31 --- /dev/null +++ b/src/unicode.rs @@ -0,0 +1,37 @@ +// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT +// file at http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +// (This used to be in the Rust unicode crate, which is now gone, so we'll +// just include it inline.) + +// https://tools.ietf.org/html/rfc3629 +static UTF8_CHAR_WIDTH: [u8; 256] = [ +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF +0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, +2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF +3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF +4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF +]; + +/// Given a first byte, determine how many bytes are in this UTF-8 character +#[inline] +pub fn utf8_char_width(b: u8) -> usize { + return UTF8_CHAR_WIDTH[b as usize] as usize; +} |