aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorIvan Lozano <ivanlozano@google.com>2021-08-20 23:37:39 +0000
committerAutomerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>2021-08-20 23:37:39 +0000
commit5e39cdae8df55b50c2da9d77a1a6da3bbbc7e9a7 (patch)
tree90d87dcb0faf6c6e2805cab833d9f173ac4ed786
parentd22939409e8930f2426b1b6fe55634859e66104b (diff)
parent9a771a31e3b075cae3bae31d5fc4ba0b6e114667 (diff)
downloadcesu8-5e39cdae8df55b50c2da9d77a1a6da3bbbc7e9a7.tar.gz
Initial commit for cesu8 crate. am: a7e4bc004f am: e0d2931950 am: 82f44c6e92 am: 9a771a31e3
Original change: https://android-review.googlesource.com/c/platform/external/rust/crates/cesu8/+/1804098 Change-Id: I44bbede8e60e4e5c6f2b525c3f5013c8b77f5a9b
-rw-r--r--COPYRIGHT-RUST.txt422
-rw-r--r--Cargo.toml17
l---------LICENSE1
-rw-r--r--METADATA19
-rw-r--r--MODULE_LICENSE_APACHE20
-rw-r--r--OWNERS1
-rw-r--r--README.md33
-rw-r--r--src/lib.rs453
-rw-r--r--src/unicode.rs37
9 files changed, 983 insertions, 0 deletions
diff --git a/COPYRIGHT-RUST.txt b/COPYRIGHT-RUST.txt
new file mode 100644
index 0000000..52039ea
--- /dev/null
+++ b/COPYRIGHT-RUST.txt
@@ -0,0 +1,422 @@
+Short version for non-lawyers:
+
+The Rust Project is dual-licensed under Apache 2.0 and MIT
+terms.
+
+
+Longer version:
+
+The Rust Project is copyright 2014, The Rust Project
+Developers (given in the file AUTHORS.txt).
+
+Licensed under the Apache License, Version 2.0
+<LICENSE-APACHE or
+http://www.apache.org/licenses/LICENSE-2.0> or the MIT
+license <LICENSE-MIT or http://opensource.org/licenses/MIT>,
+at your option. All files in the project carrying such
+notice may not be copied, modified, or distributed except
+according to those terms.
+
+
+The Rust Project includes packages written by third parties.
+The following third party packages are included, and carry
+their own copyright notices and license terms:
+
+* Two header files that are part of the Valgrind
+ package. These files are found at src/rt/vg/valgrind.h and
+ src/rt/vg/memcheck.h, within this distribution. These files
+ are redistributed under the following terms, as noted in
+ them:
+
+ for src/rt/vg/valgrind.h:
+
+ This file is part of Valgrind, a dynamic binary
+ instrumentation framework.
+
+ Copyright (C) 2000-2010 Julian Seward. All rights
+ reserved.
+
+ Redistribution and use in source and binary forms, with
+ or without modification, are permitted provided that the
+ following conditions are met:
+
+ 1. Redistributions of source code must retain the above
+ copyright notice, this list of conditions and the
+ following disclaimer.
+
+ 2. The origin of this software must not be
+ misrepresented; you must not claim that you wrote the
+ original software. If you use this software in a
+ product, an acknowledgment in the product
+ documentation would be appreciated but is not
+ required.
+
+ 3. Altered source versions must be plainly marked as
+ such, and must not be misrepresented as being the
+ original software.
+
+ 4. The name of the author may not be used to endorse or
+ promote products derived from this software without
+ specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
+ NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
+ OF SUCH DAMAGE.
+
+ for src/rt/vg/memcheck.h:
+
+ This file is part of MemCheck, a heavyweight Valgrind
+ tool for detecting memory errors.
+
+ Copyright (C) 2000-2010 Julian Seward. All rights
+ reserved.
+
+ Redistribution and use in source and binary forms, with
+ or without modification, are permitted provided that the
+ following conditions are met:
+
+ 1. Redistributions of source code must retain the above
+ copyright notice, this list of conditions and the
+ following disclaimer.
+
+ 2. The origin of this software must not be
+ misrepresented; you must not claim that you wrote the
+ original software. If you use this software in a
+ product, an acknowledgment in the product
+ documentation would be appreciated but is not
+ required.
+
+ 3. Altered source versions must be plainly marked as
+ such, and must not be misrepresented as being the
+ original software.
+
+ 4. The name of the author may not be used to endorse or
+ promote products derived from this software without
+ specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
+ NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
+ OF SUCH DAMAGE.
+
+* The auxiliary file src/etc/pkg/modpath.iss contains a
+ library routine compiled, by Inno Setup, into the Windows
+ installer binary. This file is licensed under the LGPL,
+ version 3, but, in our legal interpretation, this does not
+ affect the aggregate "collected work" license of the Rust
+ distribution (MIT/ASL2) nor any other components of it. We
+ believe that the terms governing distribution of the
+ binary Windows installer built from modpath.iss are
+ therefore LGPL, but not the terms governing distribution
+ of any of the files installed by such an installer (such
+ as the Rust compiler or runtime libraries themselves).
+
+* The src/rt/miniz.c file, carrying an implementation of
+ RFC1950/RFC1951 DEFLATE, by Rich Geldreich
+ <richgel99@gmail.com>. All uses of this file are
+ permitted by the embedded "unlicense" notice
+ (effectively: public domain with warranty disclaimer).
+
+* LLVM. Code for this package is found in src/llvm.
+
+ Copyright (c) 2003-2013 University of Illinois at
+ Urbana-Champaign. All rights reserved.
+
+ Developed by:
+
+ LLVM Team
+
+ University of Illinois at Urbana-Champaign
+
+ http://llvm.org
+
+ Permission is hereby granted, free of charge, to any
+ person obtaining a copy of this software and associated
+ documentation files (the "Software"), to deal with the
+ Software without restriction, including without
+ limitation the rights to use, copy, modify, merge,
+ publish, distribute, sublicense, and/or sell copies of
+ the Software, and to permit persons to whom the Software
+ is furnished to do so, subject to the following
+ conditions:
+
+ * Redistributions of source code must retain the
+ above copyright notice, this list of conditions
+ and the following disclaimers.
+
+ * Redistributions in binary form must reproduce the
+ above copyright notice, this list of conditions
+ and the following disclaimers in the documentation
+ and/or other materials provided with the
+ distribution.
+
+ * Neither the names of the LLVM Team, University of
+ Illinois at Urbana-Champaign, nor the names of its
+ contributors may be used to endorse or promote
+ products derived from this Software without
+ specific prior written permission.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
+ ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+ TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+ PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
+ SHALL THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE
+ FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
+ OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ OTHER DEALINGS WITH THE SOFTWARE.
+
+* Additional libraries included in LLVM carry separate
+ BSD-compatible licenses. See src/llvm/LICENSE.txt for
+ details.
+
+* compiler-rt, in src/compiler-rt is dual licensed under
+ LLVM's license and MIT:
+
+ Copyright (c) 2009-2014 by the contributors listed in
+ CREDITS.TXT
+
+ All rights reserved.
+
+ Developed by:
+
+ LLVM Team
+
+ University of Illinois at Urbana-Champaign
+
+ http://llvm.org
+
+ Permission is hereby granted, free of charge, to any
+ person obtaining a copy of this software and associated
+ documentation files (the "Software"), to deal with the
+ Software without restriction, including without
+ limitation the rights to use, copy, modify, merge,
+ publish, distribute, sublicense, and/or sell copies of
+ the Software, and to permit persons to whom the Software
+ is furnished to do so, subject to the following
+ conditions:
+
+ * Redistributions of source code must retain the
+ above copyright notice, this list of conditions
+ and the following disclaimers.
+
+ * Redistributions in binary form must reproduce the
+ above copyright notice, this list of conditions
+ and the following disclaimers in the documentation
+ and/or other materials provided with the
+ distribution.
+
+ * Neither the names of the LLVM Team, University of
+ Illinois at Urbana-Champaign, nor the names of its
+ contributors may be used to endorse or promote
+ products derived from this Software without
+ specific prior written permission.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
+ ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+ TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+ PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
+ SHALL THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE
+ FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
+ OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ OTHER DEALINGS WITH THE SOFTWARE.
+
+ ========================================================
+
+ Copyright (c) 2009-2014 by the contributors listed in
+ CREDITS.TXT
+
+ Permission is hereby granted, free of charge, to any
+ person obtaining a copy of this software and associated
+ documentation files (the "Software"), to deal in the
+ Software without restriction, including without
+ limitation the rights to use, copy, modify, merge,
+ publish, distribute, sublicense, and/or sell copies of
+ the Software, and to permit persons to whom the Software
+ is furnished to do so, subject to the following
+ conditions:
+
+ The above copyright notice and this permission notice
+ shall be included in all copies or substantial portions
+ of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
+ ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+ TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+ PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
+ SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+ IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ DEALINGS IN THE SOFTWARE.
+
+* Portions of the FFI code for interacting with the native ABI
+ is derived from the Clay programming language, which carries
+ the following license.
+
+ Copyright (C) 2008-2010 Tachyon Technologies.
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with
+ or without modification, are permitted provided that the
+ following conditions are met:
+
+ 1. Redistributions of source code must retain the above
+ copyright notice, this list of conditions and the
+ following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the
+ above copyright notice, this list of conditions and
+ the following disclaimer in the documentation and/or
+ other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR
+ IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ DEVELOPERS AND CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
+ OF SUCH DAMAGE.
+
+* Hoedown, the markdown parser, under src/rt/hoedown, is
+ licensed as follows.
+
+ Copyright (c) 2008, Natacha Porté
+ Copyright (c) 2011, Vicent Martí
+ Copyright (c) 2013, Devin Torres and the Hoedown authors
+
+ Permission to use, copy, modify, and distribute this
+ software for any purpose with or without fee is hereby
+ granted, provided that the above copyright notice and
+ this permission notice appear in all copies.
+
+ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR
+ DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE
+ INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR
+ ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA
+ OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+ OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+* libbacktrace, under src/libbacktrace:
+
+ Copyright (C) 2012-2014 Free Software Foundation, Inc.
+ Written by Ian Lance Taylor, Google.
+
+ Redistribution and use in source and binary forms, with
+ or without modification, are permitted provided that the
+ following conditions are met:
+
+ (1) Redistributions of source code must retain the
+ above copyright notice, this list of conditions and
+ the following disclaimer.
+
+ (2) Redistributions in binary form must reproduce
+ the above copyright notice, this list of conditions
+ and the following disclaimer in the documentation
+ and/or other materials provided with the
+ distribution.
+
+ (3) The name of the author may not be used to
+ endorse or promote products derived from this
+ software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
+ NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
+ OF SUCH DAMAGE. */
+
+* jemalloc, under src/jemalloc:
+
+ Copyright (C) 2002-2014 Jason Evans
+ <jasone@canonware.com>. All rights reserved.
+ Copyright (C) 2007-2012 Mozilla Foundation.
+ All rights reserved.
+ Copyright (C) 2009-2014 Facebook, Inc.
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ 1. Redistributions of source code must retain the above copyright notice(s),
+ this list of conditions and the following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright notice(s),
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S)
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S)
+ BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
+ OF SUCH DAMAGE.
+
+* Additional copyright may be retained by contributors other
+ than Mozilla, the Rust Project Developers, or the parties
+ enumerated in this file. Such copyright can be determined
+ on a case-by-case basis by examining the author of each
+ portion of a file in the revision-control commit records
+ of the project, or by consulting representative comments
+ claiming copyright ownership for a file.
+
+ For example, the text:
+
+ "Copyright (c) 2011 Google Inc."
+
+ appears in some files, and these files thereby denote
+ that their author and copyright-holder is Google Inc.
+
+ In all such cases, the absence of explicit licensing text
+ indicates that the contributor chose to license their work
+ for distribution under identical terms to those Mozilla
+ has chosen for the collective work, enumerated at the top
+ of this file. The only difference is the retention of
+ copyright itself, held by the contributor.
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..0d8bd9a
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,17 @@
+[package]
+
+name = "cesu8"
+version = "1.1.0"
+authors = ["Eric Kidd <git@randomhacks.net>"]
+
+description = "Convert to and from CESU-8 encoding (similar to UTF-8)"
+license = "Apache-2.0/MIT"
+readme = "README.md"
+
+repository = "https://github.com/emk/cesu8-rs"
+documentation = "http://emk.github.io/cesu8-rs/cesu8/index.html"
+
+[features]
+# Allow access to unstable features when being built with a nightly compiler,
+# to keep travis-cargo happy and enable access to benchmarks if we want them.
+unstable = []
diff --git a/LICENSE b/LICENSE
new file mode 120000
index 0000000..004484b
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1 @@
+COPYRIGHT-RUST.txt \ No newline at end of file
diff --git a/METADATA b/METADATA
new file mode 100644
index 0000000..ae7ceba
--- /dev/null
+++ b/METADATA
@@ -0,0 +1,19 @@
+name: "cesu8"
+description: "Convert to and from CESU-8 encoding (similar to UTF-8)"
+third_party {
+ url {
+ type: HOMEPAGE
+ value: "https://crates.io/crates/cesu8"
+ }
+ url {
+ type: ARCHIVE
+ value: "https://static.crates.io/crates/cesu8/cesu8-1.1.0.crate"
+ }
+ version: "1.1.0"
+ license_type: NOTICE
+ last_upgrade_date {
+ year: 2021
+ month: 7
+ day: 30
+ }
+}
diff --git a/MODULE_LICENSE_APACHE2 b/MODULE_LICENSE_APACHE2
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/MODULE_LICENSE_APACHE2
diff --git a/OWNERS b/OWNERS
new file mode 100644
index 0000000..45dc4dd
--- /dev/null
+++ b/OWNERS
@@ -0,0 +1 @@
+include platform/prebuilts/rust:master:/OWNERS
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..161f865
--- /dev/null
+++ b/README.md
@@ -0,0 +1,33 @@
+# CESU-8 encoder/decoder for Rust
+
+[![Build Status](https://travis-ci.org/emk/cesu8-rs.svg)](https://travis-ci.org/emk/cesu8-rs) [![Latest version](https://img.shields.io/crates/v/cesu8.svg)](https://crates.io/crates/cesu8) [![License](https://img.shields.io/crates/l/cesu8.svg)](https://crates.io/crates/cesu8)
+
+[Documentation][apidoc].
+
+[apidoc]: http://emk.github.io/cesu8-rs/cesu8/index.html
+
+Convert between ordinary UTF-8 and [CESU-8][] encodings.
+
+CESU-8 encodes characters outside the Basic Multilingual Plane as two
+UTF-16 surrogate characters, which are then further re-encoded as invalid,
+3-byte UTF-8 characters. This means that 4-byte UTF-8 sequences become
+6-byte CESU-8 sequences.
+
+**Note that CESU-8 is only intended for internal use within tightly-coupled
+systems, and not for data interchange.**
+
+This encoding is sometimes needed when working with Java, Oracle or MySQL,
+and when trying to store emoji, hieroglyphs, or other characters on the
+Supplementary Multilingual Plane or the Supplementary Ideographic Plane.
+
+[CESU-8]: http://www.unicode.org/reports/tr26/tr26-2.html
+
+## License
+
+Some of this code is adapted from Rust's [`src/libcore/str.rs` file][str.rs].
+This code is covered by COPYRIGHT-RUST.txt and copyrighted by The Rust Project
+Developers and individual Rust contributors, as described in that file.
+
+The new code in this project is distributed under the same terms.
+
+[str.rs]: https://github.com/rust-lang/rust/blob/master/src/libcore/str.rs
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..7fac5ff
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,453 @@
+// Copyright 2012-2014 The Rust Project Developers and Eric Kidd. See the
+// COPYRIGHT-RUST.txt file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed except
+// according to those terms.
+
+
+//! A simple library implementing the [CESU-8 compatibility encoding
+//! scheme](http://www.unicode.org/reports/tr26/tr26-2.html). This is a
+//! non-standard variant of UTF-8 that is used internally by some systems
+//! that need to represent UTF-16 data as 8-bit characters. Yes, this is
+//! ugly.
+//!
+//! Use of this encoding is discouraged by the Unicode Consortium. It's OK
+//! for working with existing internal APIs, but it should not be used for
+//! transmitting or storing data.
+//!
+//! ```
+//! use std::borrow::Cow;
+//! use cesu8::{from_cesu8, to_cesu8};
+//!
+//! // 16-bit Unicode characters are the same in UTF-8 and CESU-8.
+//! assert_eq!(Cow::Borrowed("aé日".as_bytes()),
+//! to_cesu8("aé日"));
+//! assert_eq!(Cow::Borrowed("aé日"),
+//! from_cesu8("aé日".as_bytes()).unwrap());
+//!
+//! // This string is CESU-8 data containing a 6-byte surrogate pair,
+//! // which decodes to a 4-byte UTF-8 string.
+//! let data = &[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81];
+//! assert_eq!(Cow::Borrowed("\u{10401}"),
+//! from_cesu8(data).unwrap());
+//! ```
+//!
+//! ### A note about security
+//!
+//! As a general rule, this library is intended to fail on malformed or
+//! unexpected input. CESU-8 is supposed to be an internal-only format,
+//! and if we're seeing malformed data, we assume that it's either a bug in
+//! somebody's code, or an attacker is trying to improperly encode data to
+//! evade security checks.
+//!
+//! If you have a use case for lossy conversion to UTF-8, or conversion
+//! from mixed UTF-8/CESU-8 data, please feel free to submit a pull request
+//! for `from_cesu8_lossy_permissive` with appropriate behavior.
+//!
+//! ### Java and U+0000, and other variants
+//!
+//! Java uses the CESU-8 encoding as described above, but with one
+//! difference: The null character U+0000 is represented as an overlong
+//! UTF-8 sequence `C0 80`. This is supported by the `from_java_cesu8` and
+//! `to_java_cesu8` functions.
+//!
+//! ### Surrogate pairs and UTF-8
+//!
+//! The UTF-16 encoding uses "surrogate pairs" to represent Unicode code
+//! points in the range from U+10000 to U+10FFFF. Each half of a pair is a
+//! 16-bit number in the range 0xD800 to 0xDFFF.
+//!
+//! * 0xD800 to 0xDBFF: First half of surrogate pair. When encoded as
+//! CESU-8, these become **1110**1101 **10**100000 **10**000000 to
+//! **1110**1101 **10**101111 **10**111111.
+//!
+//! * 0xDC00 to 0xDFFF: Second half of surrogate pair. These become
+//! **1110**1101 **10**110000 **10**000000 to
+//! **1110**1101 **10**111111 **10**111111.
+//!
+//! Wikipedia [explains](http://en.wikipedia.org/wiki/UTF-16) the
+//! code point to UTF-16 conversion process:
+//!
+//! > Consider the encoding of U+10437 (𐐷):
+//! >
+//! > * Subtract 0x10000 from 0x10437. The result is 0x00437, 0000 0000 0100
+//! > 0011 0111.
+//! > * Split this into the high 10-bit value and the low 10-bit value:
+//! > 0000000001 and 0000110111.
+//! > * Add 0xD800 to the high value to form the high surrogate: 0xD800 +
+//! > 0x0001 = 0xD801.
+//! > * Add 0xDC00 to the low value to form the low surrogate: 0xDC00 +
+//! > 0x0037 = 0xDC37.
+
+#![warn(missing_docs)]
+
+
+use std::borrow::Cow;
+use std::error::Error;
+use std::fmt;
+use std::result::Result;
+use std::slice;
+use std::str::{from_utf8, from_utf8_unchecked};
+use unicode::utf8_char_width;
+
+mod unicode;
+
+/// Bit mask selecting the six payload bits of a UTF-8 continuation byte.
+const CONT_MASK: u8 = 0b0011_1111;
+/// Tag found in the top two bits (the bits outside `CONT_MASK`) of every
+/// UTF-8 continuation byte.
+const TAG_CONT_U8: u8 = 0b1000_0000;
+
+/// The CESU-8 data could not be decoded as valid UTF-8 data.
+#[derive(Clone, Copy, Debug)]
+pub struct Cesu8DecodingError;
+
+impl Error for Cesu8DecodingError {
+    /// Short, fixed description of the failure.
+    fn description(&self) -> &str { "decoding error" }
+    /// A decoding failure has no underlying cause, so this is always `None`.
+    // `dyn` is spelled out: the bare `&Error` trait-object form is
+    // deprecated and rejected by the 2021 edition.
+    fn cause(&self) -> Option<&dyn Error> { None }
+}
+
+impl fmt::Display for Cesu8DecodingError {
+    /// Writes the fixed, human-readable message for this error.
+    fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
+        formatter.write_str("could not convert CESU-8 data to UTF-8")
+    }
+}
+
+/// Selects which flavour of the encoding a conversion works with.
+#[derive(Eq, PartialEq)]
+enum Variant {
+    /// Plain CESU-8, in which '\0' is stored as itself.
+    Standard,
+    /// Java's "Modified UTF-8": like CESU-8, except that '\0' is never
+    /// stored directly and is written as the overlong sequence `C0 80`.
+    Java,
+}
+
+/// Decode CESU-8 bytes into a Rust string.
+///
+/// Input that is already valid UTF-8 is borrowed unchanged; anything else
+/// is re-encoded into a newly allocated `String`.
+///
+/// # Errors
+///
+/// Returns `Cesu8DecodingError` if the data cannot be represented as valid
+/// UTF-8.
+///
+/// # Examples
+///
+/// ```
+/// use std::borrow::Cow;
+/// use cesu8::from_cesu8;
+///
+/// // Valid as both UTF-8 and CESU-8, so the input comes back as-is and
+/// // no memory is allocated.
+/// assert_eq!(Cow::Borrowed("aé日"),
+///            from_cesu8("aé日".as_bytes()).unwrap());
+///
+/// // A 6-byte CESU-8 surrogate pair decodes to a single 4-byte UTF-8
+/// // character.
+/// let data = &[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81];
+/// assert_eq!(Cow::Borrowed("\u{10401}"),
+///            from_cesu8(data).unwrap());
+/// ```
+pub fn from_cesu8(bytes: &[u8]) -> Result<Cow<str>, Cesu8DecodingError> {
+    let variant = Variant::Standard;
+    from_cesu8_internal(bytes, variant)
+}
+
+/// Decode Java's modified UTF-8 bytes into a Rust string.
+///
+/// Input that is already valid UTF-8 is borrowed unchanged; anything else
+/// is re-encoded into a newly allocated `String`, with the two-byte
+/// sequence `C0 80` decoded as U+0000.
+///
+/// # Errors
+///
+/// Returns `Cesu8DecodingError` if the data cannot be represented as valid
+/// UTF-8.
+///
+/// # Examples
+///
+/// ```
+/// use std::borrow::Cow;
+/// use cesu8::from_java_cesu8;
+///
+/// // Valid as both UTF-8 and modified UTF-8, so the input comes back
+/// // as-is and no memory is allocated.
+/// assert_eq!(Cow::Borrowed("aé日"),
+///            from_java_cesu8("aé日".as_bytes()).unwrap());
+///
+/// // A 6-byte modified UTF-8 surrogate pair decodes to a single 4-byte
+/// // UTF-8 character.
+/// let data = &[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81];
+/// assert_eq!(Cow::Borrowed("\u{10401}"),
+///            from_java_cesu8(data).unwrap());
+///
+/// // Modified UTF-8 spells each null code point as `C0 80`.
+/// let data = &[0xC0, 0x80, 0xC0, 0x80];
+/// assert_eq!(Cow::Borrowed("\0\0"),
+///            from_java_cesu8(data).unwrap());
+/// ```
+pub fn from_java_cesu8(bytes: &[u8]) -> Result<Cow<str>, Cesu8DecodingError> {
+    let variant = Variant::Java;
+    from_cesu8_internal(bytes, variant)
+}
+
+/// Shared implementation behind `from_cesu8` and `from_java_cesu8`.
+///
+/// Bytes that already form valid UTF-8 are borrowed without copying.
+/// Otherwise the input is run through `decode_from_iter`, and the result
+/// is returned as an owned `String`.
+///
+/// Returns `Err(Cesu8DecodingError)` when the decoder rejects the input.
+fn from_cesu8_internal(bytes: &[u8], variant: Variant) ->
+    Result<Cow<str>, Cesu8DecodingError>
+{
+    // Fast path. Note that this accepts any valid UTF-8, including 4-byte
+    // sequences and (for the Java variant) a literal NUL byte, both of
+    // which the strict decoder below would reject.
+    if let Ok(text) = from_utf8(bytes) {
+        return Ok(Cow::Borrowed(text));
+    }
+
+    let mut decoded = Vec::with_capacity(bytes.len());
+    if !decode_from_iter(&mut decoded, &mut bytes.iter(), variant) {
+        return Err(Cesu8DecodingError);
+    }
+
+    // Rust assumes every `String` holds valid UTF-8, so validate the
+    // decoder's output in all build profiles instead of relying on a
+    // debug-only assertion followed by an unchecked conversion. A decoder
+    // bug then surfaces as an ordinary error rather than undefined
+    // behaviour.
+    String::from_utf8(decoded)
+        .map(Cow::Owned)
+        .map_err(|_| Cesu8DecodingError)
+}
+
+#[test]
+fn test_from_cesu8() {
+    // The surrogate-encoded character in this input is taken from the ICU
+    // library's icu/source/test/testdata/conversion.txt test case.
+    let cesu8_bytes: [u8; 11] =
+        [0x4D, 0xE6, 0x97, 0xA5, 0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81, 0x7F];
+    let decoded = from_cesu8(&cesu8_bytes).unwrap();
+    assert_eq!(Cow::Borrowed("M日\u{10401}\u{7F}"), decoded);
+
+    // An earlier version used test data from the CESU-8 specification, but
+    // working that example through by hand gives a different answer:
+    //
+    // Input: [0xED, 0xAE, 0x80, 0xED, 0xB0, 0x80]
+    // Binary: 11101101 10101110 10000000 11101101 10110000 10000000
+    //
+    // 0b1101_101110_000000 -> 0xDB80
+    // 0b1101_110000_000000 -> 0xDC00
+    //
+    // ((0xDB80 - 0xD800) << 10) | (0xDC00 - 0xDC00) -> 0xE0000
+    // 0x10000 + 0xE0000 -> 0xF0000
+    //
+    // The spec claims the result should be 0x10000, not 0xF0000. Because
+    // that example cannot be reconciled with the text of the
+    // specification, a test character from ICU is used instead.
+}
+
+/// Our internal decoder, based on Rust's is_utf8 implementation.
+///
+/// Reads bytes from `iter` and appends their UTF-8 re-encoding to
+/// `decoded`. One-, two- and three-byte sequences are copied through;
+/// a six-byte surrogate pair is rewritten as a single four-byte UTF-8
+/// sequence. With `Variant::Java`, a raw `00` byte is rejected and
+/// `C0 80` decodes to a zero byte.
+///
+/// Returns `true` if the input ended on a character boundary, and `false`
+/// as soon as a malformed or truncated sequence is seen. On `false`,
+/// `decoded` may hold a partial result and should be discarded.
+fn decode_from_iter(
+    decoded: &mut Vec<u8>, iter: &mut slice::Iter<u8>, variant: Variant)
+    -> bool
+{
+    // Abort the whole function, reporting failure.
+    macro_rules! err {
+        () => { return false }
+    }
+    // Fetch the next byte; running out of input mid-sequence is an error.
+    macro_rules! next {
+        () => {
+            match iter.next() {
+                Some(a) => *a,
+                // We needed data, but there was none: error!
+                None => err!()
+            }
+        }
+    }
+    // Fetch the next byte and require it to be a continuation byte
+    // (`10xxxxxx`).
+    macro_rules! next_cont {
+        () => {
+            {
+                let byte = next!();
+                if (byte & !CONT_MASK) == TAG_CONT_U8 { byte } else { err!() }
+            }
+        }
+    }
+
+    loop {
+        let first = match iter.next() {
+            Some(&b) => b,
+            // We're at the end of the iterator and a codepoint boundary at
+            // the same time, so this string is valid.
+            None => return true
+        };
+
+        if variant == Variant::Java && first == 0 {
+            // Java's modified UTF-8 should never contain \0 directly.
+            err!();
+        } else if first < 128 {
+            // Pass ASCII through directly.
+            decoded.push(first);
+        } else if first == 0xc0 && variant == Variant::Java {
+            // The only sequence starting with C0 that is accepted is Java's
+            // two-byte encoding of U+0000.
+            match next!() {
+                0x80 => decoded.push(0),
+                _ => err!(),
+            }
+        } else {
+            let w = utf8_char_width(first);
+            let second = next_cont!();
+            match w {
+                // Two-byte sequences can be used directly.
+                2 => decoded.extend_from_slice(&[first, second]),
+                3 => {
+                    let third = next_cont!();
+                    match (first, second) {
+                        // These are valid UTF-8, so pass them through.
+                        (0xE0, 0xA0..=0xBF) |
+                        (0xE1..=0xEC, 0x80..=0xBF) |
+                        (0xED, 0x80..=0x9F) |
+                        (0xEE..=0xEF, 0x80..=0xBF) => {
+                            decoded.extend_from_slice(&[first, second, third]);
+                        }
+                        // First half of a surrogate pair, so decode.
+                        (0xED, 0xA0..=0xAF) => {
+                            // The second half must follow immediately: ED,
+                            // then a continuation byte in B0..=BF, then one
+                            // more continuation byte.
+                            if next!() != 0xED { err!() }
+                            let fifth = next_cont!();
+                            if fifth < 0xB0 || 0xBF < fifth { err!() }
+                            let sixth = next_cont!();
+                            let s = dec_surrogates(second, third, fifth, sixth);
+                            decoded.extend_from_slice(&s);
+                        }
+                        // Anything else (for example a lone second-half
+                        // surrogate) is malformed.
+                        _ => err!()
+                    }
+                }
+                // Any other width, including four-byte UTF-8, is rejected.
+                _ => err!()
+            }
+        }
+    }
+}
+
+/// Rebuild a regular surrogate value from the two trailing bytes of a
+/// three-byte CESU-8 surrogate.
+///
+/// The six payload bits of `second` and of `third` are placed below the
+/// fixed `0xD000` prefix.
+fn dec_surrogate(second: u8, third: u8) -> u32 {
+    let upper_bits = u32::from(second & CONT_MASK) << 6;
+    let lower_bits = u32::from(third & CONT_MASK);
+    0xD000u32 | upper_bits | lower_bits
+}
+
+/// Turn the trailing bytes of a CESU-8 surrogate pair into the four-byte
+/// UTF-8 encoding of the code point the pair represents.
+///
+/// `second` and `third` come from the first surrogate, `fifth` and `sixth`
+/// from the second one. The input is assumed to be valid already.
+fn dec_surrogates(second: u8, third: u8, fifth: u8, sixth: u8) -> [u8; 4] {
+    // Recombine the two surrogates into one 32-bit code point.
+    let first_half = dec_surrogate(second, third);
+    let second_half = dec_surrogate(fifth, sixth);
+    let c = 0x10000 + (((first_half - 0xD800) << 10) | (second_half - 0xDC00));
+    assert!(0x010000 <= c && c <= 0x10FFFF);
+
+    // Lay the bits out as UTF-8:
+    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+    let lead = 0b1111_0000u8 | ((c >> 18) & 0b111) as u8;
+    let continuation =
+        |shift: u32| TAG_CONT_U8 | ((c >> shift) & u32::from(CONT_MASK)) as u8;
+    [lead, continuation(12), continuation(6), continuation(0)]
+}
+
+/// Encode a Rust `&str` as CESU-8 bytes, borrowing the input whenever it is
+/// already valid CESU-8 and allocating only when it has to be re-encoded.
+///
+/// ```
+/// use std::borrow::Cow;
+/// use cesu8::to_cesu8;
+///
+/// // This string is valid as UTF-8 or CESU-8, so it doesn't change,
+/// // and we can convert it without allocating memory.
+/// assert_eq!(Cow::Borrowed("aé日".as_bytes()), to_cesu8("aé日"));
+///
+/// // This string is a 4-byte UTF-8 string, which becomes a 6-byte CESU-8
+/// // vector.
+/// assert_eq!(Cow::Borrowed(&[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81]),
+///            to_cesu8("\u{10401}"));
+/// ```
+pub fn to_cesu8(text: &str) -> Cow<[u8]> {
+    if !is_valid_cesu8(text) {
+        // At least one character needs the six-byte surrogate-pair form.
+        return Cow::Owned(to_cesu8_internal(text, Variant::Standard));
+    }
+    Cow::Borrowed(text.as_bytes())
+}
+
+/// Encode a Rust `&str` as Java's modified UTF-8 bytes, borrowing the input
+/// whenever it is already valid in that encoding and allocating only when
+/// it has to be re-encoded.
+///
+/// ```
+/// use std::borrow::Cow;
+/// use cesu8::to_java_cesu8;
+///
+/// // This string is valid as UTF-8 or CESU-8, so it doesn't change,
+/// // and we can convert it without allocating memory.
+/// assert_eq!(Cow::Borrowed("aé日".as_bytes()), to_java_cesu8("aé日"));
+///
+/// // This string is a 4-byte UTF-8 string, which becomes a 6-byte modified
+/// // UTF-8 vector.
+/// assert_eq!(Cow::Borrowed(&[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81]),
+///            to_java_cesu8("\u{10401}"));
+///
+/// // This string contains null, which becomes 2-byte modified UTF-8 encoding
+/// assert_eq!(Cow::Borrowed(&[0xC0, 0x80, 0xC0, 0x80]),
+///            to_java_cesu8("\0\0"));
+/// ```
+pub fn to_java_cesu8(text: &str) -> Cow<[u8]> {
+    if !is_valid_java_cesu8(text) {
+        // The text contains a NUL or a character that needs the
+        // surrogate-pair form, so it must be re-encoded.
+        return Cow::Owned(to_cesu8_internal(text, Variant::Java));
+    }
+    Cow::Borrowed(text.as_bytes())
+}
+
+/// Re-encode `text` as CESU-8, or as Java's modified UTF-8 when `variant`
+/// is `Variant::Java`.
+///
+/// Sequences of one to three bytes are copied through unchanged. Each
+/// four-byte UTF-8 sequence is replaced by a pair of three-byte encoded
+/// surrogates, and in the Java variant every NUL byte becomes `0xC0 0x80`.
+fn to_cesu8_internal(text: &str, variant: Variant) -> Vec<u8> {
+    let bytes = text.as_bytes();
+    // Reserve the input length plus 25% headroom for sequences that grow.
+    // The parentheses are required: `+` binds tighter than `>>`, so the
+    // unparenthesized form reserves only half of the input length.
+    let mut encoded = Vec::with_capacity(bytes.len() + (bytes.len() >> 2));
+    let mut i = 0;
+    while i < bytes.len() {
+        let b = bytes[i];
+        if variant == Variant::Java && b == 0 {
+            // Modified UTF-8 never stores a raw NUL byte.
+            encoded.extend_from_slice(&[0xc0, 0x80]);
+            i += 1;
+        } else if b < 128 {
+            // Pass ASCII through quickly.
+            encoded.push(b);
+            i += 1;
+        } else {
+            // Figure out how many bytes we need for this character.
+            let w = utf8_char_width(b);
+            assert!(w <= 4);
+            assert!(i + w <= bytes.len());
+            if w != 4 {
+                // Pass through short UTF-8 sequences unmodified.
+                encoded.extend_from_slice(&bytes[i..i + w]);
+            } else {
+                // Encode 4-byte sequences as 6 bytes.
+                // SAFETY: `bytes` comes from a `&str`, `i` always sits on a
+                // character boundary because it only advances by whole
+                // sequence widths, and `w` is the width of the sequence
+                // starting at `i`, so the slice is one complete UTF-8
+                // character.
+                let s = unsafe { from_utf8_unchecked(&bytes[i..i + w]) };
+                let c = s.chars().next().unwrap() as u32 - 0x10000;
+                let high = ((c >> 10) as u16) | 0xD800;
+                let low = ((c & 0x3FF) as u16) | 0xDC00;
+                encoded.extend_from_slice(&enc_surrogate(high));
+                encoded.extend_from_slice(&enc_surrogate(low));
+            }
+            i += w;
+        }
+    }
+    encoded
+}
+
+/// Report whether the bytes of a Rust string are already valid CESU-8.
+///
+/// Rust strings are guaranteed to be valid UTF-8, so the only thing left to
+/// rule out is a lead byte that opens a sequence longer than three bytes.
+pub fn is_valid_cesu8(text: &str) -> bool {
+    text.bytes().all(|b| {
+        // Continuation bytes carry no length information; every other byte
+        // must start a sequence of at most three bytes.
+        (b & !CONT_MASK) == TAG_CONT_U8 || utf8_char_width(b) <= 3
+    })
+}
+
+/// Report whether the bytes of a Rust string are already valid Java modified
+/// UTF-8: valid CESU-8 that additionally contains no NUL.
+pub fn is_valid_java_cesu8(text: &str) -> bool {
+    // In UTF-8 a zero byte can only be the NUL character itself.
+    let has_nul = text.bytes().any(|b| b == 0);
+    !has_nul && is_valid_cesu8(text)
+}
+
+// Exercises `is_valid_cesu8` and `is_valid_java_cesu8` on three inputs:
+// short-sequence text, a four-byte UTF-8 character, and embedded NULs.
+#[test]
+fn test_valid_cesu8() {
+ // Characters of one, two and three UTF-8 bytes pass both checks.
+ assert!(is_valid_cesu8("aé日"));
+ assert!(is_valid_java_cesu8("aé日"));
+ // A four-byte UTF-8 character is rejected by both checks.
+ assert!(!is_valid_cesu8("\u{10401}"));
+ assert!(!is_valid_java_cesu8("\u{10401}"));
+ // NUL is accepted by the standard check but rejected by the Java one.
+ assert!(is_valid_cesu8("\0\0"));
+ assert!(!is_valid_java_cesu8("\0\0"));
+}
+
+
+/// Encode one UTF-16 surrogate code unit as a three-byte CESU-8 sequence.
+///
+/// Panics if `surrogate` is outside the range `0xD800` to `0xDFFF`.
+fn enc_surrogate(surrogate: u16) -> [u8; 3] {
+    assert!(0xD800 <= surrogate && surrogate <= 0xDFFF);
+    // Output bit layout: 1110xxxx 10xxxxxx 10xxxxxx
+    let lead = 0b11100000 | ((surrogate >> 12) & 0b1111) as u8;
+    let middle = TAG_CONT_U8 | ((surrogate >> 6) & 0b11_1111) as u8;
+    let last = TAG_CONT_U8 | (surrogate & 0b11_1111) as u8;
+    [lead, middle, last]
+}
diff --git a/src/unicode.rs b/src/unicode.rs
new file mode 100644
index 0000000..a236d31
--- /dev/null
+++ b/src/unicode.rs
@@ -0,0 +1,37 @@
+// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
+// file at http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+// (This used to be in the Rust unicode crate, which is now gone, so we'll
+// just include it inline.)
+
+// Sequence length per lead byte, 0 = not a lead byte (https://tools.ietf.org/html/rfc3629)
+static UTF8_CHAR_WIDTH: [u8; 256] = [
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x0F
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x2F
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x4F
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x6F
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F: end of the one-byte (ASCII) range
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x8F
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xAF
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF: 0x80-0xBF are continuation bytes per RFC 3629
+0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xCF: 0xC0 and 0xC1 map to 0
+2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
+3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
+4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF: only 0xF0-0xF4 start a four-byte sequence
+];
+
+/// Look up how many bytes the UTF-8 sequence starting with byte `b` occupies.
+#[inline]
+pub fn utf8_char_width(b: u8) -> usize {
+    usize::from(UTF8_CHAR_WIDTH[usize::from(b)])
+}